i965: Fix cross-primitive scratch corruption when changing the per-thread allocation.

I haven't found any mention of this in the hardware docs, but
experimentally what seems to be going on is that when the per-thread
scratch slot size is changed between two pipelined draw calls, shader
invocations using the old and new scratch size setting may end up
being executed in parallel, causing their scratch offset calculations
to be based in a different partitioning of the scratch space, which
can cause their thread-local scratch space to overlap leading to
cross-thread scratch corruption.

I've been experimenting with alternative workarounds, like emitting a
PIPE_CONTROL with DC flush and CS stall between draw (or dispatch
compute) calls using different per-thread scratch allocation settings,
or avoiding reuse of the scratch BO if the per-thread scratch
allocation doesn't exactly match the original.  Both seem to be as
effective as this workaround, but they have potential performance
implications, while this should be basically for free.

Fixes over 40 failures in our CI system with spilling forced on
(including CTS, dEQP and Piglit failures) on a number of different
platforms from Gen4 to Gen9.  The 'glsl-max-varyings' piglit test
seems to be able to reproduce this bug consistently in the vertex
shader on at least Gen4, Gen8 and Gen9 with spilling forced on.

Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Francisco Jerez
2016-06-10 16:41:59 -07:00
parent d960284e44
commit a84b5d43e2
17 changed files with 31 additions and 18 deletions

View File

@@ -674,6 +674,19 @@ struct brw_stage_state
/**
* Optional scratch buffer used to store spilled register values and
* variably-indexed GRF arrays.
*
* The contents of this buffer are short-lived so the same memory can be
* re-used at will for multiple shader programs (executed by the same fixed
* function). However reusing a scratch BO for which shader invocations
* are still in flight with a per-thread scratch slot size other than the
* original can cause threads with different scratch slot size and FFTID
* (which may be executed in parallel depending on the shader stage and
* hardware generation) to map to an overlapping region of the scratch
* space, which can potentially lead to mutual scratch space corruption.
* For that reason if you borrow this scratch buffer you should only be
* using the slot size given by the \c per_thread_scratch member below,
* unless you're taking additional measures to synchronize thread execution
* across slot size changes.
*/
drm_intel_bo *scratch_bo;

View File

@@ -83,7 +83,7 @@ brw_upload_vs_unit(struct brw_context *brw)
vs->thread2.scratch_space_base_pointer =
stage_state->scratch_bo->offset64 >> 10; /* reloc */
vs->thread2.per_thread_scratch_space =
ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
ffs(stage_state->per_thread_scratch) - 11;
} else {
vs->thread2.scratch_space_base_pointer = 0;
vs->thread2.per_thread_scratch_space = 0;

View File

@@ -133,7 +133,7 @@ brw_upload_wm_unit(struct brw_context *brw)
wm->thread2.scratch_space_base_pointer =
brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
wm->thread2.per_thread_scratch_space =
ffs(prog_data->base.total_scratch) - 11;
ffs(brw->wm.base.per_thread_scratch) - 11;
} else {
wm->thread2.scratch_space_base_pointer = 0;
wm->thread2.per_thread_scratch_space = 0;

View File

@@ -140,7 +140,7 @@ upload_gs_state(struct brw_context *brw)
if (prog_data->base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0); /* no scratch space */
}

View File

@@ -126,7 +126,7 @@ upload_vs_state(struct brw_context *brw)
if (brw->vs.prog_data->base.base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -220,7 +220,7 @@ gen6_upload_wm_state(struct brw_context *brw,
if (prog_data->base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw)
*/
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else if (brw->is_haswell) {
/* Haswell's Per Thread Scratch Space is in the range [0, 10]
* where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
*/
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->total_scratch) - 12);
ffs(stage_state->per_thread_scratch) - 12);
} else {
/* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
* where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
*/
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
prog_data->total_scratch / 1024 - 1);
stage_state->per_thread_scratch / 1024 - 1);
}
} else {
OUT_BATCH(0);

View File

@@ -82,7 +82,7 @@ gen7_upload_ds_state(struct brw_context *brw)
if (prog_data->total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -64,7 +64,7 @@ upload_gs_state(struct brw_context *brw)
if (brw->gs.prog_data->base.base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -83,7 +83,7 @@ gen7_upload_hs_state(struct brw_context *brw)
if (prog_data->base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -56,7 +56,7 @@ upload_vs_state(struct brw_context *brw)
if (prog_data->base.total_scratch) {
OUT_RELOC(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -235,7 +235,7 @@ gen7_upload_ps_state(struct brw_context *brw,
if (prog_data->base.total_scratch) {
OUT_RELOC(brw->wm.base.scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
}

View File

@@ -52,7 +52,7 @@ gen8_upload_ds_state(struct brw_context *brw)
if (prog_data->total_scratch) {
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
OUT_BATCH(0);

View File

@@ -57,7 +57,7 @@ gen8_upload_gs_state(struct brw_context *brw)
if (brw->gs.prog_data->base.base.total_scratch) {
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
OUT_BATCH(0);

View File

@@ -52,7 +52,7 @@ gen8_upload_hs_state(struct brw_context *brw)
if (prog_data->base.total_scratch) {
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
OUT_BATCH(0);

View File

@@ -255,7 +255,7 @@ gen8_upload_ps_state(struct brw_context *brw,
if (prog_data->base.total_scratch) {
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
OUT_BATCH(0);

View File

@@ -58,7 +58,7 @@ upload_vs_state(struct brw_context *brw)
if (prog_data->base.total_scratch) {
OUT_RELOC64(stage_state->scratch_bo,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
ffs(prog_data->base.total_scratch) - 11);
ffs(stage_state->per_thread_scratch) - 11);
} else {
OUT_BATCH(0);
OUT_BATCH(0);