i965: Fix cross-primitive scratch corruption when changing the per-thread allocation.

I haven't found any mention of this in the hardware docs, but experimentally what seems to be going on is that when the per-thread scratch slot size is changed between two pipelined draw calls, shader invocations using the old and new scratch size setting may end up being executed in parallel, causing their scratch offset calculations to be based in a different partitioning of the scratch space, which can cause their thread-local scratch space to overlap leading to cross-thread scratch corruption. I've been experimenting with alternative workarounds, like emitting a PIPE_CONTROL with DC flush and CS stall between draw (or dispatch compute) calls using different per-thread scratch allocation settings, or avoiding reuse of the scratch BO if the per-thread scratch allocation doesn't exactly match the original. Both seem to be as effective as this workaround, but they have potential performance implications, while this should be basically for free. Fixes over 40 failures in our CI system with spilling forced on (including CTS, dEQP and Piglit failures) on a number of different platforms from Gen4 to Gen9. The 'glsl-max-varyings' piglit test seems to be able to reproduce this bug consistently in the vertex shader on at least Gen4, Gen8 and Gen9 with spilling forced on. Cc: <mesa-stable@lists.freedesktop.org> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-06-10 16:41:59 -07:00
parent d960284e44
commit a84b5d43e2
17 changed files with 31 additions and 18 deletions
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -674,6 +674,19 @@ struct brw_stage_state
   /**
    * Optional scratch buffer used to store spilled register values and
    * variably-indexed GRF arrays.
+    *
+    * The contents of this buffer are short-lived so the same memory can be
+    * re-used at will for multiple shader programs (executed by the same fixed
+    * function).  However reusing a scratch BO for which shader invocations
+    * are still in flight with a per-thread scratch slot size other than the
+    * original can cause threads with different scratch slot size and FFTID
+    * (which may be executed in parallel depending on the shader stage and
+    * hardware generation) to map to an overlapping region of the scratch
+    * space, which can potentially lead to mutual scratch space corruption.
+    * For that reason if you borrow this scratch buffer you should only be
+    * using the slot size given by the \c per_thread_scratch member below,
+    * unless you're taking additional measures to synchronize thread execution
+    * across slot size changes.
    */
   drm_intel_bo *scratch_bo;

--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -83,7 +83,7 @@ brw_upload_vs_unit(struct brw_context *brw)
      vs->thread2.scratch_space_base_pointer =
 	 stage_state->scratch_bo->offset64 >> 10; /* reloc */
      vs->thread2.per_thread_scratch_space =
-	 ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
+	 ffs(stage_state->per_thread_scratch) - 11;
   } else {
      vs->thread2.scratch_space_base_pointer = 0;
      vs->thread2.per_thread_scratch_space = 0;
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -133,7 +133,7 @@ brw_upload_wm_unit(struct brw_context *brw)
      wm->thread2.scratch_space_base_pointer =
 	 brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
      wm->thread2.per_thread_scratch_space =
-	 ffs(prog_data->base.total_scratch) - 11;
+	 ffs(brw->wm.base.per_thread_scratch) - 11;
   } else {
      wm->thread2.scratch_space_base_pointer = 0;
      wm->thread2.per_thread_scratch_space = 0;
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -140,7 +140,7 @@ upload_gs_state(struct brw_context *brw)
      if (prog_data->base.total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0); /* no scratch space */
      }
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -126,7 +126,7 @@ upload_vs_state(struct brw_context *brw)
   if (brw->vs.prog_data->base.base.total_scratch) {
      OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -220,7 +220,7 @@ gen6_upload_wm_state(struct brw_context *brw,
   if (prog_data->base.total_scratch) {
      OUT_RELOC(stage_state->scratch_bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw)
          */
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
      } else if (brw->is_haswell) {
         /* Haswell's Per Thread Scratch Space is in the range [0, 10]
          * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 12);
+                   ffs(stage_state->per_thread_scratch) - 12);
      } else {
         /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
          * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   prog_data->total_scratch / 1024 - 1);
+                   stage_state->per_thread_scratch / 1024 - 1);
      }
   } else {
      OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i965/gen7_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c
@@ -82,7 +82,7 @@ gen7_upload_ds_state(struct brw_context *brw)
      if (prog_data->total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
      }
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -64,7 +64,7 @@ upload_gs_state(struct brw_context *brw)
      if (brw->gs.prog_data->base.base.total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
      }
--- a/src/mesa/drivers/dri/i965/gen7_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c
@@ -83,7 +83,7 @@ gen7_upload_hs_state(struct brw_context *brw)
      if (prog_data->base.total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
      }
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -56,7 +56,7 @@ upload_vs_state(struct brw_context *brw)
   if (prog_data->base.total_scratch) {
      OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -235,7 +235,7 @@ gen7_upload_ps_state(struct brw_context *brw,
   if (prog_data->base.total_scratch) {
      OUT_RELOC(brw->wm.base.scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c
@@ -52,7 +52,7 @@ gen8_upload_ds_state(struct brw_context *brw)
      if (prog_data->total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -57,7 +57,7 @@ gen8_upload_gs_state(struct brw_context *brw)
      if (brw->gs.prog_data->base.base.total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i965/gen8_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c
@@ -52,7 +52,7 @@ gen8_upload_hs_state(struct brw_context *brw)
      if (prog_data->base.total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -255,7 +255,7 @@ gen8_upload_ps_state(struct brw_context *brw,
   if (prog_data->base.total_scratch) {
      OUT_RELOC64(stage_state->scratch_bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -58,7 +58,7 @@ upload_vs_state(struct brw_context *brw)
   if (prog_data->base.total_scratch) {
      OUT_RELOC64(stage_state->scratch_bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);