st/nir: Re-vectorize shader IO

We scalarize IO to enable further optimizations, such as propagating
constant components across shaders, eliminating dead components, and
so on.  This patch attempts to re-vectorize those operations after
the varying optimizations are done.

Intel GPUs have a scalar architecture, but their IO operations still work
on whole vec4's at a time, so we'd prefer a single vector IO load over
four scalar IO loads.  This re-vectorization can help a lot.

Broadcom GPUs, however, really do want scalar IO.  radeonsi may want
this, or may prefer to leave it to LLVM.  So we add a new flag to the
NIR compiler options struct and key the new behavior off it, allowing
each driver to pick.  (It's a bit awkward because the options are
per-stage while this is about IO between two stages, but I expect
drivers to prefer one way or the other globally.  We can adjust later
if needed.)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
commit c31b4420e7 (parent 1d0a8cf40d)
Author: Kenneth Graunke
Date:   2019-04-11 12:28:48 -07:00
2 changed files with 31 additions and 0 deletions
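
As a concrete illustration of how a driver would opt in, here is a minimal
sketch that sets the new flag in its nir_shader_compiler_options.  This is
hypothetical code, not part of the commit: the variable name, the include
path, and the lower_fdiv field are stand-ins for a driver's existing
option block.

/* Hypothetical driver-side option block; only vectorize_io is the new bit. */
#include "compiler/nir/nir.h"   /* include path varies per driver */

static const nir_shader_compiler_options example_driver_nir_options = {
   .lower_fdiv = true,     /* stand-in for whatever the driver already sets */
   .vectorize_io = true,   /* ask st/nir to re-vectorize IO after linking */
};

A driver that prefers scalar IO (the Broadcom case above) simply leaves
vectorize_io at its default of false.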

@@ -2349,6 +2349,12 @@ typedef struct nir_shader_compiler_options {
   bool lower_hadd;
   bool lower_add_sat;

   /**
    * Should IO be re-vectorized? Some scalar ISAs still operate on vec4's
    * for IO purposes and would prefer loads/stores be vectorized.
    */
   bool vectorize_io;

   /**
    * Should nir_lower_io() create load_interpolated_input intrinsics?
    *

@@ -654,6 +654,28 @@ st_nir_get_mesa_program(struct gl_context *ctx,
   prog->nir = nir;
}

static void
st_nir_vectorize_io(nir_shader *producer, nir_shader *consumer)
{
   NIR_PASS_V(producer, nir_lower_io_to_vector, nir_var_shader_out);
   NIR_PASS_V(producer, nir_opt_combine_stores, nir_var_shader_out);
   NIR_PASS_V(consumer, nir_lower_io_to_vector, nir_var_shader_in);

   if ((producer)->info.stage != MESA_SHADER_TESS_CTRL) {
      /* Calling lower_io_to_vector creates output variable writes with
       * write-masks. We only support these for TCS outputs, so for other
       * stages, we need to call nir_lower_io_to_temporaries to get rid of
       * them. This, in turn, creates temporary variables and extra
       * copy_deref intrinsics that we need to clean up.
       */
      NIR_PASS_V(producer, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(producer), true, false);
      NIR_PASS_V(producer, nir_lower_global_vars_to_local);
      NIR_PASS_V(producer, nir_split_var_copies);
      NIR_PASS_V(producer, nir_lower_var_copies);
   }
}

static void
st_nir_link_shaders(nir_shader **producer, nir_shader **consumer, bool scalar)
{
@@ -844,6 +866,9 @@ st_link_nir(struct gl_context *ctx,
                prev_shader->sh.LinkedTransformFeedback->NumVarying > 0))
            nir_compact_varyings(shader_program->_LinkedShaders[prev]->Program->nir,
                                 nir, ctx->API != API_OPENGL_COMPAT);

         if (ctx->Const.ShaderCompilerOptions[i].NirOptions->vectorize_io)
            st_nir_vectorize_io(prev_shader->nir, nir);
      }
      prev = i;
   }
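
To tie the two hunks together: the IO scalarization happens earlier during
linking (st_nir_link_shaders takes a scalar flag), the cross-stage varying
optimizations run on the scalarized IO, and st_nir_vectorize_io runs last,
gated on the new option.  The condensed helper below only illustrates that
flow; the pass names and the NirOptions lookup are real, but example_link_io
itself is an assumption-laden sketch, not code from this commit.

/* Illustrative only: a condensed view of scalarize -> optimize ->
 * re-vectorize, assuming the surrounding st_glsl_to_nir.cpp context
 * (NIR_PASS_V, the st_nir_vectorize_io helper above, and gl_context
 * are all in scope).
 */
static void
example_link_io(struct gl_context *ctx, gl_shader_stage stage,
                nir_shader *producer, nir_shader *consumer)
{
   /* Scalar IO lets the cross-stage optimizations reason per component. */
   NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
   NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);

   /* ...dead-component elimination, constant propagation, compaction... */

   /* Re-vectorize only when the driver opted in via the new flag. */
   if (ctx->Const.ShaderCompilerOptions[stage].NirOptions->vectorize_io)
      st_nir_vectorize_io(producer, consumer);
}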