radeonsi: add Wave32 heuristics and shader profiles

This generally works well.

There are new cases that select Wave32, and there are shader profiles
which adjust that.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13966>
This commit is contained in:
Marek Olšák
2021-11-19 18:36:03 -05:00
committed by Marge Bot
parent e2a1883337
commit b3b2f97f2e
5 changed files with 123 additions and 5 deletions

View File

@@ -65,6 +65,7 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
{"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
{"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
{"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."},
{"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
{"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
{"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},

View File

@@ -197,6 +197,7 @@ enum
DBG_GISEL,
DBG_W32_GE,
DBG_W32_PS,
DBG_W32_PS_DISCARD,
DBG_W32_CS,
DBG_W64_GE,
DBG_W64_PS,

View File

@@ -288,6 +288,10 @@ enum
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
#define SI_PROFILE_WAVE32 (1 << 0)
#define SI_PROFILE_WAVE64 (1 << 1)
#define SI_PROFILE_IGNORE_LLVM_DISCARD_BUG (1 << 2)
/**
* For VS shader keys, describe any fixups required for vertex fetch.
*
@@ -344,6 +348,7 @@ struct si_shader_info {
shader_info base;
gl_shader_stage stage;
uint32_t options; /* bitmask of SI_PROFILE_* */
ubyte num_inputs;
ubyte num_outputs;
@@ -404,6 +409,7 @@ struct si_shader_info {
bool uses_bindless_samplers;
bool uses_bindless_images;
bool uses_indirect_descriptor;
bool has_divergent_loop;
bool uses_vmem_return_type_sampler_or_bvh;
bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */

View File

@@ -31,6 +31,29 @@
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "tgsi/tgsi_from_mesa.h"
#include "util/mesa-sha1.h"
struct si_shader_profile {
uint32_t sha1[SHA1_DIGEST_LENGTH32];
uint32_t options;
};
static struct si_shader_profile profiles[] =
{
{
/* Viewperf/Energy isn't affected by the discard bug. */
{0x17118671, 0xd0102e0c, 0x947f3592, 0xb2057e7b, 0x4da5d9b0},
SI_PROFILE_IGNORE_LLVM_DISCARD_BUG,
},
{
/* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
* probably due to interpolation performance.
*/
{0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8},
SI_PROFILE_WAVE64,
},
};
static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type)
{
@@ -397,6 +420,14 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
info->base = nir->info;
info->stage = nir->info.stage;
/* Get options from shader profiles. */
for (unsigned i = 0; i < ARRAY_SIZE(profiles); i++) {
if (_mesa_printed_sha1_equal(info->base.source_sha1, profiles[i].sha1)) {
info->options = profiles[i].options;
break;
}
}
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
if (info->base.tess.primitive_mode == GL_ISOLINES)
info->base.tess.primitive_mode = GL_LINES;
@@ -531,6 +562,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
/* Trim output read masks based on write masks. */
for (unsigned i = 0; i < info->num_outputs; i++)
info->output_readmask[i] &= info->output_usagemask[i];
info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir);
}
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
@@ -932,5 +965,8 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
if (sscreen->options.inline_uniforms)
nir_find_inlinable_uniforms(nir);
NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */
return NULL;
}

View File

@@ -52,13 +52,87 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
(stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg))
return 64;
if (stage == MESA_SHADER_COMPUTE)
return sscreen->debug_flags & DBG(W32_CS) ? 32 : 64;
/* Small workgroups use Wave32 unconditionally. */
if (stage == MESA_SHADER_COMPUTE && info &&
!info->base.workgroup_size_variable &&
info->base.workgroup_size[0] *
info->base.workgroup_size[1] *
info->base.workgroup_size[2] <= 32)
return 32;
if (stage == MESA_SHADER_FRAGMENT)
return sscreen->debug_flags & DBG(W32_PS) ? 32 : 64;
/* Debug flags. */
unsigned dbg_wave_size = 0;
if (sscreen->debug_flags &
(stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE)))
dbg_wave_size = 32;
return sscreen->debug_flags & DBG(W32_GE) ? 32 : 64;
if (sscreen->debug_flags &
(stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) {
assert(!dbg_wave_size);
dbg_wave_size = 64;
}
/* Shader profiles. */
unsigned profile_wave_size = 0;
if (info && info->options & SI_PROFILE_WAVE32)
profile_wave_size = 32;
if (info && info->options & SI_PROFILE_WAVE64) {
assert(!profile_wave_size);
profile_wave_size = 64;
}
if (profile_wave_size) {
/* Only debug flags override shader profiles. */
if (dbg_wave_size)
return dbg_wave_size;
return profile_wave_size;
}
/* LLVM 13 and 14 have a bug that causes compile failures with discard in Wave32
* in some cases. Alpha test in Wave32 is luckily unaffected.
*/
if (stage == MESA_SHADER_FRAGMENT && info->base.fs.uses_discard &&
!(info && info->options & SI_PROFILE_IGNORE_LLVM_DISCARD_BUG) &&
LLVM_VERSION_MAJOR >= 13 && !(sscreen->debug_flags & DBG(W32_PS_DISCARD)))
return 64;
/* Debug flags except w32psdiscard don't override the discard bug workaround,
* but they override everything else.
*/
if (dbg_wave_size)
return dbg_wave_size;
/* Pixel shaders without interp instructions don't suffer from reduced interpolation
* performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
*/
if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
return 32;
/* There are a few very rare cases where VS is better with Wave32, and there are no known
* cases where Wave64 is better.
*/
if (stage <= MESA_SHADER_GEOMETRY)
return 32;
/* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
* individual shaders of merged shaders to match the wave size between them.
*/
bool merged_shader = shader && !shader->is_gs_copy_shader &&
(shader->key.ge.as_ls || shader->key.ge.as_es ||
stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
/* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
* while the other half is idling but occupying VGPRs, preventing other waves from launching.
* Wave32 eliminates the idling half to allow the next wave to start.
*/
if (!merged_shader && info && info->has_divergent_loop)
return 32;
return 64;
}
/* SHADER_CACHE */