radeonsi: add Wave32 heuristics and shader profiles
This generally works well. There are new cases that select Wave32, and there are shader profiles which adjust that. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13966>
This commit is contained in:
@@ -65,6 +65,7 @@ static const struct debug_named_value radeonsi_debug_options[] = {
|
||||
{"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
|
||||
{"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
|
||||
{"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
|
||||
{"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."},
|
||||
{"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
|
||||
{"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
|
||||
{"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
|
||||
|
@@ -197,6 +197,7 @@ enum
|
||||
DBG_GISEL,
|
||||
DBG_W32_GE,
|
||||
DBG_W32_PS,
|
||||
DBG_W32_PS_DISCARD,
|
||||
DBG_W32_CS,
|
||||
DBG_W64_GE,
|
||||
DBG_W64_PS,
|
||||
|
@@ -288,6 +288,10 @@ enum
|
||||
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
|
||||
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
|
||||
|
||||
#define SI_PROFILE_WAVE32 (1 << 0)
|
||||
#define SI_PROFILE_WAVE64 (1 << 1)
|
||||
#define SI_PROFILE_IGNORE_LLVM_DISCARD_BUG (1 << 2)
|
||||
|
||||
/**
|
||||
* For VS shader keys, describe any fixups required for vertex fetch.
|
||||
*
|
||||
@@ -344,6 +348,7 @@ struct si_shader_info {
|
||||
shader_info base;
|
||||
|
||||
gl_shader_stage stage;
|
||||
uint32_t options; /* bitmask of SI_PROFILE_* */
|
||||
|
||||
ubyte num_inputs;
|
||||
ubyte num_outputs;
|
||||
@@ -404,6 +409,7 @@ struct si_shader_info {
|
||||
bool uses_bindless_samplers;
|
||||
bool uses_bindless_images;
|
||||
bool uses_indirect_descriptor;
|
||||
bool has_divergent_loop;
|
||||
|
||||
bool uses_vmem_return_type_sampler_or_bvh;
|
||||
bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */
|
||||
|
@@ -31,6 +31,29 @@
|
||||
#include "si_pipe.h"
|
||||
#include "si_shader_internal.h"
|
||||
#include "tgsi/tgsi_from_mesa.h"
|
||||
#include "util/mesa-sha1.h"
|
||||
|
||||
|
||||
struct si_shader_profile {
|
||||
uint32_t sha1[SHA1_DIGEST_LENGTH32];
|
||||
uint32_t options;
|
||||
};
|
||||
|
||||
static struct si_shader_profile profiles[] =
|
||||
{
|
||||
{
|
||||
/* Viewperf/Energy isn't affected by the discard bug. */
|
||||
{0x17118671, 0xd0102e0c, 0x947f3592, 0xb2057e7b, 0x4da5d9b0},
|
||||
SI_PROFILE_IGNORE_LLVM_DISCARD_BUG,
|
||||
},
|
||||
{
|
||||
/* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
|
||||
* probably due to interpolation performance.
|
||||
*/
|
||||
{0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8},
|
||||
SI_PROFILE_WAVE64,
|
||||
},
|
||||
};
|
||||
|
||||
static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type)
|
||||
{
|
||||
@@ -397,6 +420,14 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
|
||||
info->base = nir->info;
|
||||
info->stage = nir->info.stage;
|
||||
|
||||
/* Get options from shader profiles. */
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(profiles); i++) {
|
||||
if (_mesa_printed_sha1_equal(info->base.source_sha1, profiles[i].sha1)) {
|
||||
info->options = profiles[i].options;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||
if (info->base.tess.primitive_mode == GL_ISOLINES)
|
||||
info->base.tess.primitive_mode = GL_LINES;
|
||||
@@ -531,6 +562,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
|
||||
/* Trim output read masks based on write masks. */
|
||||
for (unsigned i = 0; i < info->num_outputs; i++)
|
||||
info->output_readmask[i] &= info->output_usagemask[i];
|
||||
|
||||
info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir);
|
||||
}
|
||||
|
||||
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
|
||||
@@ -932,5 +965,8 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
|
||||
if (sscreen->options.inline_uniforms)
|
||||
nir_find_inlinable_uniforms(nir);
|
||||
|
||||
NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
|
||||
NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
@@ -52,13 +52,87 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
|
||||
(stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg))
|
||||
return 64;
|
||||
|
||||
if (stage == MESA_SHADER_COMPUTE)
|
||||
return sscreen->debug_flags & DBG(W32_CS) ? 32 : 64;
|
||||
/* Small workgroups use Wave32 unconditionally. */
|
||||
if (stage == MESA_SHADER_COMPUTE && info &&
|
||||
!info->base.workgroup_size_variable &&
|
||||
info->base.workgroup_size[0] *
|
||||
info->base.workgroup_size[1] *
|
||||
info->base.workgroup_size[2] <= 32)
|
||||
return 32;
|
||||
|
||||
if (stage == MESA_SHADER_FRAGMENT)
|
||||
return sscreen->debug_flags & DBG(W32_PS) ? 32 : 64;
|
||||
/* Debug flags. */
|
||||
unsigned dbg_wave_size = 0;
|
||||
if (sscreen->debug_flags &
|
||||
(stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
|
||||
stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE)))
|
||||
dbg_wave_size = 32;
|
||||
|
||||
return sscreen->debug_flags & DBG(W32_GE) ? 32 : 64;
|
||||
if (sscreen->debug_flags &
|
||||
(stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
|
||||
stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) {
|
||||
assert(!dbg_wave_size);
|
||||
dbg_wave_size = 64;
|
||||
}
|
||||
|
||||
/* Shader profiles. */
|
||||
unsigned profile_wave_size = 0;
|
||||
if (info && info->options & SI_PROFILE_WAVE32)
|
||||
profile_wave_size = 32;
|
||||
|
||||
if (info && info->options & SI_PROFILE_WAVE64) {
|
||||
assert(!profile_wave_size);
|
||||
profile_wave_size = 64;
|
||||
}
|
||||
|
||||
if (profile_wave_size) {
|
||||
/* Only debug flags override shader profiles. */
|
||||
if (dbg_wave_size)
|
||||
return dbg_wave_size;
|
||||
|
||||
return profile_wave_size;
|
||||
}
|
||||
|
||||
/* LLVM 13 and 14 have a bug that causes compile failures with discard in Wave32
|
||||
* in some cases. Alpha test in Wave32 is luckily unaffected.
|
||||
*/
|
||||
if (stage == MESA_SHADER_FRAGMENT && info->base.fs.uses_discard &&
|
||||
!(info && info->options & SI_PROFILE_IGNORE_LLVM_DISCARD_BUG) &&
|
||||
LLVM_VERSION_MAJOR >= 13 && !(sscreen->debug_flags & DBG(W32_PS_DISCARD)))
|
||||
return 64;
|
||||
|
||||
/* Debug flags except w32psdiscard don't override the discard bug workaround,
|
||||
* but they override everything else.
|
||||
*/
|
||||
if (dbg_wave_size)
|
||||
return dbg_wave_size;
|
||||
|
||||
/* Pixel shaders without interp instructions don't suffer from reduced interpolation
|
||||
* performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
|
||||
*/
|
||||
if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
|
||||
return 32;
|
||||
|
||||
/* There are a few very rare cases where VS is better with Wave32, and there are no known
|
||||
* cases where Wave64 is better.
|
||||
*/
|
||||
if (stage <= MESA_SHADER_GEOMETRY)
|
||||
return 32;
|
||||
|
||||
/* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
|
||||
* individual shaders of merged shaders to match the wave size between them.
|
||||
*/
|
||||
bool merged_shader = shader && !shader->is_gs_copy_shader &&
|
||||
(shader->key.ge.as_ls || shader->key.ge.as_es ||
|
||||
stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
/* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
|
||||
* while the other half is idling but occupying VGPRs, preventing other waves from launching.
|
||||
* Wave32 eliminates the idling half to allow the next wave to start.
|
||||
*/
|
||||
if (!merged_shader && info && info->has_divergent_loop)
|
||||
return 32;
|
||||
|
||||
return 64;
|
||||
}
|
||||
|
||||
/* SHADER_CACHE */
|
||||
|
Reference in New Issue
Block a user