nir/loop_analyze: Determine iteration counts for more kinds of loops

If the loop iterator is incremented with something other than regular
addition, calculating the number of iterations theoretically is more
error prone. What we can do instead is emulate the loop and determine
the number of iterations empirically.

These operations are covered:
 - imul
 - fmul
 - ishl
 - ishr
 - ushr

Also add unit tests for loop unrolling.

Improves performance of Aztec Ruins (sixonix
gfxbench5.aztec_ruins_vk_high) by -1.28042% +/- 0.498555% (N=5) on Intel
Arc A770.

v2 (idr): Rebase on 3 years. :( Use nir_phi_instr_add_src in the test
cases.

v3 (idr): Use try_eval_const_alu to evaluate the loop termination
condition in get_iteration_empirical. Also restructure the loop
slightly. This fixed off-by-one iteration errors in "inverted" loop
tests (e.g., nir_loop_analyze_test.ushr_ieq_known_count_invert_31).

v4 (idr): Use try_eval_const_alu to evaluate the induction variable
update in get_iteration_empirical. This fixes non-commutative update
operations (e.g., shifts) when the induction variable is not the first
source. This fixes the unit test
nir_loop_analyze_test.ishl_rev_ieq_infinite_loop_unknown_count.

v5 (idr): Fix _type parameter for fadd and fadd_rev loop unroll
tests. Hopefully that fixes the failure on s390x. Temporarily disable
fmul. This works around the revealed problem in
glsl-fs-loop-unroll-mul-fp64, and there were no shader-db or fossil-db
changes.

v6 (idr): Plumb max_unroll_iterations into get_iteration_empirical. I
was going to do this, but I forgot. Suggested by Tim.

v7 (idr): Disable fadd tests on s390x. They fail because S390 is weird.

Almost all of the shaders affected (OpenGL or Vulkan) are from gfxbench
or geekbench. A couple shaders in Deus Ex (OpenGL), Dirt Rally (OpenGL),
Octopath Traveler (Vulkan), and Rise of the Tomb Raider (Vulkan) are
helped.

The lost / gained shaders in OpenGL are an Aztec Ruins shader that goes
from SIMD16 to SIMD8. The spills / fills affected are in a single Aztec
Ruins (Vulkan) compute shader.

shader-db results:

Skylake, Ice Lake, and Tiger Lake had similar results. (Tiger Lake shown)
total loops in shared programs: 5514 -> 5470 (-0.80%)
loops in affected programs: 62 -> 18 (-70.97%)
helped: 37 / HURT: 0

LOST:   2
GAINED: 2

Haswell and Broadwell had similar results. (Broadwell shown)
total loops in shared programs: 5346 -> 5298 (-0.90%)
loops in affected programs: 66 -> 18 (-72.73%)
helped: 39 / HURT: 0

fossil-db results:

Skylake, Ice Lake, and Tiger Lake had similar results. (Tiger Lake shown)
Instructions in all programs: 157374679 -> 157397421 (+0.0%)
Instructions hurt: 28

SENDs in all programs: 7463800 -> 7467639 (+0.1%)
SENDs hurt: 28

Loops in all programs: 38980 -> 38950 (-0.1%)
Loops helped: 28

Cycles in all programs: 7559486451 -> 7557455384 (-0.0%)
Cycles helped: 28

Spills in all programs: 11405 -> 11403 (-0.0%)
Spills helped: 1

Fills in all programs: 19578 -> 19588 (+0.1%)
Fills hurt: 1

Lost: 1

Signed-off-by: Yevhenii Kolesnikov <yevhenii.kolesnikov@globallogic.com>
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3445>
This commit is contained in:
Yevhenii Kolesnikov
2020-01-17 13:01:01 +02:00
committed by Marge Bot
parent f051967f19
commit 9427aaeab7
4 changed files with 307 additions and 42 deletions

View File

@@ -907,6 +907,43 @@ get_iteration(nir_op cond_op, nir_const_value initial, nir_const_value step,
return iter_u64 > INT_MAX ? -1 : (int)iter_u64;
}
/**
 * Determine a loop's trip count by emulating it with constant values.
 *
 * Starting from \p initial, the current induction value is substituted for
 * \p basis while the exit condition (\p cond_alu) is evaluated.  If the loop
 * does not exit, the update (\p incr_alu) is evaluated with the same
 * substitution to produce the next induction value, and the process repeats.
 *
 * \param cond_alu               ALU instruction computing the exit condition.
 * \param incr_alu               ALU instruction computing the next induction
 *                               value.
 * \param basis                  SSA def that is replaced by the current
 *                               induction value during evaluation.
 * \param initial                Induction value used for the first condition
 *                               check.
 * \param invert_cond            When true, the evaluated condition is negated
 *                               before being tested.
 * \param execution_mode         Forwarded unchanged to try_eval_const_alu.
 * \param max_unroll_iterations  Largest iteration count that will be
 *                               emulated.
 *
 * \return The number of update steps performed before the (possibly
 *         inverted) condition became true, or -1 if either instruction could
 *         not be evaluated or the condition did not become true within
 *         \p max_unroll_iterations steps.
 */
static int32_t
get_iteration_empirical(nir_alu_instr *cond_alu, nir_alu_instr *incr_alu,
                        nir_ssa_def *basis, nir_const_value initial,
                        bool invert_cond, unsigned execution_mode,
                        unsigned max_unroll_iterations)
{
   int iter_count = 0;
   nir_const_value result;
   nir_const_value iter = initial;

   /* Only one substitution is made: basis -> current induction value.  The
    * replacement is passed by pointer, so updating `iter` below is enough to
    * feed the new value into the next evaluation.
    */
   const nir_ssa_def *originals[2] = { basis, NULL };
   const nir_const_value *replacements[2] = { &iter, NULL };

   while (iter_count <= max_unroll_iterations) {
      bool success;

      success = try_eval_const_alu(&result, cond_alu, originals, replacements,
                                   1, execution_mode);
      if (!success)
         return -1;

      const bool cond_succ = invert_cond ? !result.b : result.b;
      if (cond_succ)
         return iter_count;

      iter_count++;

      success = try_eval_const_alu(&result, incr_alu, originals, replacements,
                                   1, execution_mode);

      /* The update is expected to always be evaluable here, so trap in debug
       * builds.  In release builds the assert compiles away; report an
       * unknown trip count rather than continuing with a stale `result`
       * (which would still hold the condition value from above).
       */
      assert(success);
      if (!success)
         return -1;

      iter = result;
   }

   return -1;
}
static bool
will_break_on_first_iteration(nir_alu_instr *cond_alu, nir_ssa_def *basis,
nir_ssa_def *limit_basis,
@@ -980,7 +1017,8 @@ calculate_iterations(nir_ssa_def *basis, nir_ssa_def *limit_basis,
nir_const_value initial, nir_const_value step,
nir_const_value limit, nir_alu_instr *alu,
nir_ssa_scalar cond, nir_op alu_op, bool limit_rhs,
bool invert_cond, unsigned execution_mode)
bool invert_cond, unsigned execution_mode,
unsigned max_unroll_iterations)
{
/* nir_op_isub should have been lowered away by this point */
assert(alu->op != nir_op_isub);
@@ -1027,6 +1065,12 @@ calculate_iterations(nir_ssa_def *basis, nir_ssa_def *limit_basis,
return 0;
}
/* For loops incremented with addition operation, it's easy to
* calculate the number of iterations theoretically. Even though it
* is possible for other operations as well, it is much more error
* prone, and doesn't cover all possible cases. So, we try to
* emulate the loop.
*/
int iter_int;
switch (alu->op) {
case nir_op_iadd:
@@ -1037,12 +1081,20 @@ calculate_iterations(nir_ssa_def *basis, nir_ssa_def *limit_basis,
iter_int = get_iteration(alu_op, initial, step, limit, bit_size,
execution_mode);
break;
case nir_op_imul:
case nir_op_fmul:
/* Detecting non-zero loop counts when the loop increment is floating
* point multiplication triggers a preexisting problem in
* glsl-fs-loop-unroll-mul-fp64.shader_test. See
* https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3445#note_1779438.
*/
return -1;
case nir_op_imul:
case nir_op_ishl:
case nir_op_ishr:
case nir_op_ushr:
return -1;
return get_iteration_empirical(cond_alu, alu, basis, initial,
invert_cond, execution_mode,
max_unroll_iterations);
default:
unreachable("Invalid induction variable increment operation.");
}
@@ -1206,7 +1258,8 @@ try_find_trip_count_vars_in_iand(nir_ssa_scalar *cond,
* loop.
*/
static void
find_trip_count(loop_info_state *state, unsigned execution_mode)
find_trip_count(loop_info_state *state, unsigned execution_mode,
unsigned max_unroll_iterations)
{
bool trip_count_known = true;
bool guessed_trip_count = false;
@@ -1329,7 +1382,8 @@ find_trip_count(loop_info_state *state, unsigned execution_mode)
cond,
alu_op, limit_rhs,
invert_cond,
execution_mode);
execution_mode,
max_unroll_iterations);
/* Were we not able to calculate the iteration count? */
if (iterations == -1) {
@@ -1488,7 +1542,9 @@ get_loop_info(loop_info_state *state, nir_function_impl *impl)
return;
/* Run through each of the terminators and try to compute a trip-count */
find_trip_count(state, impl->function->shader->info.float_controls_execution_mode);
find_trip_count(state,
impl->function->shader->info.float_controls_execution_mode,
impl->function->shader->options->max_unroll_iterations);
nir_foreach_block_in_cf_node(block, &state->loop->cf_node) {
nir_foreach_instr(instr, block) {