2014-11-13 21:19:28 -08:00
|
|
|
/*
|
|
|
|
* Copyright © 2014 Intel Corporation
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
* Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Jason Ekstrand (jason@jlekstrand.net)
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _NIR_SEARCH_
|
|
|
|
#define _NIR_SEARCH_
|
|
|
|
|
|
|
|
#include "nir.h"
|
|
|
|
|
|
|
|
/* Exclusive upper bound on nir_search_variable::variable indices. */
#define NIR_SEARCH_MAX_VARIABLES 16
|
|
|
|
|
2018-10-22 14:08:13 -05:00
|
|
|
/* Forward declaration; this header only refers to nir_builder by pointer. */
struct nir_builder;
|
|
|
|
|
2014-11-13 21:19:28 -08:00
|
|
|
/**
 * Tag stored in nir_search_value::type.
 *
 * It identifies which concrete node (nir_search_expression,
 * nir_search_variable or nir_search_constant) a nir_search_value is
 * embedded in.
 */
typedef enum {
   nir_search_value_expression,
   nir_search_value_variable,
   nir_search_value_constant,
} nir_search_value_type;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
nir_search_value_type type;
|
2016-04-25 12:23:38 -07:00
|
|
|
|
nir/algebraic: Rewrite bit-size inference
Before this commit, there were two copies of the algorithm: one in C,
that we would use to figure out what bit-size to give the replacement
expression, and one in Python, that emulated the C one and tried to
prove that the C algorithm would never fail to correctly assign
bit-sizes. That seemed pretty fragile, and likely to fall over if we
make any changes. Furthermore, the C code was really just recomputing
more-or-less the same thing as the Python code every time. Instead, we
can just store the results of the Python algorithm in the C
datastructure, and consult it to compute the bitsize of each value,
moving the "brains" entirely into Python. Since the Python algorithm no
longer has to match C, it's also a lot easier to change it to something
more closely approximating an actual type-inference algorithm. The
algorithm used is based on Hindley-Milner, although deliberately
weakened a little. It's a few more lines than the old one, judging by
the diffstat, but I think it's easier to verify that it's correct while
being as general as possible.
We could split this up into two changes, first making the C code use the
results of the Python code and then rewriting the Python algorithm, but
since the old algorithm never tracked which variable each equivalence
class, it would mean we'd have to add some non-trivial code which would
then get thrown away. I think it's better to see the final state all at
once, although I could also try splitting it up.
v2:
- Replace instances of "== None" and "!= None" with "is None" and
"is not None".
- Rename first_src to first_unsized_src
- Only merge the destination with the first unsized source, since the
sources have already been merged.
- Add a comment explaining what nir_search_value::bit_size now means.
v3:
- Fix one last instance to use "is not" instead of !=
- Don't try to be so clever when choosing which error message to print
based on whether we're in the search or replace expression.
- Fix trailing whitespace.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
2018-11-23 17:34:19 +01:00
|
|
|
/**
|
|
|
|
* Bit size of the value. It is interpreted as follows:
|
|
|
|
*
|
|
|
|
* For a search expression:
|
|
|
|
* - If bit_size > 0, then the value only matches an SSA value with the
|
|
|
|
* given bit size.
|
|
|
|
* - If bit_size <= 0, then the value matches any size SSA value.
|
|
|
|
*
|
|
|
|
* For a replace expression:
|
|
|
|
* - If bit_size > 0, then the value is constructed with the given bit size.
|
|
|
|
* - If bit_size == 0, then the value is constructed with the same bit size
|
|
|
|
* as the search value.
|
|
|
|
* - If bit_size < 0, then the value is constructed with the same bit size
|
|
|
|
* as variable (-bit_size - 1).
|
|
|
|
*/
|
|
|
|
int bit_size;
|
2014-11-13 21:19:28 -08:00
|
|
|
} nir_search_value;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
nir_search_value value;
|
|
|
|
|
|
|
|
/** The variable index; Must be less than NIR_SEARCH_MAX_VARIABLES */
|
|
|
|
unsigned variable;
|
2015-01-22 14:15:27 -08:00
|
|
|
|
|
|
|
/** Indicates that the given variable must be a constant
|
|
|
|
*
|
2016-05-09 12:46:13 -04:00
|
|
|
* This is only allowed in search expressions and indicates that the
|
2015-01-22 14:15:27 -08:00
|
|
|
* given variable is only allowed to match constant values.
|
|
|
|
*/
|
|
|
|
bool is_constant;
|
2015-01-28 16:29:21 -08:00
|
|
|
|
|
|
|
/** Indicates that the given variable must have a certain type
|
|
|
|
*
|
|
|
|
* This is only allowed in search expressions and indicates that the
|
|
|
|
* given variable is only allowed to match values that come from an ALU
|
|
|
|
* instruction with the given output type. A type of nir_type_void
|
|
|
|
* means it can match any type.
|
|
|
|
*
|
|
|
|
* Note: A variable that is both constant and has a non-void type will
|
|
|
|
* never match anything.
|
|
|
|
*/
|
|
|
|
nir_alu_type type;
|
2016-05-07 13:01:24 -04:00
|
|
|
|
|
|
|
/** Optional condition fxn ptr
|
|
|
|
*
|
|
|
|
* This is only allowed in search expressions, and allows additional
|
|
|
|
* constraints to be placed on the match. Typically used for 'is_constant'
|
|
|
|
* variables to require, for example, power-of-two in order for the search
|
|
|
|
* to match.
|
|
|
|
*/
|
|
|
|
bool (*cond)(nir_alu_instr *instr, unsigned src,
|
|
|
|
unsigned num_components, const uint8_t *swizzle);
|
2014-11-13 21:19:28 -08:00
|
|
|
} nir_search_variable;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
nir_search_value value;
|
|
|
|
|
2015-08-14 11:45:30 -07:00
|
|
|
nir_alu_type type;
|
|
|
|
|
2014-11-13 21:19:28 -08:00
|
|
|
union {
|
2015-08-14 11:45:30 -07:00
|
|
|
uint64_t u;
|
|
|
|
int64_t i;
|
|
|
|
double d;
|
2014-11-13 21:19:28 -08:00
|
|
|
} data;
|
|
|
|
} nir_search_constant;
|
|
|
|
|
2018-11-07 15:40:02 -06:00
|
|
|
enum nir_search_op {
|
|
|
|
nir_search_op_i2f = nir_last_opcode + 1,
|
|
|
|
nir_search_op_u2f,
|
|
|
|
nir_search_op_f2f,
|
|
|
|
nir_search_op_f2u,
|
|
|
|
nir_search_op_f2i,
|
|
|
|
nir_search_op_u2u,
|
|
|
|
nir_search_op_i2i,
|
2018-11-07 13:43:40 -06:00
|
|
|
nir_search_op_b2f,
|
|
|
|
nir_search_op_b2i,
|
|
|
|
nir_search_op_i2b,
|
|
|
|
nir_search_op_f2b,
|
nir/search: Add automaton-based pre-searching
nir_opt_algebraic is currently one of the most expensive NIR passes,
because of the many different patterns we've added over the years. Even
though patterns are already sorted by opcode, there are still way too
many patterns for common opcodes like bcsel and fadd, which means that
many patterns are tried but only a few actually match. One way to fix
this is to add a pre-pass over the code that scans it using an automaton
constructed beforehand, similar to the automatons produced by lex and
yacc for parsing source code. This automaton has to walk the SSA graph
and recognize possible pattern matches.
It turns out that the theory to do this is quite mature already, having
been developed for instruction selection as well as other non-compiler
things. I followed the presentation in the dissertation cited in the
code, "Tree algorithms: Two Taxonomies and a Toolkit," trying to keep
the naming similar. To create the automaton, we have to perform
something like the classical NFA to DFA subset construction used by lex,
but it turns out that actually computing the transition table for all
possible states would be way too expensive, with the dissertation
reporting times of almost half an hour for an example of size similar to
nir_opt_algebraic. Instead, we adopt one of the "filter" approaches
explained in the dissertation, which trade much faster table generation
and table size for a few more table lookups per instruction at runtime.
I chose the filter which resulted the fastest table generation time,
with medium table size. Right now, the table generation takes around .5
seconds, despite being implemented in pure Python, which I think is good
enough. Based on the numbers in the dissertation, the other choice might
make table compilation time 25x slower to get 4x smaller table size, but
I don't think that's worth it. As of now, we get the following binary
size before and after this patch:
text data bss dec hex filename
11979455 464720 730864 13175039 c908ff before i965_dri.so
text data bss dec hex filename
12037835 616244 791792 13445871 cd2aef after i965_dri.so
There are a number of places where I've simplified the automaton by
getting rid of details in the LHS patterns rather than complicate things
to deal with them. For example, right now the automaton doesn't
distinguish between constants with different values. This means that it
isn't as precise as it could be, but the decrease in compile time is
still worth it -- these are the compilation time numbers for a shader-db
run with my (admittedly old) database on Intel skylake:
Difference at 95.0% confidence
-42.3485 +/- 1.375
-7.20383% +/- 0.229926%
(Student's t, pooled s = 1.69843)
We can always experiment with making it more precise later.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-02-18 14:20:34 +01:00
|
|
|
nir_num_search_ops,
|
2018-11-07 15:40:02 -06:00
|
|
|
};
|
|
|
|
|
nir/search: Add automaton-based pre-searching
nir_opt_algebraic is currently one of the most expensive NIR passes,
because of the many different patterns we've added over the years. Even
though patterns are already sorted by opcode, there are still way too
many patterns for common opcodes like bcsel and fadd, which means that
many patterns are tried but only a few actually match. One way to fix
this is to add a pre-pass over the code that scans it using an automaton
constructed beforehand, similar to the automatons produced by lex and
yacc for parsing source code. This automaton has to walk the SSA graph
and recognize possible pattern matches.
It turns out that the theory to do this is quite mature already, having
been developed for instruction selection as well as other non-compiler
things. I followed the presentation in the dissertation cited in the
code, "Tree algorithms: Two Taxonomies and a Toolkit," trying to keep
the naming similar. To create the automaton, we have to perform
something like the classical NFA to DFA subset construction used by lex,
but it turns out that actually computing the transition table for all
possible states would be way too expensive, with the dissertation
reporting times of almost half an hour for an example of size similar to
nir_opt_algebraic. Instead, we adopt one of the "filter" approaches
explained in the dissertation, which trade much faster table generation
and table size for a few more table lookups per instruction at runtime.
I chose the filter which resulted the fastest table generation time,
with medium table size. Right now, the table generation takes around .5
seconds, despite being implemented in pure Python, which I think is good
enough. Based on the numbers in the dissertation, the other choice might
make table compilation time 25x slower to get 4x smaller table size, but
I don't think that's worth it. As of now, we get the following binary
size before and after this patch:
text data bss dec hex filename
11979455 464720 730864 13175039 c908ff before i965_dri.so
text data bss dec hex filename
12037835 616244 791792 13445871 cd2aef after i965_dri.so
There are a number of places where I've simplified the automaton by
getting rid of details in the LHS patterns rather than complicate things
to deal with them. For example, right now the automaton doesn't
distinguish between constants with different values. This means that it
isn't as precise as it could be, but the decrease in compile time is
still worth it -- these are the compilation time numbers for a shader-db
run with my (admittedly old) database on Intel skylake:
Difference at 95.0% confidence
-42.3485 +/- 1.375
-7.20383% +/- 0.229926%
(Student's t, pooled s = 1.69843)
We can always experiment with making it more precise later.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-02-18 14:20:34 +01:00
|
|
|
/* NOTE(review): defined elsewhere.  Judging by the name and the uint16_t
 * return type (the same type as nir_search_expression::opcode), this
 * presumably maps a nir_op to the opcode value used for matching, i.e. a
 * nir_search_op where one applies -- confirm against the definition.
 */
uint16_t nir_search_op_for_nir_op(nir_op op);
|
|
|
|
|
2014-11-13 21:19:28 -08:00
|
|
|
typedef struct {
|
|
|
|
nir_search_value value;
|
|
|
|
|
2016-03-17 11:04:49 -07:00
|
|
|
/* When set on a search expression, the expression will only match an SSA
|
|
|
|
* value that does *not* have the exact bit set. If unset, the exact bit
|
|
|
|
* on the SSA value is ignored.
|
|
|
|
*/
|
|
|
|
bool inexact;
|
|
|
|
|
nir/search: Search for all combinations of commutative ops
Consider the following search expression and NIR sequence:
('iadd', ('imul', a, b), b)
ssa_2 = imul ssa_0, ssa_1
ssa_3 = iadd ssa_2, ssa_0
The current algorithm is greedy and, the moment the imul finds a match,
it commits those variable names and returns success. In the above
example, it maps a -> ssa_0 and b -> ssa_1. When we then try to match
the iadd, it sees that ssa_0 is not b and fails to match. The iadd
match will attempt to flip itself and try again (which won't work) but
it cannot ask the imul to try a flipped match.
This commit instead counts the number of commutative ops in each
expression and assigns an index to each. It then does a loop and loops
over the full combinatorial matrix of commutative operations. In order
to keep things sane, we limit it to at most 4 commutative operations (16
combinations). There is only one optimization in opt_algebraic that
goes over this limit and it's the bitfieldReverse detection for some UE4
demo.
Shader-db results on Kaby Lake:
total instructions in shared programs: 15310125 -> 15302469 (-0.05%)
instructions in affected programs: 1797123 -> 1789467 (-0.43%)
helped: 6751
HURT: 2264
total cycles in shared programs: 357346617 -> 357202526 (-0.04%)
cycles in affected programs: 15931005 -> 15786914 (-0.90%)
helped: 6024
HURT: 3436
total loops in shared programs: 4360 -> 4360 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0
total spills in shared programs: 23675 -> 23666 (-0.04%)
spills in affected programs: 235 -> 226 (-3.83%)
helped: 5
HURT: 1
total fills in shared programs: 32040 -> 32032 (-0.02%)
fills in affected programs: 190 -> 182 (-4.21%)
helped: 6
HURT: 2
LOST: 18
GAINED: 5
Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
2019-03-22 17:45:29 -05:00
|
|
|
/* Commutative expression index. This is assigned by opt_algebraic.py when
|
|
|
|
* search structures are constructed and is a unique (to this structure)
|
|
|
|
* index within the commutative operation bitfield used for searching for
|
|
|
|
* all combinations of expressions containing commutative operations.
|
|
|
|
*/
|
|
|
|
int8_t comm_expr_idx;
|
|
|
|
|
|
|
|
/* Number of commutative expressions in this expression including this one
|
|
|
|
* (if it is commutative).
|
|
|
|
*/
|
|
|
|
uint8_t comm_exprs;
|
|
|
|
|
2018-11-07 15:40:02 -06:00
|
|
|
/* One of nir_op or nir_search_op */
|
|
|
|
uint16_t opcode;
|
2014-11-13 21:19:28 -08:00
|
|
|
const nir_search_value *srcs[4];
|
2017-01-10 15:47:31 +11:00
|
|
|
|
|
|
|
/** Optional condition fxn ptr
|
|
|
|
*
|
|
|
|
* This allows additional constraints on expression matching, it is
|
|
|
|
* typically used to match an expressions uses such as the number of times
|
|
|
|
* the expression is used, and whether its used by an if.
|
|
|
|
*/
|
|
|
|
bool (*cond)(nir_alu_instr *instr);
|
2014-11-13 21:19:28 -08:00
|
|
|
} nir_search_expression;
|
|
|
|
|
|
|
|
NIR_DEFINE_CAST(nir_search_value_as_variable, nir_search_value,
|
2016-10-05 18:09:25 -07:00
|
|
|
nir_search_variable, value,
|
|
|
|
type, nir_search_value_variable)
|
2014-11-13 21:19:28 -08:00
|
|
|
NIR_DEFINE_CAST(nir_search_value_as_constant, nir_search_value,
|
2016-10-05 18:09:25 -07:00
|
|
|
nir_search_constant, value,
|
|
|
|
type, nir_search_value_constant)
|
2014-11-13 21:19:28 -08:00
|
|
|
NIR_DEFINE_CAST(nir_search_value_as_expression, nir_search_value,
|
2016-10-05 18:09:25 -07:00
|
|
|
nir_search_expression, value,
|
|
|
|
type, nir_search_value_expression)
|
2014-11-13 21:19:28 -08:00
|
|
|
|
2018-10-22 14:08:13 -05:00
|
|
|
nir_ssa_def *
|
|
|
|
nir_replace_instr(struct nir_builder *b, nir_alu_instr *instr,
|
|
|
|
const nir_search_expression *search,
|
|
|
|
const nir_search_value *replace);
|
2014-11-13 21:19:28 -08:00
|
|
|
|
|
|
|
#endif /* _NIR_SEARCH_ */
|