vtn: Remove transpose(m0)*m1 fast path
This is broken for games that rely on invariant geometry, since the way
matrices are used can affect how gl_Position is computed. The correctness
of the fdot fast path depended on whether and how fdot is lowered.
Totals from 6578 (7.73% of 85071) affected shaders:
MaxWaves: 147190 -> 147170 (-0.01%)
Instrs: 4451406 -> 4438140 (-0.30%); split: -0.31%, +0.01%
CodeSize: 23553020 -> 23541772 (-0.05%); split: -0.07%, +0.03%
VGPRs: 302304 -> 302328 (+0.01%)
SpillSGPRs: 1309 -> 1329 (+1.53%)
Latency: 22509985 -> 22177164 (-1.48%); split: -1.48%, +0.00%
InvThroughput: 4862795 -> 4842951 (-0.41%); split: -0.41%, +0.01%
VClause: 85035 -> 84998 (-0.04%); split: -0.06%, +0.02%
SClause: 131008 -> 131055 (+0.04%); split: -0.02%, +0.05%
Copies: 298935 -> 298060 (-0.29%); split: -0.71%, +0.41%
PreSGPRs: 266833 -> 267292 (+0.17%); split: -0.85%, +1.03%
PreVGPRs: 249511 -> 249601 (+0.04%)
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9562
cc: mesa-stable
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26821>
(cherry picked from commit 4d02543853)
This commit is contained in:

committed by
Eric Engestrom

parent
e7dd2637a5
commit
a3bcb52482
@@ -94,38 +94,16 @@ matrix_multiply(struct vtn_builder *b,
|
||||
transpose_result = true;
|
||||
}
|
||||
|
||||
if (src0_transpose && !src1_transpose &&
|
||||
glsl_get_base_type(src0->type) == GLSL_TYPE_FLOAT) {
|
||||
/* We already have the rows of src0 and the columns of src1 available,
|
||||
* so we can just take the dot product of each row with each column to
|
||||
* get the result.
|
||||
*/
|
||||
|
||||
for (unsigned i = 0; i < src1_columns; i++) {
|
||||
nir_def *vec_src[4];
|
||||
for (unsigned j = 0; j < src0_rows; j++) {
|
||||
vec_src[j] = nir_fdot(&b->nb, src0_transpose->elems[j]->def,
|
||||
src1->elems[i]->def);
|
||||
}
|
||||
dest->elems[i]->def = nir_vec(&b->nb, vec_src, src0_rows);
|
||||
}
|
||||
} else {
|
||||
/* We don't handle the case where src1 is transposed but not src0, since
|
||||
* the general case only uses individual components of src1 so the
|
||||
* optimizer should chew through the transpose we emitted for src1.
|
||||
*/
|
||||
|
||||
for (unsigned i = 0; i < src1_columns; i++) {
|
||||
/* dest[i] = sum(src0[j] * src1[i][j] for all j) */
|
||||
for (unsigned i = 0; i < src1_columns; i++) {
|
||||
/* dest[i] = sum(src0[j] * src1[i][j] for all j) */
|
||||
dest->elems[i]->def =
|
||||
nir_fmul(&b->nb, src0->elems[src0_columns - 1]->def,
|
||||
nir_channel(&b->nb, src1->elems[i]->def, src0_columns - 1));
|
||||
for (int j = src0_columns - 2; j >= 0; j--) {
|
||||
dest->elems[i]->def =
|
||||
nir_fmul(&b->nb, src0->elems[src0_columns - 1]->def,
|
||||
nir_channel(&b->nb, src1->elems[i]->def, src0_columns - 1));
|
||||
for (int j = src0_columns - 2; j >= 0; j--) {
|
||||
dest->elems[i]->def =
|
||||
nir_ffma(&b->nb, src0->elems[j]->def,
|
||||
nir_channel(&b->nb, src1->elems[i]->def, j),
|
||||
dest->elems[i]->def);
|
||||
}
|
||||
nir_ffma(&b->nb, src0->elems[j]->def,
|
||||
nir_channel(&b->nb, src1->elems[i]->def, j),
|
||||
dest->elems[i]->def);
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user