intel/compiler: Add and use a pass to generate imul_32x16 instructions

Gfx8 and Gfx9 platforms are helped for cycles because now many
instructions like

    mul(8)          g12<1>D         g10<8,8,1>D     6D

become

    mul(8)          g12<1>D         g10<8,8,1>D     6W

It is the same number of instructions, but the 32x16 multiply is a
little faster.

v2: Fix transposed hi and lo in "(hi >= INT16_MIN && lo <= INT16_MAX)".
Noticed by Caio.  Use nir_src_is_const instead of open coding it.
Suggested by Caio.

Broadwell and Skylake had similar results. (Skylake shown)
total cycles in shared programs: 845748380 -> 845145547 (-0.07%)
cycles in affected programs: 446346348 -> 445743515 (-0.14%)
helped: 6017
HURT: 0
helped stats (abs) min: 2 max: 7380 x̄: 100.19 x̃: 8
helped stats (rel) min: <.01% max: 3.72% x̄: 0.41% x̃: 0.39%
95% mean confidence interval for cycles value: -113.37 -87.00
95% mean confidence interval for cycles %-change: -0.42% -0.41%
Cycles are helped.

Skylake
Cycles in all programs: 8844820715 -> 8828897462 (-0.2%)
Cycles helped: 47914
Cycles hurt: 1

No shader-db or fossil-db changes on any other Intel platform.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
This commit is contained in:
Ian Romanick
2022-02-02 18:49:25 -08:00
committed by Marge Bot
parent 9479e3a19b
commit f90d71055b
4 changed files with 126 additions and 0 deletions

View File

@@ -1286,6 +1286,9 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
OPT(brw_nir_opt_peephole_ffma);
}
if (devinfo->ver >= 7 && is_scalar)
OPT(brw_nir_opt_peephole_imul32x16);
if (OPT(nir_opt_comparison_pre)) {
OPT(nir_copy_prop);
OPT(nir_opt_dce);

View File

@@ -176,6 +176,8 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
bool brw_nir_opt_peephole_ffma(nir_shader *shader);
bool brw_nir_opt_peephole_imul32x16(nir_shader *shader);
void brw_nir_optimize(nir_shader *nir,
const struct brw_compiler *compiler,
bool is_scalar,

View File

@@ -0,0 +1,120 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
/**
* Implement a peephole pass to convert integer multiplications to imul32x16.
*/
/**
 * Substitute a newly created \c new_opcode ALU instruction for \c imul.
 *
 * \param b           Builder whose shader owns the new instruction; its
 *                    cursor is moved to just before \c imul.
 * \param imul        Multiply being replaced.  It is removed and freed, so
 *                    the caller must not touch it afterwards.
 * \param small_val   Index (0 or 1) of the \c imul source that becomes the
 *                    second source of the new instruction; the other source
 *                    becomes the first.
 * \param new_opcode  Opcode of the replacement instruction.
 *
 * The saturate flag, write mask and component count are carried over from
 * \c imul, the new destination is 32 bits wide, and every use of the old
 * result is redirected to the new one.
 */
static void
replace_imul_instr(nir_builder *b, nir_alu_instr *imul, unsigned small_val,
                   nir_op new_opcode)
{
   assert(small_val <= 1);

   /* Whichever operand is not the designated one goes first. */
   const unsigned other_val = 1 - small_val;

   nir_alu_instr *mul = nir_alu_instr_create(b->shader, new_opcode);

   mul->dest.saturate = imul->dest.saturate;
   mul->dest.write_mask = imul->dest.write_mask;

   nir_alu_src_copy(&mul->src[0], &imul->src[other_val], mul);
   nir_alu_src_copy(&mul->src[1], &imul->src[small_val], mul);

   nir_ssa_dest_init(&mul->instr, &mul->dest.dest,
                     imul->dest.dest.ssa.num_components, 32, NULL);

   /* Point all consumers of the old product at the new one. */
   nir_ssa_def_rewrite_uses(&imul->dest.dest.ssa, &mul->dest.dest.ssa);

   /* The replacement takes the old multiply's place in the instruction
    * stream before the old one is dropped.
    */
   b->cursor = nir_before_instr(&imul->instr);
   nir_builder_instr_insert(b, &mul->instr);

   nir_instr_remove(&imul->instr);
   nir_instr_free(&imul->instr);
}
/**
 * Per-instruction callback: turn a 32-bit \c imul with a small constant
 * operand into a 32x16 multiply.
 *
 * Each of the two sources is examined in order.  For a constant source, the
 * minimum and maximum over all channels read by the instruction are
 * computed.  If every value fits in a signed 16-bit integer the instruction
 * is replaced with \c nir_op_imul_32x16; otherwise, if every value lies in
 * [0, UINT16_MAX], it is replaced with \c nir_op_umul_32x16.  The first
 * source that qualifies is used.
 *
 * \param b        Builder handed to replace_imul_instr().
 * \param instr    Instruction under consideration.
 * \param cb_data  Unused.
 *
 * \return true if \c instr was replaced, false if it was left untouched.
 */
static bool
brw_nir_opt_peephole_imul32x16_instr(nir_builder *b,
                                     nir_instr *instr,
                                     UNUSED void *cb_data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *imul = nir_instr_as_alu(instr);
   if (imul->op != nir_op_imul)
      return false;

   if (imul->dest.dest.ssa.bit_size != 32)
      return false;

   for (unsigned i = 0; i < 2; i++) {
      if (!nir_src_is_const(imul->src[i].src))
         continue;

      int64_t lo = INT64_MAX;
      int64_t hi = INT64_MIN;

      for (unsigned comp = 0; comp < imul->dest.dest.ssa.num_components; comp++) {
         /* nir_src_comp_as_int() indexes the components of the constant
          * itself, so the channel must be translated through the ALU source
          * swizzle.  Using the destination channel directly would inspect
          * the wrong component (or one that does not exist) whenever the
          * operand is swizzled.
          */
         const int64_t v = nir_src_comp_as_int(imul->src[i].src,
                                               imul->src[i].swizzle[comp]);

         if (v < lo)
            lo = v;

         if (v > hi)
            hi = v;
      }

      nir_op new_opcode;

      if (lo >= INT16_MIN && hi <= INT16_MAX) {
         /* Every channel fits in a signed 16-bit value. */
         new_opcode = nir_op_imul_32x16;
      } else if (lo >= 0 && hi <= UINT16_MAX) {
         /* Every channel fits in an unsigned 16-bit value. */
         new_opcode = nir_op_umul_32x16;
      } else {
         /* This constant is too wide; try the other operand, if any. */
         continue;
      }

      replace_imul_instr(b, imul, i, new_opcode);
      return true;
   }

   return false;
}
/**
 * Run the imul -> 32x16 multiply peephole over every instruction in
 * \c shader.
 *
 * The work is delegated to nir_shader_instructions_pass() with
 * brw_nir_opt_peephole_imul32x16_instr() as the per-instruction callback, the
 * block-index and dominance metadata flags, and no callback data.
 *
 * \return whatever nir_shader_instructions_pass() reports.
 */
bool
brw_nir_opt_peephole_imul32x16(nir_shader *shader)
{
   const bool progress =
      nir_shader_instructions_pass(shader,
                                   brw_nir_opt_peephole_imul32x16_instr,
                                   nir_metadata_block_index |
                                   nir_metadata_dominance,
                                   NULL);

   return progress;
}

View File

@@ -98,6 +98,7 @@ libintel_compiler_files = files(
'brw_nir_lower_shading_rate_output.c',
'brw_nir_lower_storage_image.c',
'brw_nir_opt_peephole_ffma.c',
'brw_nir_opt_peephole_imul32x16.c',
'brw_nir_rt.h',
'brw_nir_rt.c',
'brw_nir_rt_builder.h',