intel/brw: Handle 16-bit sampler return payloads

The API requires samplers to return 32-bit values even though the hardware
can handle 16-bit floating point, so we detect that case and make more
efficient use of memory bandwidth. This improves the performance of token
encode and decode in LLM workloads by at least 5% across multiple platforms.

Thank you, Kenneth Graunke, for suggesting this approach and guiding me
throughout the implementation.

Signed-off-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30447>
This commit is contained in:
Sushma Venkatesh Reddy
2024-07-30 23:04:34 -07:00
committed by Marge Bot
parent ddd9e043dc
commit 0116430d39
3 changed files with 27 additions and 9 deletions

View File

@@ -8623,7 +8623,12 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
bool is_simd8_16bit = nir_alu_type_get_type_size(instr->dest_type) == 16
&& bld.dispatch_width() == 8;
brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type),
(is_simd8_16bit ? 8 : 4) + instr->is_sparse);
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
inst->offset = header_bits;
@@ -8635,15 +8640,18 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
if (instr->is_sparse) {
read_size = util_last_bit(write_mask) - 1;
inst->size_written =
read_size * inst->dst.component_size(inst->exec_size) +
(is_simd8_16bit ? 2 : 1) * read_size *
inst->dst.component_size(inst->exec_size) +
(reg_unit(devinfo) * REG_SIZE);
} else {
read_size = util_last_bit(write_mask);
inst->size_written =
read_size * inst->dst.component_size(inst->exec_size);
(is_simd8_16bit ? 2 : 1) * read_size *
inst->dst.component_size(inst->exec_size);
}
} else {
inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
inst->size_written = (is_simd8_16bit ? 2 : 1) * 4 *
inst->dst.component_size(inst->exec_size) +
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
}
@@ -8666,7 +8674,8 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
inst->keep_payload_trailing_zeros = true;
}
if (instr->op != nir_texop_query_levels && !instr->is_sparse) {
if (instr->op != nir_texop_query_levels && !instr->is_sparse
&& !is_simd8_16bit) {
/* In most cases we can write directly to the result. */
inst->dst = nir_def_reg;
} else {
@@ -8675,7 +8684,7 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
*/
brw_reg nir_dest[5];
for (unsigned i = 0; i < read_size; i++)
nir_dest[i] = offset(dst, bld, i);
nir_dest[i] = offset(dst, bld, (is_simd8_16bit ? 2 : 1) * i);
if (instr->op == nir_texop_query_levels) {
/* # levels is in .w */

View File

@@ -1124,13 +1124,16 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
inst->mlen = mlen;
inst->header_size = header_size;
inst->sfid = BRW_SFID_SAMPLER;
uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
? GFX8_SAMPLER_RETURN_FORMAT_16BITS
: GFX8_SAMPLER_RETURN_FORMAT_32BITS;
if (surface.file == IMM &&
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
inst->desc = brw_sampler_desc(devinfo, surface.ud,
sampler.file == IMM ? sampler.ud % 16 : 0,
msg_type,
simd_mode,
0 /* return_format unused on gfx7+ */);
sampler_ret_type);
inst->src[0] = brw_imm_ud(0);
inst->src[1] = brw_imm_ud(0);
} else if (surface_handle.file != BAD_FILE) {
@@ -1140,7 +1143,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
sampler.file == IMM ? sampler.ud % 16 : 0,
msg_type,
simd_mode,
0 /* return_format unused on gfx7+ */);
sampler_ret_type);
/* For bindless samplers, the entire address is included in the message
* header so we can leave the portion in the message descriptor 0.
@@ -1166,7 +1169,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
0, /* sampler */
msg_type,
simd_mode,
0 /* return_format unused on gfx7+ */);
sampler_ret_type);
const fs_builder ubld = bld.group(1, 0).exec_all();
brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
if (surface.equals(sampler)) {

View File

@@ -996,6 +996,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
OPT(nir_lower_alu_to_scalar, NULL, NULL);
struct nir_opt_16bit_tex_image_options options = {
.rounding_mode = nir_rounding_mode_undef,
.opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
};
OPT(nir_opt_16bit_tex_image, &options);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
OPT(nir_lower_gs_intrinsics, 0);