ac/nir: Move ac_nir_lower_mem_access_bit_sizes to separate file.
Also move ac_nir_flag_smem_for_loads along with it.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32966>
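For context, a minimal sketch of how a backend could chain the two passes being moved; the helper function, its name, and the exact pass ordering are illustrative assumptions, only the two ac_nir entry points come from this code:

/* Illustrative sketch only: run_mem_access_lowering and the pass ordering
 * shown here are assumptions, not part of this commit. */
#include "nir.h"
#include "ac_nir.h"

static void
run_mem_access_lowering(nir_shader *nir, enum amd_gfx_level gfx_level, bool use_llvm)
{
   /* Mark uniform, reorderable loads as SMEM candidates first, so the
    * bit-size lowering callback can give them wider vectors. */
   NIR_PASS(_, nir, ac_nir_flag_smem_for_loads, gfx_level, use_llvm,
            false /* after_lowering */);

   /* Then split/widen memory accesses to sizes the hardware supports. */
   NIR_PASS(_, nir, ac_nir_lower_mem_access_bit_sizes, gfx_level, use_llvm);
}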
@@ -1143,134 +1143,6 @@ ac_nir_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer)
   }
}

typedef struct {
   enum amd_gfx_level gfx_level;
   bool use_llvm;
   bool after_lowering;
} mem_access_cb_data;

static bool
use_smem_for_load(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data_)
{
   const mem_access_cb_data *cb_data = (mem_access_cb_data *)cb_data_;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global_amd:
   case nir_intrinsic_load_constant:
      if (cb_data->use_llvm)
         return false;
      break;
   case nir_intrinsic_load_ubo:
      break;
   default:
      return false;
   }

   if (intrin->def.divergent || (cb_data->after_lowering && intrin->def.bit_size < 32))
      return false;

   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
   bool reorder = nir_intrinsic_can_reorder(intrin) || ((access & ACCESS_NON_WRITEABLE) && !(access & ACCESS_VOLATILE));
   if (!reorder || (glc && cb_data->gfx_level < GFX8))
      return false;

   nir_intrinsic_set_access(intrin, access | ACCESS_SMEM_AMD);
   return true;
}

static nir_mem_access_size_align
lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, uint32_t align_mul, uint32_t align_offset,
                    bool offset_is_const, enum gl_access_qualifier access, const void *cb_data_)
{
   const mem_access_cb_data *cb_data = (mem_access_cb_data *)cb_data_;
   const bool is_load = nir_intrinsic_infos[intrin].has_dest;
   const bool is_smem = intrin == nir_intrinsic_load_push_constant || (access & ACCESS_SMEM_AMD);
   const uint32_t combined_align = nir_combined_align(align_mul, align_offset);

   /* Make 8-bit accesses 16-bit if possible */
   if (is_load && bit_size == 8 && combined_align >= 2 && bytes % 2 == 0)
      bit_size = 16;

   unsigned max_components = 4;
   if (cb_data->use_llvm && access & (ACCESS_COHERENT | ACCESS_VOLATILE) &&
       (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_store_global))
      max_components = 1;
   else if (is_smem)
      max_components = MIN2(512 / bit_size, 16);

   nir_mem_access_size_align res;
   res.num_components = MIN2(bytes / (bit_size / 8), max_components);
   res.bit_size = bit_size;
   res.align = MIN2(bit_size / 8, 4); /* 64-bit access only requires 4 byte alignment. */
   res.shift = nir_mem_access_shift_method_shift64;

   if (!is_load)
      return res;

   /* Lower 8/16-bit loads to 32-bit, unless it's a VMEM scalar load. */
   const bool support_subdword = res.num_components == 1 && !is_smem &&
                                 (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);

   if (res.bit_size >= 32 || support_subdword)
      return res;

   const uint32_t max_pad = 4 - MIN2(combined_align, 4);

   /* Global loads don't have bounds checking, so increasing the size might not be safe. */
   if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
      if (align_mul < 4) {
         /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
         const unsigned chunk_bytes = align(bytes, 4) - max_pad;
         if (!is_smem && chunk_bytes < bytes)
            return res;
      }

      res.num_components = DIV_ROUND_UP(bytes, 4);
   } else {
      res.num_components = DIV_ROUND_UP(bytes + max_pad, 4);
   }
   res.num_components = MIN2(res.num_components, max_components);
   res.bit_size = 32;
   res.align = 4;
   res.shift = is_smem ? res.shift : nir_mem_access_shift_method_bytealign_amd;

   return res;
}

bool
ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm, bool after_lowering)
{
   mem_access_cb_data cb_data = {
      .gfx_level = gfx_level,
      .use_llvm = use_llvm,
      .after_lowering = after_lowering,
   };
   return nir_shader_intrinsics_pass(shader, &use_smem_for_load, nir_metadata_all, &cb_data);
}

bool
ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm)
{
   mem_access_cb_data cb_data = {
      .gfx_level = gfx_level,
      .use_llvm = use_llvm,
   };
   nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
      .callback = &lower_mem_access_cb,
      .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
               nir_var_mem_global | nir_var_mem_constant | nir_var_mem_shared |
               nir_var_shader_temp,
      .may_lower_unaligned_stores_to_atomics = false,
      .cb_data = &cb_data,
   };
   return nir_lower_mem_access_bit_sizes(shader, &lower_mem_access_options);
}

bool
ac_nir_optimize_uniform_atomics(nir_shader *nir)
{
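As a reviewing aid, the conditions under which use_smem_for_load flags a load can be summarized as follows (a summary derived from the code above, not new behavior):

/* A load is flagged with ACCESS_SMEM_AMD only when all of these hold:
 *  - it is a UBO load, or an SSBO/global/constant load on a non-LLVM backend,
 *  - its result is not divergent (and, when run after lowering, is at least
 *    32-bit),
 *  - it can be reordered: nir_intrinsic_can_reorder(), or it is
 *    non-writeable and not volatile,
 *  - it is not a coherent/volatile (glc) access when gfx_level < GFX8.
 */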
src/amd/common/ac_nir_lower_mem_access_bit_sizes.c (new file, 138 lines)
@@ -0,0 +1,138 @@
/*
 * Copyright © 2021 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "ac_nir_helpers.h"

#include "nir_builder.h"

typedef struct {
   enum amd_gfx_level gfx_level;
   bool use_llvm;
   bool after_lowering;
} mem_access_cb_data;

static bool
use_smem_for_load(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data_)
{
   const mem_access_cb_data *cb_data = (mem_access_cb_data *)cb_data_;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global_amd:
   case nir_intrinsic_load_constant:
      if (cb_data->use_llvm)
         return false;
      break;
   case nir_intrinsic_load_ubo:
      break;
   default:
      return false;
   }

   if (intrin->def.divergent || (cb_data->after_lowering && intrin->def.bit_size < 32))
      return false;

   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
   bool reorder = nir_intrinsic_can_reorder(intrin) || ((access & ACCESS_NON_WRITEABLE) && !(access & ACCESS_VOLATILE));
   if (!reorder || (glc && cb_data->gfx_level < GFX8))
      return false;

   nir_intrinsic_set_access(intrin, access | ACCESS_SMEM_AMD);
   return true;
}

static nir_mem_access_size_align
lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, uint32_t align_mul, uint32_t align_offset,
                    bool offset_is_const, enum gl_access_qualifier access, const void *cb_data_)
{
   const mem_access_cb_data *cb_data = (mem_access_cb_data *)cb_data_;
   const bool is_load = nir_intrinsic_infos[intrin].has_dest;
   const bool is_smem = intrin == nir_intrinsic_load_push_constant || (access & ACCESS_SMEM_AMD);
   const uint32_t combined_align = nir_combined_align(align_mul, align_offset);

   /* Make 8-bit accesses 16-bit if possible */
   if (is_load && bit_size == 8 && combined_align >= 2 && bytes % 2 == 0)
      bit_size = 16;

   unsigned max_components = 4;
   if (cb_data->use_llvm && access & (ACCESS_COHERENT | ACCESS_VOLATILE) &&
       (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_store_global))
      max_components = 1;
   else if (is_smem)
      max_components = MIN2(512 / bit_size, 16);

   nir_mem_access_size_align res;
   res.num_components = MIN2(bytes / (bit_size / 8), max_components);
   res.bit_size = bit_size;
   res.align = MIN2(bit_size / 8, 4); /* 64-bit access only requires 4 byte alignment. */
   res.shift = nir_mem_access_shift_method_shift64;

   if (!is_load)
      return res;

   /* Lower 8/16-bit loads to 32-bit, unless it's a VMEM scalar load. */
   const bool support_subdword = res.num_components == 1 && !is_smem &&
                                 (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);

   if (res.bit_size >= 32 || support_subdword)
      return res;

   const uint32_t max_pad = 4 - MIN2(combined_align, 4);

   /* Global loads don't have bounds checking, so increasing the size might not be safe. */
   if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
      if (align_mul < 4) {
         /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
         const unsigned chunk_bytes = align(bytes, 4) - max_pad;
         if (!is_smem && chunk_bytes < bytes)
            return res;
      }

      res.num_components = DIV_ROUND_UP(bytes, 4);
   } else {
      res.num_components = DIV_ROUND_UP(bytes + max_pad, 4);
   }
   res.num_components = MIN2(res.num_components, max_components);
   res.bit_size = 32;
   res.align = 4;
   res.shift = is_smem ? res.shift : nir_mem_access_shift_method_bytealign_amd;

   return res;
}

bool
ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm, bool after_lowering)
{
   mem_access_cb_data cb_data = {
      .gfx_level = gfx_level,
      .use_llvm = use_llvm,
      .after_lowering = after_lowering,
   };
   return nir_shader_intrinsics_pass(shader, &use_smem_for_load, nir_metadata_all, &cb_data);
}

bool
ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm)
{
   mem_access_cb_data cb_data = {
      .gfx_level = gfx_level,
      .use_llvm = use_llvm,
   };
   nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
      .callback = &lower_mem_access_cb,
      .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
               nir_var_mem_global | nir_var_mem_constant | nir_var_mem_shared |
               nir_var_shader_temp,
      .may_lower_unaligned_stores_to_atomics = false,
      .cb_data = &cb_data,
   };
   return nir_lower_mem_access_bit_sizes(shader, &lower_mem_access_options);
}
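As a worked example of the padding math in lower_mem_access_cb (the load parameters below are hypothetical, chosen for illustration, not taken from this diff):

/* Hypothetical input: a 6-byte load_ssbo of 8-bit values with align_mul = 4,
 * align_offset = 1 and no ACCESS_SMEM_AMD flag (VMEM path).
 *
 *   combined_align = 1               -> the 8- to 16-bit promotion does not apply
 *   max_pad        = 4 - MIN2(1, 4)  = 3 bytes of slack in front of the data
 *   num_components = DIV_ROUND_UP(6 + 3, 4) = 3
 *
 * The callback therefore requests a 3 x 32-bit load at 4-byte alignment with
 * nir_mem_access_shift_method_bytealign_amd, and nir_lower_mem_access_bit_sizes
 * extracts the original six bytes from that dword load.
 */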
src/amd/common/meson.build
@@ -94,6 +94,7 @@ amd_common_files = files(
  'ac_nir_lower_intrinsics_to_args.c',
  'ac_nir_lower_legacy_gs.c',
  'ac_nir_lower_legacy_vs.c',
  'ac_nir_lower_mem_access_bit_sizes.c',
  'ac_nir_lower_resinfo.c',
  'ac_nir_lower_taskmesh_io_to_mem.c',
  'ac_nir_lower_tess_io_to_mem.c',