radv,aco: allow unaligned LDS access on GFX9+
fossil-db (GFX10.3):
Totals from 223 (0.16% of 139391) affected shaders:
SGPRs: 10032 -> 10096 (+0.64%)
VGPRs: 7480 -> 7592 (+1.50%)
CodeSize: 853960 -> 821920 (-3.75%); split: -3.76%, +0.01%
MaxWaves: 5916 -> 5908 (-0.14%)
Instrs: 154935 -> 150281 (-3.00%); split: -3.01%, +0.01%
Cycles: 3202496 -> 3080680 (-3.80%); split: -3.81%, +0.00%
VMEM: 48187 -> 46671 (-3.15%); split: +0.29%, -3.44%
SMEM: 13869 -> 13850 (-0.14%); split: +1.52%, -1.66%
VClause: 3110 -> 3085 (-0.80%); split: -1.03%, +0.23%
SClause: 4376 -> 4381 (+0.11%)
Copies: 12132 -> 12065 (-0.55%); split: -2.61%, +2.06%
Branches: 5204 -> 5203 (-0.02%)
PreVGPRs: 6304 -> 6359 (+0.87%); split: -0.10%, +0.97%

See https://reviews.llvm.org/D82788

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8762>
This commit is contained in:
@@ -3466,31 +3466,35 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
|
|||||||
bool large_ds_read = bld.program->chip_class >= GFX7;
|
bool large_ds_read = bld.program->chip_class >= GFX7;
|
||||||
bool usable_read2 = bld.program->chip_class >= GFX7;
|
bool usable_read2 = bld.program->chip_class >= GFX7;
|
||||||
|
|
||||||
|
bool aligned2 = align % 2 == 0;
|
||||||
|
bool aligned4 = align % 4 == 0;
|
||||||
|
bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 8 == 0);
|
||||||
|
bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 16 == 0);
|
||||||
|
|
||||||
bool read2 = false;
|
bool read2 = false;
|
||||||
unsigned size = 0;
|
unsigned size = 0;
|
||||||
aco_opcode op;
|
aco_opcode op;
|
||||||
//TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
|
if (bytes_needed >= 16 && aligned16 && large_ds_read) {
|
||||||
if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
|
|
||||||
size = 16;
|
size = 16;
|
||||||
op = aco_opcode::ds_read_b128;
|
op = aco_opcode::ds_read_b128;
|
||||||
} else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
|
} else if (bytes_needed >= 16 && aligned8 && const_offset % 8 == 0 && usable_read2) {
|
||||||
size = 16;
|
size = 16;
|
||||||
read2 = true;
|
read2 = true;
|
||||||
op = aco_opcode::ds_read2_b64;
|
op = aco_opcode::ds_read2_b64;
|
||||||
} else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
|
} else if (bytes_needed >= 12 && aligned16 && large_ds_read) {
|
||||||
size = 12;
|
size = 12;
|
||||||
op = aco_opcode::ds_read_b96;
|
op = aco_opcode::ds_read_b96;
|
||||||
} else if (bytes_needed >= 8 && align % 8 == 0) {
|
} else if (bytes_needed >= 8 && aligned8) {
|
||||||
size = 8;
|
size = 8;
|
||||||
op = aco_opcode::ds_read_b64;
|
op = aco_opcode::ds_read_b64;
|
||||||
} else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
|
} else if (bytes_needed >= 8 && aligned4 && const_offset % 4 == 0) {
|
||||||
size = 8;
|
size = 8;
|
||||||
read2 = true;
|
read2 = true;
|
||||||
op = aco_opcode::ds_read2_b32;
|
op = aco_opcode::ds_read2_b32;
|
||||||
} else if (bytes_needed >= 4 && align % 4 == 0) {
|
} else if (bytes_needed >= 4 && aligned4) {
|
||||||
size = 4;
|
size = 4;
|
||||||
op = aco_opcode::ds_read_b32;
|
op = aco_opcode::ds_read_b32;
|
||||||
} else if (bytes_needed >= 2 && align % 2 == 0) {
|
} else if (bytes_needed >= 2 && aligned2) {
|
||||||
size = 2;
|
size = 2;
|
||||||
op = aco_opcode::ds_read_u16;
|
op = aco_opcode::ds_read_u16;
|
||||||
} else {
|
} else {
|
||||||
@@ -3854,8 +3858,8 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t
|
|||||||
|
|
||||||
bool aligned2 = offset % 2 == 0 && align % 2 == 0;
|
bool aligned2 = offset % 2 == 0 && align % 2 == 0;
|
||||||
bool aligned4 = offset % 4 == 0 && align % 4 == 0;
|
bool aligned4 = offset % 4 == 0 && align % 4 == 0;
|
||||||
bool aligned8 = offset % 8 == 0 && align % 8 == 0;
|
bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 8 == 0 && align % 8 == 0);
|
||||||
bool aligned16 = offset % 16 == 0 && align % 16 == 0;
|
bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 16 == 0 && align % 16 == 0);
|
||||||
|
|
||||||
//TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
|
//TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
|
||||||
aco_opcode op = aco_opcode::num_opcodes;
|
aco_opcode op = aco_opcode::num_opcodes;
|
||||||
|
@@ -98,6 +98,10 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||||||
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
|
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
|
||||||
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
|
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
|
||||||
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
||||||
|
/* GFX10 WGP has a bug making naturally aligned access required. The LLVM
|
||||||
|
* subtarget feature is called "FeatureLdsMisalignedBug".
|
||||||
|
*/
|
||||||
|
program->dev.has_unaligned_lds_access = chip_class >= GFX9 && !(chip_class == GFX10 && wgp_mode);
|
||||||
|
|
||||||
program->dev.vgpr_limit = 256;
|
program->dev.vgpr_limit = 256;
|
||||||
program->dev.physical_vgprs = 256;
|
program->dev.physical_vgprs = 256;
|
||||||
|
@@ -1799,6 +1799,7 @@ struct DeviceInfo {
|
|||||||
uint16_t lds_alloc_granule;
|
uint16_t lds_alloc_granule;
|
||||||
uint32_t lds_limit; /* in bytes */
|
uint32_t lds_limit; /* in bytes */
|
||||||
bool has_16bank_lds;
|
bool has_16bank_lds;
|
||||||
|
bool has_unaligned_lds_access;
|
||||||
uint16_t physical_sgprs;
|
uint16_t physical_sgprs;
|
||||||
uint16_t physical_vgprs;
|
uint16_t physical_vgprs;
|
||||||
uint16_t vgpr_limit;
|
uint16_t vgpr_limit;
|
||||||
|
@@ -3053,6 +3053,9 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
|
|||||||
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
|
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
|
||||||
void *data)
|
void *data)
|
||||||
{
|
{
|
||||||
|
struct radv_device *device = data;
|
||||||
|
enum chip_class chip = device->physical_device->rad_info.chip_class;
|
||||||
|
|
||||||
if (num_components > 4)
|
if (num_components > 4)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@@ -3081,9 +3084,9 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
|
|||||||
FALLTHROUGH;
|
FALLTHROUGH;
|
||||||
case nir_intrinsic_load_shared:
|
case nir_intrinsic_load_shared:
|
||||||
case nir_intrinsic_store_shared:
|
case nir_intrinsic_store_shared:
|
||||||
if (bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment and are split otherwise */
|
if (chip < GFX9 && bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment on GFX6-8 and are split otherwise */
|
||||||
return align % 16 == 0;
|
return align % 16 == 0;
|
||||||
else if (bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment and are split otherwise */
|
else if (chip < GFX9 && bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment on GFX6-8 and are split otherwise */
|
||||||
return align % 8 == 0;
|
return align % 8 == 0;
|
||||||
else
|
else
|
||||||
return align % (bit_size == 8 ? 2 : 4) == 0;
|
return align % (bit_size == 8 ? 2 : 4) == 0;
|
||||||
@@ -3330,6 +3333,7 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
|
|||||||
nir_var_mem_push_const | nir_var_mem_shared |
|
nir_var_mem_push_const | nir_var_mem_shared |
|
||||||
nir_var_mem_global,
|
nir_var_mem_global,
|
||||||
.callback = mem_vectorize_callback,
|
.callback = mem_vectorize_callback,
|
||||||
|
.cb_data = device,
|
||||||
.robust_modes = 0,
|
.robust_modes = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user