aco: be more careful combining additions that could wrap into loads/stores

SMEM does the addition in 64 bits, not 32, so if the original code relied on
32-bit wrap-around (for example, to implement subtraction), folding the
addition into the instruction would break it.
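
A minimal standalone sketch of the problem (not code from this change, just
32-bit vs. 64-bit arithmetic): a constant that only makes sense because the
32-bit add wraps produces a different address once it is folded into an
offset that the hardware sums in 64 bits.

#include <cstdint>
#include <cstdio>

int main()
{
   /* A subtraction written as a wrapping 32-bit addition:
    * base + 0xFFFFFFFC == base - 4 (mod 2^32). */
   uint32_t base = 16;
   uint32_t wrapped = base + 0xFFFFFFFCu;                    /* 12 */

   /* Folding the constant into an offset that SMEM adds in 64 bits
    * skips the wrap and yields a completely different address. */
   uint64_t folded = (uint64_t)base + (uint64_t)0xFFFFFFFCu; /* 0x10000000C */

   printf("32-bit add: %u, 64-bit add: %llu\n",
          wrapped, (unsigned long long)folded);
   return 0;
}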

Apparently swizzled MUBUF accesses also have issues with combining
additions that could overflow. Normal MUBUF accesses seem fine.

fossil-db (Navi):
Totals from 27219 (20.02% of 135946) affected shaders:
CodeSize: 128303256 -> 131062756 (+2.15%); split: -0.00%, +2.15%
Instrs: 24818911 -> 25280558 (+1.86%); split: -0.01%, +1.87%
VMEM: 162311926 -> 177226874 (+9.19%); split: +9.36%, -0.17%
SMEM: 18182559 -> 20218734 (+11.20%); split: +11.53%, -0.34%
VClause: 423635 -> 424398 (+0.18%); split: -0.02%, +0.20%
SClause: 865384 -> 1104986 (+27.69%); split: -0.00%, +27.69%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2748
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2720>

Author: Rhys Perry
Authored: 2019-10-15 17:00:55 +01:00
Committed by: Marge Bot
Parent: 4d0e06257a
Commit: d169f09e37
10 changed files with 41 additions and 36 deletions

@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
 dEQP-VK.rasterization.flatshading.line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_lines_wide

@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
 dEQP-VK.rasterization.flatshading.line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_lines_wide

@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
 dEQP-VK.rasterization.flatshading.line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_lines_wide

@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
 dEQP-VK.rasterization.flatshading.line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
 dEQP-VK.rasterization.flatshading.non_strict_lines_wide

@@ -552,8 +552,9 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
    ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])),
    ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]),
    ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])]
+formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
 %>\\
-% for name, formats, struct, shapes in formats:
+% for name, formats, struct, shapes, extra_field_setup in formats:
 % for num_definitions, num_operands in shapes:
 <%
    args = ['aco_opcode opcode']
@@ -581,6 +582,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
 % endfor
    ${f.get_builder_initialization(num_operands)}
 % endfor
+      ${extra_field_setup}
       return insert(instr);
    }

@@ -3539,6 +3539,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
    mubuf->barrier = info->barrier;
    mubuf->can_reorder = info->can_reorder;
    mubuf->offset = const_offset;
+   mubuf->swizzled = info->swizzle_component_size != 0;
    RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
    mubuf->definitions[0] = Definition(val);
@@ -3990,7 +3991,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un
 }
 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
-                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
+                             bool swizzled = false)
 {
    assert(vdata.id());
    assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
@@ -4003,8 +4005,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
    Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
-                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
-                                 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+                                 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+                                 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+                                 /* dlc*/ false, /* slc */ slc);
    static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
 }
@@ -4026,7 +4029,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
    for (unsigned i = 0; i < write_count; i++) {
       unsigned const_offset = offsets[i] + base_const_offset;
-      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
    }
 }
@@ -4838,7 +4841,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
       if (use_mubuf) {
          Instruction *mubuf = bld.mubuf(opcode,
                                         Definition(fetch_dst), list, fetch_index, soffset,
-                                        fetch_offset, false, true).instr;
+                                        fetch_offset, false, false, true).instr;
          static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
       } else {
          Instruction *mtbuf = bld.mtbuf(opcode,
@@ -6889,7 +6892,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
    for (unsigned i = 0; i < write_count; i++) {
       aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
    }
 }
@@ -10341,8 +10344,8 @@ static void write_tcs_tess_factors(isel_context *ctx)
       Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
       bld.mubuf(aco_opcode::buffer_store_dword,
                 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
-                /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
-                /* disable_wqm */ false, /* glc */ true);
+                /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
+                /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
       tf_const_offset += 4;
       begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);

@@ -1078,7 +1078,8 @@ struct MUBUF_instruction : public Instruction {
    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
    bool can_reorder : 1;
-   uint8_t padding : 2;
+   bool swizzled : 1;
+   uint8_t padding : 1;
    barrier_interaction barrier;
 };
 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

@@ -88,6 +88,7 @@ class Format(Enum):
       elif self == Format.MUBUF:
          return [('unsigned', 'offset', None),
                  ('bool', 'offen', None),
+                 ('bool', 'swizzled', 'false'),
                  ('bool', 'idxen', 'false'),
                  ('bool', 'addr64', 'false'),
                  ('bool', 'disable_wqm', 'false'),

@@ -715,8 +715,11 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
    return true;
 }
-bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
+bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
 {
+   if (prevent_overflow)
+      return false; //TODO
    Operand op = instr->operands[op_index];
    if (!op.isTemp())
@@ -754,7 +757,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp
          continue;
       uint32_t offset2 = 0;
-      if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) {
+      if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
          *offset += offset2;
       } else {
          *base = add_instr->operands[!i].getTemp();
@@ -927,6 +930,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
          while (info.is_temp())
             info = ctx.info[info.temp.id()];
+         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
+          * overflow for scratch accesses works only on GFX9+ and saddr overflow
+          * never works. Since swizzling is the only thing that separates
+          * scratch accesses and other accesses and swizzling changing how
+          * addressing works significantly, this probably applies to swizzled
+          * MUBUF accesses. */
+         bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
+         bool saddr_prevent_overflow = mubuf->swizzled;
          if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1] = Operand(v1);
@@ -937,12 +949,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
            instr->operands[2] = Operand((uint32_t) 0);
            mubuf->offset += info.val;
            continue;
-         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
+         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
+                    base.regClass() == v1 && mubuf->offset + offset < 4096) {
            assert(!mubuf->idxen);
            instr->operands[1].setTemp(base);
            mubuf->offset += offset;
            continue;
-         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
+         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
+                    base.regClass() == s1 && mubuf->offset + offset < 4096) {
            instr->operands[i].setTemp(base);
            mubuf->offset += offset;
            continue;
@@ -957,7 +971,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
         uint32_t offset;
         bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
         if (has_usable_ds_offset &&
-             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
+             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
             base.regClass() == instr->operands[i].regClass() &&
             instr->opcode != aco_opcode::ds_swizzle_b32) {
            if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
@@ -993,7 +1007,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
                    (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
            instr->operands[i] = Operand(info.val);
            continue;
-         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
+         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
            bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
            if (soe &&
                (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
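
As a quick reference, a condensed sketch of the overflow-prevention policy
expressed by the parse_base_offset() calls above (hypothetical helper names,
not code from this patch): SMEM folds always pass prevent_overflow = true,
DS folds pass false, and MUBUF folds depend on swizzling and the chip
generation.

#include <cstdio>

/* Illustrative stand-in for ACO's chip_class enum; values are arbitrary. */
enum chip_class_model { GFX6 = 6, GFX7, GFX8, GFX9, GFX10 };

/* vaddr folds are only blocked for swizzled accesses on pre-GFX9 parts. */
bool mubuf_vaddr_prevent_overflow(bool swizzled, chip_class_model chip)
{
   return swizzled && chip < GFX9;
}

/* saddr folds are blocked for swizzled accesses on every generation. */
bool mubuf_saddr_prevent_overflow(bool swizzled)
{
   return swizzled;
}

int main()
{
   printf("swizzled vaddr fold blocked on GFX8:  %d\n",
          mubuf_vaddr_prevent_overflow(true, GFX8));   /* 1 */
   printf("swizzled vaddr fold blocked on GFX10: %d\n",
          mubuf_vaddr_prevent_overflow(true, GFX10));  /* 0 */
   printf("swizzled saddr fold blocked:          %d\n",
          mubuf_saddr_prevent_overflow(true));         /* 1 */
   return 0;
}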

@@ -1566,9 +1566,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
                  split->definitions[i] = bld.def(v1);
               bld.insert(split);
               for (unsigned i = 0; i < temp.size(); i++)
-                 bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
+                 bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
            } else {
-              bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false);
+              bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
            }
         } else {
            ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
@@ -1632,11 +1632,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
               for (unsigned i = 0; i < def.size(); i++) {
                  Temp tmp = bld.tmp(v1);
                  vec->operands[i] = Operand(tmp);
-                 bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false);
+                 bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
               }
               bld.insert(vec);
            } else {
-              bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false);
+              bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
            }
         } else {
            uint32_t spill_slot = slots[spill_id];