aco: rework vertex fetching a bit
This will make it easier to skip unused channels at the start and to split unaligned loads on GFX10.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3086>
This commit is contained in:
@@ -472,6 +472,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
|
|||||||
("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]),
|
("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]),
|
||||||
("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]),
|
("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]),
|
||||||
("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]),
|
("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]),
|
||||||
|
("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]),
|
||||||
("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes?
|
("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes?
|
||||||
("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
|
("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
|
||||||
("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
|
("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
|
||||||
|
@@ -3066,32 +3066,59 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned get_num_channels_from_data_format(unsigned data_format)
|
bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
|
||||||
|
unsigned offset, unsigned stride, unsigned channels)
|
||||||
{
|
{
|
||||||
switch (data_format) {
|
unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
|
||||||
case V_008F0C_BUF_DATA_FORMAT_8:
|
if (vtx_info->chan_byte_size != 4 && channels == 3)
|
||||||
case V_008F0C_BUF_DATA_FORMAT_16:
|
return false;
|
||||||
case V_008F0C_BUF_DATA_FORMAT_32:
|
return true;
|
||||||
return 1;
|
}
|
||||||
case V_008F0C_BUF_DATA_FORMAT_8_8:
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_16_16:
|
uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
|
||||||
case V_008F0C_BUF_DATA_FORMAT_32_32:
|
unsigned offset, unsigned stride, unsigned *channels)
|
||||||
return 2;
|
{
|
||||||
case V_008F0C_BUF_DATA_FORMAT_10_11_11:
|
if (!vtx_info->chan_byte_size) {
|
||||||
case V_008F0C_BUF_DATA_FORMAT_11_11_10:
|
*channels = vtx_info->num_channels;
|
||||||
case V_008F0C_BUF_DATA_FORMAT_32_32_32:
|
return vtx_info->chan_format;
|
||||||
return 3;
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
|
|
||||||
case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
|
|
||||||
return 4;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 4;
|
unsigned num_channels = *channels;
|
||||||
|
if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
|
||||||
|
unsigned new_channels = num_channels + 1;
|
||||||
|
/* first, assume more loads is worse and try using a larger data format */
|
||||||
|
while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
|
||||||
|
new_channels++;
|
||||||
|
/* don't make the attribute potentially out-of-bounds */
|
||||||
|
if (offset + new_channels * vtx_info->chan_byte_size > stride)
|
||||||
|
new_channels = 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (new_channels == 5) {
|
||||||
|
/* then try decreasing load size (at the cost of more loads) */
|
||||||
|
new_channels = *channels;
|
||||||
|
while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
|
||||||
|
new_channels--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (new_channels < *channels)
|
||||||
|
*channels = new_channels;
|
||||||
|
num_channels = new_channels;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (vtx_info->chan_format) {
|
||||||
|
case V_008F0C_BUF_DATA_FORMAT_8:
|
||||||
|
return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
|
||||||
|
V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
|
||||||
|
case V_008F0C_BUF_DATA_FORMAT_16:
|
||||||
|
return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
|
||||||
|
V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
|
||||||
|
case V_008F0C_BUF_DATA_FORMAT_32:
|
||||||
|
return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
|
||||||
|
V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
|
||||||
|
}
|
||||||
|
unreachable("shouldn't reach here");
|
||||||
|
return V_008F0C_BUF_DATA_FORMAT_INVALID;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
|
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
|
||||||
@@ -3148,11 +3175,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
|
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
|
||||||
|
|
||||||
unsigned dfmt = attrib_format & 0xf;
|
unsigned dfmt = attrib_format & 0xf;
|
||||||
|
|
||||||
unsigned nfmt = (attrib_format >> 4) & 0x7;
|
unsigned nfmt = (attrib_format >> 4) & 0x7;
|
||||||
unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
|
const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
|
||||||
|
|
||||||
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
|
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
|
||||||
unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
|
unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
|
||||||
unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
|
unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
|
||||||
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
|
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
|
||||||
if (post_shuffle)
|
if (post_shuffle)
|
||||||
@@ -3183,53 +3210,74 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
get_arg(ctx, ctx->args->ac.vertex_id));
|
get_arg(ctx, ctx->args->ac.vertex_id));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (attrib_stride != 0 && attrib_offset > attrib_stride) {
|
Temp channels[num_channels];
|
||||||
index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
|
unsigned channel_start = 0;
|
||||||
attrib_offset = attrib_offset % attrib_stride;
|
bool direct_fetch = false;
|
||||||
|
|
||||||
|
/* load channels */
|
||||||
|
while (channel_start < num_channels) {
|
||||||
|
unsigned fetch_size = num_channels - channel_start;
|
||||||
|
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
|
||||||
|
unsigned fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
|
||||||
|
|
||||||
|
Temp fetch_index = index;
|
||||||
|
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
|
||||||
|
fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
|
||||||
|
fetch_offset = fetch_offset % attrib_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
Operand soffset(0u);
|
||||||
|
if (fetch_offset >= 4096) {
|
||||||
|
soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
|
||||||
|
fetch_offset %= 4096;
|
||||||
|
}
|
||||||
|
|
||||||
|
aco_opcode opcode;
|
||||||
|
switch (fetch_size) {
|
||||||
|
case 1:
|
||||||
|
opcode = aco_opcode::tbuffer_load_format_x;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
opcode = aco_opcode::tbuffer_load_format_xy;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
opcode = aco_opcode::tbuffer_load_format_xyz;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
opcode = aco_opcode::tbuffer_load_format_xyzw;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
unreachable("Unimplemented load_input vector size");
|
||||||
|
}
|
||||||
|
|
||||||
|
Temp fetch_dst;
|
||||||
|
if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
|
||||||
|
(alpha_adjust == RADV_ALPHA_ADJUST_NONE || num_channels <= 3)) {
|
||||||
|
direct_fetch = true;
|
||||||
|
fetch_dst = dst;
|
||||||
|
} else {
|
||||||
|
fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
Instruction *mtbuf = bld.mtbuf(opcode,
|
||||||
|
Definition(fetch_dst), fetch_index, list, soffset,
|
||||||
|
fetch_dfmt, nfmt, fetch_offset,
|
||||||
|
false, true).instr;
|
||||||
|
static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
|
||||||
|
|
||||||
|
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
|
||||||
|
|
||||||
|
if (fetch_size == 1) {
|
||||||
|
channels[channel_start] = fetch_dst;
|
||||||
|
} else {
|
||||||
|
for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
|
||||||
|
channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
|
||||||
|
}
|
||||||
|
|
||||||
|
channel_start += fetch_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
Operand soffset(0u);
|
if (!direct_fetch) {
|
||||||
if (attrib_offset >= 4096) {
|
|
||||||
soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
|
|
||||||
attrib_offset = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
aco_opcode opcode;
|
|
||||||
switch (num_channels) {
|
|
||||||
case 1:
|
|
||||||
opcode = aco_opcode::tbuffer_load_format_x;
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
opcode = aco_opcode::tbuffer_load_format_xy;
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
opcode = aco_opcode::tbuffer_load_format_xyz;
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
opcode = aco_opcode::tbuffer_load_format_xyzw;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
unreachable("Unimplemented load_input vector size");
|
|
||||||
}
|
|
||||||
|
|
||||||
Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
|
|
||||||
|
|
||||||
aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
|
|
||||||
mubuf->operands[0] = Operand(index);
|
|
||||||
mubuf->operands[1] = Operand(list);
|
|
||||||
mubuf->operands[2] = soffset;
|
|
||||||
mubuf->definitions[0] = Definition(tmp);
|
|
||||||
mubuf->idxen = true;
|
|
||||||
mubuf->can_reorder = true;
|
|
||||||
mubuf->dfmt = dfmt;
|
|
||||||
mubuf->nfmt = nfmt;
|
|
||||||
assert(attrib_offset < 4096);
|
|
||||||
mubuf->offset = attrib_offset;
|
|
||||||
ctx->block->instructions.emplace_back(std::move(mubuf));
|
|
||||||
|
|
||||||
emit_split_vector(ctx, tmp, tmp.size());
|
|
||||||
|
|
||||||
if (tmp.id() != dst.id()) {
|
|
||||||
bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
|
bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
|
||||||
nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
|
nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
|
||||||
|
|
||||||
@@ -3238,13 +3286,18 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
|
const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
|
||||||
|
|
||||||
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
|
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
|
||||||
|
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
|
||||||
|
unsigned num_temp = 0;
|
||||||
for (unsigned i = 0; i < dst.size(); i++) {
|
for (unsigned i = 0; i < dst.size(); i++) {
|
||||||
unsigned idx = i + component;
|
unsigned idx = i + component;
|
||||||
if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
|
if (idx < num_channels && channels[swizzle[idx]].id()) {
|
||||||
Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
|
Temp channel = channels[swizzle[idx]];
|
||||||
vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
|
if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
|
||||||
} else if (idx < num_channels) {
|
channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
|
||||||
vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
|
vec->operands[i] = Operand(channel);
|
||||||
|
|
||||||
|
num_temp++;
|
||||||
|
elems[i] = channel;
|
||||||
} else if (is_float && idx == 3) {
|
} else if (is_float && idx == 3) {
|
||||||
vec->operands[i] = Operand(0x3f800000u);
|
vec->operands[i] = Operand(0x3f800000u);
|
||||||
} else if (!is_float && idx == 3) {
|
} else if (!is_float && idx == 3) {
|
||||||
@@ -3256,8 +3309,10 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
vec->definitions[0] = Definition(dst);
|
vec->definitions[0] = Definition(dst);
|
||||||
ctx->block->instructions.emplace_back(std::move(vec));
|
ctx->block->instructions.emplace_back(std::move(vec));
|
||||||
emit_split_vector(ctx, dst, dst.size());
|
emit_split_vector(ctx, dst, dst.size());
|
||||||
}
|
|
||||||
|
|
||||||
|
if (num_temp == dst.size())
|
||||||
|
ctx->allocated_vec.emplace(dst.id(), elems);
|
||||||
|
}
|
||||||
} else if (ctx->stage == fragment_fs) {
|
} else if (ctx->stage == fragment_fs) {
|
||||||
nir_instr *off_instr = instr->src[0].ssa->parent_instr;
|
nir_instr *off_instr = instr->src[0].ssa->parent_instr;
|
||||||
if (off_instr->type != nir_instr_type_load_const ||
|
if (off_instr->type != nir_instr_type_load_const ||
|
||||||
|
@@ -77,7 +77,6 @@ class Format(Enum):
|
|||||||
elif self == Format.MTBUF:
|
elif self == Format.MTBUF:
|
||||||
return [('unsigned', 'dfmt', None),
|
return [('unsigned', 'dfmt', None),
|
||||||
('unsigned', 'nfmt', None),
|
('unsigned', 'nfmt', None),
|
||||||
('unsigned', 'img_format', None),
|
|
||||||
('unsigned', 'offset', None),
|
('unsigned', 'offset', None),
|
||||||
('bool', 'offen', None),
|
('bool', 'offen', None),
|
||||||
('bool', 'idxen', 'false'),
|
('bool', 'idxen', 'false'),
|
||||||
@@ -85,8 +84,7 @@ class Format(Enum):
|
|||||||
('bool', 'glc', 'false'),
|
('bool', 'glc', 'false'),
|
||||||
('bool', 'dlc', 'false'),
|
('bool', 'dlc', 'false'),
|
||||||
('bool', 'slc', 'false'),
|
('bool', 'slc', 'false'),
|
||||||
('bool', 'tfe', 'false'),
|
('bool', 'tfe', 'false')]
|
||||||
('bool', 'lds', 'false')]
|
|
||||||
elif self == Format.MUBUF:
|
elif self == Format.MUBUF:
|
||||||
return [('unsigned', 'offset', None),
|
return [('unsigned', 'offset', None),
|
||||||
('bool', 'offen', None),
|
('bool', 'offen', None),
|
||||||
|
Reference in New Issue
Block a user