amd: fix typos
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22432>

Committed by: Marge Bot
Parent: 1aab6820fb
Commit: aea48a4ff1
@@ -309,7 +309,7 @@ void ac_get_hs_info(struct radeon_info *info,
 * store the task payload which is passed to mesh shaders.
 *
 * The driver only needs to create this BO once,
-* and it will always be able to accomodate the maximum needed
+* and it will always be able to accommodate the maximum needed
 * task payload size.
 *
 * The following memory layout is used:
@@ -241,7 +241,7 @@ cull_small_primitive_line(nir_builder *b, nir_ssa_def *pos[3][4],
 * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
 * because it doesn't enter any diamond and thus can't exit any diamond.
 *
-* The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding
+* The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
 * box test is used to determine whether a line is entirely inside any square (diamond).
 *
 * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
@@ -264,7 +264,7 @@ cull_small_primitive_line(nir_builder *b, nir_ssa_def *pos[3][4],
 v1[chan] = nir_ffma(b, pos[1][chan], vp_scale, vp_translate);
 }
 
-/* Rotate the viewport by 45 degress, so that diamonds become squares. */
+/* Rotate the viewport by 45 degrees, so that diamonds become squares. */
 rotate_45degrees(b, v0);
 rotate_45degrees(b, v1);
 
@@ -62,7 +62,7 @@ emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, ni
 unsigned full_dwords = total_bytes / 4u;
 unsigned remaining_bytes = total_bytes - full_dwords * 4u;
 
-/* Accomodate max number of split 64-bit loads */
+/* Accommodate max number of split 64-bit loads */
 nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];
 
 /* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
@@ -1494,7 +1494,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
 /* Run culling algorithms if culling is enabled.
 *
 * NGG culling can be enabled or disabled in runtime.
-* This is determined by a SGPR shader argument which is acccessed
+* This is determined by a SGPR shader argument which is accessed
 * by the following NIR intrinsic.
 */
 
@@ -3233,7 +3233,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
 
 /* We want to export primitives to streamout buffer in sequence,
 * but not all vertices are alive or mark end of a primitive, so
-* there're "holes". We don't need continous invocations to write
+* there're "holes". We don't need continuous invocations to write
 * primitives to streamout buffer like final vertex export, so
 * just repack to get the sequence (export_seq) is enough, no need
 * to do compaction.
@@ -4018,7 +4018,7 @@ ms_emit_arrayed_outputs(nir_builder *b,
 nir_ssa_def *zero = nir_imm_int(b, 0);
 
 u_foreach_bit64(slot, mask) {
-/* Should not occour here, handled separately. */
+/* Should not occur here, handled separately. */
 assert(slot != VARYING_SLOT_PRIMITIVE_COUNT && slot != VARYING_SLOT_PRIMITIVE_INDICES);
 
 unsigned component_mask = s->output_info[slot].components_mask;
@@ -151,8 +151,8 @@ emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s)
 
 uint64_t outputs_written = b->shader->info.outputs_written;
 /* use outputs_written to determine export format as we use it to set
-* R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store ouput,
-* because store ouput may be optimized out.
+* R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store output,
+* because store output may be optimized out.
 */
 unsigned format =
 ac_get_spi_shader_z_format(outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH),
@@ -352,7 +352,7 @@ emit_ps_color_export(nir_builder *b, lower_ps_state *s, gl_frag_result slot, uns
 pack_op = nir_op_pack_snorm_2x16;
 break;
 default:
-unreachable("unsupport color export format");
+unreachable("unsupported color export format");
 break;
 }
 
@@ -192,7 +192,7 @@ static void ac_sqtt_fill_cpu_info(struct sqtt_file_chunk_cpu_info *chunk)
 if (os_get_total_physical_memory(&system_ram_size))
 chunk->system_ram_size = system_ram_size / (1024 * 1024);
 
-/* Parse cpuinfo to get more detailled information. */
+/* Parse cpuinfo to get more detailed information. */
 f = fopen("/proc/cpuinfo", "r");
 if (!f)
 return;
@@ -364,7 +364,7 @@ unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsi
 // Use the regularity properties of the combined format enum.
 //
 // Note: float is incompatible with 8-bit data formats,
-// [us]{norm,scaled} are incomparible with 32-bit data formats.
+// [us]{norm,scaled} are incompatible with 32-bit data formats.
 // [us]scaled are not writable.
 switch (nfmt) {
 case V_008F0C_BUF_NUM_FORMAT_UNORM:
@@ -4073,7 +4073,7 @@ void ac_check_shadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family fam
 unsigned end_reg_offset = reg_offset + count * 4;
 unsigned end_range_offset = ranges[i].offset + ranges[i].size;
 
-/* Test if the ranges interect. */
+/* Test if the ranges intersect. */
 if (MAX2(ranges[i].offset, reg_offset) < MIN2(end_range_offset, end_reg_offset)) {
 /* Assertion: A register can be listed only once. */
 assert(!found);
@@ -793,7 +793,7 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *
 if (ret == ADDR_OK) {
 /* If the DCC memory isn't properly
 * aligned, the data are interleaved
-* accross slices.
+* across slices.
 */
 if (AddrDccOut->dccRamSizeAligned)
 dcc_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
@@ -327,7 +327,7 @@ Waiting for the VMEM/DS instruction to finish, a VALU or export instruction, or
 ### VALUTransUseHazard
 
 Triggered by:
-A VALU instrction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
+A VALU instruction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
 transcendental instructions in-between.
 
 Mitigated by:
@@ -105,7 +105,7 @@ This means that we need to insert `s_waitcnt` instructions (and its variants) so
 #### Resolve hazards and insert NOPs
 
 Some instructions require wait states or other instructions to resolve hazards which are not handled by the hardware.
-This pass makes sure that no known hazards occour.
+This pass makes sure that no known hazards occur.
 
 #### Emit program - Assembler
 
@@ -118,7 +118,7 @@ Which software stage gets executed on which hardware stage depends on what kind
 
 An important difference is that VS is always the first stage to run in SW models,
 whereas HW VS refers to the last HW stage before fragment shading in GCN/RDNA terminology.
-That's why, among other things, the HW VS is no longer used to execute the SW VS when tesselation or geometry shading are used.
+That's why, among other things, the HW VS is no longer used to execute the SW VS when tessellation or geometry shading are used.
 
 #### Glossary of software stages
 
@@ -786,7 +786,7 @@ get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
 
 /* extract a full dword if possible */
 if (tmp.bytes() >= (dword + 1) * 4) {
-/* if the source is splitted into components, use p_create_vector */
+/* if the source is split into components, use p_create_vector */
 auto it = ctx->allocated_vec.find(tmp.id());
 if (it != ctx->allocated_vec.end()) {
 unsigned index = dword << 1;
@@ -5549,7 +5549,7 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
 ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
 alignment, max_fetched_components);
 const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
-/* Adjust bytes needed in case we need to do a smaller load due to aligment.
+/* Adjust bytes needed in case we need to do a smaller load due to alignment.
 * If a larger format is selected, it's still OK to load a smaller amount from it.
 */
 bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
@@ -2037,11 +2037,11 @@ static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
 static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
 static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
 /* pre-GFX9 */
-static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
 static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
 static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
 static constexpr Stage tess_eval_es(HWStage::ES,
-SWStage::TES); /* tesselation evaluation before geometry */
+SWStage::TES); /* tessellation evaluation before geometry */
 static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
 /* Raytracing */
 static constexpr Stage raytracing_cs(HWStage::CS, SWStage::RT);
@@ -71,7 +71,7 @@ aco_opcode
 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
 {
 /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
-* 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use
+* 32-bit opcodes (VOP2) which allows to remove the temporary VGPR and to use
 * DPP with the arithmetic instructions. This requires to sign-extend.
 */
 switch (op) {
@@ -718,7 +718,7 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c
 
 for (unsigned i = 0; i < src.size(); i++) {
 if (!identity[i].isConstant() ||
-identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */
+identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
 if (ctx->program->gfx_level < GFX10)
 assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
 identity[i].physReg() == PhysReg{sitmp + i});
@@ -205,7 +205,7 @@ class Opcode(object):
 - name is the name of the opcode (prepend nir_op_ for the enum name)
 - all types are strings that get nir_type_ prepended to them
 - input_types is a list of types
-- algebraic_properties is a space-seperated string, where nir_op_is_ is
+- algebraic_properties is a space-separated string, where nir_op_is_ is
 prepended before each entry
 - const_expr is an expression or series of statements that computes the
 constant value of the opcode given the constant values of its inputs.
@@ -1135,7 +1135,7 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
 sel.offset() == 0 &&
 ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
 (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
-/* The undesireable upper bits are already shifted out. */
+/* The undesirable upper bits are already shifted out. */
 return;
 } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
 !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
@@ -3567,7 +3567,7 @@ apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
 unsigned sign_ext = extract->operands[3].constantValue();
 unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
 
-/* TODO: These are doable, but probably don't occour too often. */
+/* TODO: These are doable, but probably don't occur too often. */
 if (extract_idx || sign_ext || dst_bitsize != 32)
 return false;
 
@@ -694,7 +694,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
 break;
 
 /* don't use LDS/GDS instructions to hide latency since it can
-* significanly worsen LDS scheduling */
+* significantly worsen LDS scheduling */
 if (candidate->isDS() || !can_move_down) {
 add_to_hazard_query(&hq, candidate.get());
 ctx.mv.downwards_skip(cursor);
@@ -881,7 +881,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
 Temp var = phi->operands[i].getTemp();
 
 std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
-/* prevent the definining instruction from being DCE'd if it could be rematerialized */
+/* prevent the defining instruction from being DCE'd if it could be rematerialized */
 if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
 ctx.unused_remats.erase(ctx.remat[var].instr);
 
@@ -1001,7 +1001,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
 ctx.renames[pred_idx].find(phi->operands[i].getTemp());
 if (it != ctx.renames[pred_idx].end()) {
 phi->operands[i].setTemp(it->second);
-/* prevent the definining instruction from being DCE'd if it could be rematerialized */
+/* prevent the defining instruction from being DCE'd if it could be rematerialized */
 } else {
 auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
 if (remat_it != ctx.remat.end()) {
@@ -1117,7 +1117,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
 tmp = rename;
 } else {
 tmp = pair.first;
-/* prevent the definining instruction from being DCE'd if it could be rematerialized */
+/* prevent the defining instruction from being DCE'd if it could be rematerialized */
 if (ctx.remat.count(tmp))
 ctx.unused_remats.erase(ctx.remat[tmp].instr);
 }
@@ -1162,7 +1162,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
 std::vector<aco_ptr<Instruction>> instructions;
 unsigned idx = 0;
 
-/* phis are handled separetely */
+/* phis are handled separately */
 while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
 block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
 instructions.emplace_back(std::move(block->instructions[idx++]));
@@ -1191,7 +1191,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
 if (rename_it != ctx.renames[block_idx].end()) {
 op.setTemp(rename_it->second);
 } else {
-/* prevent its definining instruction from being DCE'd if it could be rematerialized */
+/* prevent its defining instruction from being DCE'd if it could be rematerialized */
 auto remat_it = ctx.remat.find(op.getTemp());
 if (remat_it != ctx.remat.end()) {
 ctx.unused_remats.erase(remat_it->second.instr);
@@ -459,7 +459,7 @@ public:
 free(buffer);
 }
 
-/* Delete copy-constructor and -assigment to avoid double free() */
+/* Delete copy-constructor and -assignment to avoid double free() */
 monotonic_buffer_resource(const monotonic_buffer_resource&) = delete;
 monotonic_buffer_resource& operator=(const monotonic_buffer_resource&) = delete;
 
@@ -6,5 +6,5 @@ The submit ioctl is stubbed out to not execute anything.
 Export `MESA_LOADER_DRIVER_OVERRIDE=r300
 LD_PRELOAD=$prefix/lib/libradeon_noop_drm_shim.so`. (or r600 for r600-class HW)
 
-By default, rv515 is exposed. The chip can be selected an enviornment
+By default, rv515 is exposed. The chip can be selected an environment
 variable like `RADEON_GPU_ID=CAYMAN` or `RADEON_GPU_ID=0x6740`.
@@ -872,7 +872,7 @@ void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_
 *
 * where d is the depth of the texture array and layer
 * comes from the component indicated in the tables below.
-* Workaroudn for an issue where the layer is taken from a
+* Workaround for an issue where the layer is taken from a
 * helper invocation which happens to fall on a different
 * layer due to extrapolation."
 *
@@ -1972,7 +1972,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
 if (atomic) {
 data_type = LLVMTypeOf(a->data[0]);
 } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
-/* Image stores might have been shrinked using the format. */
+/* Image stores might have been shrunk using the format. */
 data_type = LLVMTypeOf(a->data[0]);
 dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
 } else {
@@ -450,7 +450,7 @@ struct waterfall_context {
 * to implement the body.
 *
 * params:
-* - ctx is the usal nir context
+* - ctx is the usual nir context
 * - wctx is a temporary struct containing some loop info. Can be left uninitialized.
 * - value is the possibly divergent value for which we built the loop
 * - divergent is whether value is actually divergent. If false we just pass
@@ -218,7 +218,7 @@ combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
 * tree depth for internal nodes)
 *
 * Dividing area by both relative costs will make it more likely that we merge nodes with
-* a hight child cost.
+* a high child cost.
 */
 float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
 float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
@@ -356,7 +356,7 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
 nir_ssa_def *load = loads[0];
 
 /* Extract the channels we actually need when we couldn't skip starting
-* components or had to emit more than one load instrinsic.
+* components or had to emit more than one load intrinsic.
 */
 if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
 load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
@@ -198,19 +198,19 @@ radix_sort_vk_destroy(radix_sort_vk_t * rs, //
 // must be honored. All alignments are power of 2.
 //
 // Input:
 // count : Maximum number of keyvals
 //
 // Outputs:
 // keyval_size : Size of a single keyval
 //
 // keyvals_size : Minimum size of the even and odd keyval buffers
 // keyvals_alignment : Alignment of each keyval buffer
 //
 // internal_size : Minimum size of internal buffer
-// internal_aligment : Alignment of the internal buffer
+// internal_alignment : Alignment of the internal buffer
 //
 // indirect_size : Minimum size of indirect buffer
-// indirect_aligment : Alignment of the indirect buffer
+// indirect_alignment : Alignment of the indirect buffer
 //
 // .keyvals_even/odd
 // -----------------
@@ -174,7 +174,7 @@ radv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCrea
 size += ycbcr_sampler_count * sizeof(struct vk_ycbcr_conversion_state);
 }
 
-/* We need to allocate decriptor set layouts off the device allocator with DEVICE scope because
+/* We need to allocate descriptor set layouts off the device allocator with DEVICE scope because
 * they are reference counted and may not be destroyed when vkDestroyDescriptorSetLayout is
 * called.
 */
@@ -70,7 +70,7 @@ class radv_llvm_per_thread_info {
 struct ac_compiler_passes *passes;
 };
 
-/* we have to store a linked list per thread due to the possiblity of multiple gpus being required */
+/* we have to store a linked list per thread due to the possibility of multiple gpus being required */
 static thread_local std::list<radv_llvm_per_thread_info> radv_llvm_per_thread_list;
 
 bool
|
@@ -2315,7 +2315,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
|
|||||||
if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
|
if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
|
||||||
device->cs_wave_size = 32;
|
device->cs_wave_size = 32;
|
||||||
|
|
||||||
/* For pixel shaders, wave64 is recommanded. */
|
/* For pixel shaders, wave64 is recommended. */
|
||||||
if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
|
if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
|
||||||
device->ps_wave_size = 32;
|
device->ps_wave_size = 32;
|
||||||
|
|
||||||
@@ -2641,7 +2641,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice,
 device->ws->query_value(device->ws, RADEON_GTT_USAGE);
 uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
 
-/* Compute the total free space that can be allocated for this process accross all heaps. */
+/* Compute the total free space that can be allocated for this process across all heaps. */
 uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
 
 memoryBudget->heapBudget[vram_vis_heap_idx] = total_free_space + total_internal_usage;
@@ -2673,7 +2673,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice,
 
 uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
 
-/* Compute the total free space that can be allocated for this process accross all heaps. */
+/* Compute the total free space that can be allocated for this process across all heaps. */
 uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
 
 /* Compute the remaining visible VRAM size for this process. */
@@ -4106,7 +4106,7 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
 if (enable_mrt_compaction) {
 blend.spi_shader_col_format = radv_compact_spi_shader_col_format(ps, &blend);
 
-/* In presense of MRT holes (ie. the FS exports MRT1 but not MRT0), the compiler will remap
+/* In presence of MRT holes (ie. the FS exports MRT1 but not MRT0), the compiler will remap
 * them, so that only MRT0 is exported and the driver will compact SPI_SHADER_COL_FORMAT to
 * match what the FS actually exports. Though, to make sure the hw remapping works as
 * expected, we should also clear color attachments without exports in CB_SHADER_MASK.
@@ -1789,7 +1789,7 @@ struct radv_cmd_buffer {
 } ace_internal;
 
 /**
-* Whether a query pool has been resetted and we have to flush caches.
+* Whether a query pool has been reset and we have to flush caches.
 */
 bool pending_reset_query;
 
@@ -1466,7 +1466,7 @@ emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
 if (cmd_buffer->pending_reset_query) {
 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
 /* Only need to flush caches if the query pool size is
-* large enough to be resetted using the compute shader
+* large enough to be reset using the compute shader
 * path. Small pools don't need any cache flushes
 * because we use a CP dma clear.
 */
@@ -1242,7 +1242,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
 
 rra_dump_chunk_description(accel_struct_offsets[i],
 sizeof(struct rra_accel_struct_chunk_header), accel_struct_size,
-"RawAccelStruc", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
+"RawAccelStruct", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
 }
 
 uint64_t file_end = (uint64_t)ftell(file);
@@ -108,7 +108,7 @@ intersect_ray_amd_software_box(struct radv_device *device, nir_builder *b, nir_s
 nir_store_var(b, child_indices,
 nir_imm_ivec4(b, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu), 0xf);
 
-/* Need to remove infinities here because otherwise we get nasty NaN propogation
+/* Need to remove infinities here because otherwise we get nasty NaN propagation
 * if the direction has 0s in it. */
 /* inv_dir = clamp(inv_dir, -FLT_MAX, FLT_MAX); */
 inv_dir = nir_fclamp(b, inv_dir, nir_imm_float(b, -FLT_MAX), nir_imm_float(b, FLT_MAX));
@@ -238,7 +238,7 @@ intersect_ray_amd_software_tri(struct radv_device *device, nir_builder *b, nir_s
 nir_ssa_def *k_indices[3] = {kx, ky, kz};
 nir_ssa_def *k = nir_vec(b, k_indices, 3);
 
-/* Swap kx and ky dimensions to preseve winding order */
+/* Swap kx and ky dimensions to preserve winding order */
 unsigned swap_xy_swizzle[4] = {1, 0, 2, 3};
 k = nir_bcsel(b, nir_flt(b, nir_vector_extract(b, dir, kz), nir_imm_float(b, 0.0f)),
 nir_swizzle(b, k, swap_xy_swizzle, 3), k);
@@ -2080,7 +2080,7 @@ radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config,
 
 size += code_dw * sizeof(uint32_t) + sizeof(struct radv_shader_binary_legacy);
 
-/* We need to calloc to prevent unintialized data because this will be used
+/* We need to calloc to prevent uninitialized data because this will be used
 * directly for the disk cache. Uninitialized data can appear because of
 * padding in the struct or because legacy_binary->data can be at an offset
 * from the start less than sizeof(radv_shader_binary_legacy). */
@@ -307,7 +307,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu
 if (pdevice->rad_info.gfx_level >= GFX11) {
 /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
 * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
-* substract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
+* subtract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
 *
 * 1) get the current buffer base address for this SE
 * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned