amd: fix typos
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22432>

committed by Marge Bot
parent 1aab6820fb
commit aea48a4ff1
@@ -309,7 +309,7 @@ void ac_get_hs_info(struct radeon_info *info,
  * store the task payload which is passed to mesh shaders.
  *
  * The driver only needs to create this BO once,
- * and it will always be able to accomodate the maximum needed
+ * and it will always be able to accommodate the maximum needed
  * task payload size.
  *
  * The following memory layout is used:
@@ -241,7 +241,7 @@ cull_small_primitive_line(nir_builder *b, nir_ssa_def *pos[3][4],
  * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
  * because it doesn't enter any diamond and thus can't exit any diamond.
  *
- * The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding
+ * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
  * box test is used to determine whether a line is entirely inside any square (diamond).
  *
  * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
@@ -264,7 +264,7 @@ cull_small_primitive_line(nir_builder *b, nir_ssa_def *pos[3][4],
       v1[chan] = nir_ffma(b, pos[1][chan], vp_scale, vp_translate);
    }
 
-   /* Rotate the viewport by 45 degress, so that diamonds become squares. */
+   /* Rotate the viewport by 45 degrees, so that diamonds become squares. */
    rotate_45degrees(b, v0);
    rotate_45degrees(b, v1);
 
@@ -62,7 +62,7 @@ emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, ni
    unsigned full_dwords = total_bytes / 4u;
    unsigned remaining_bytes = total_bytes - full_dwords * 4u;
 
-   /* Accomodate max number of split 64-bit loads */
+   /* Accommodate max number of split 64-bit loads */
    nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];
 
    /* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
@@ -1494,7 +1494,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
    /* Run culling algorithms if culling is enabled.
     *
     * NGG culling can be enabled or disabled in runtime.
-    * This is determined by a SGPR shader argument which is acccessed
+    * This is determined by a SGPR shader argument which is accessed
     * by the following NIR intrinsic.
     */
 
@@ -3233,7 +3233,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
 
    /* We want to export primitives to streamout buffer in sequence,
     * but not all vertices are alive or mark end of a primitive, so
-    * there're "holes". We don't need continous invocations to write
+    * there're "holes". We don't need continuous invocations to write
     * primitives to streamout buffer like final vertex export, so
     * just repack to get the sequence (export_seq) is enough, no need
     * to do compaction.
@@ -4018,7 +4018,7 @@ ms_emit_arrayed_outputs(nir_builder *b,
    nir_ssa_def *zero = nir_imm_int(b, 0);
 
    u_foreach_bit64(slot, mask) {
-      /* Should not occour here, handled separately. */
+      /* Should not occur here, handled separately. */
       assert(slot != VARYING_SLOT_PRIMITIVE_COUNT && slot != VARYING_SLOT_PRIMITIVE_INDICES);
 
       unsigned component_mask = s->output_info[slot].components_mask;
@@ -151,8 +151,8 @@ emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s)
 
    uint64_t outputs_written = b->shader->info.outputs_written;
    /* use outputs_written to determine export format as we use it to set
-    * R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store ouput,
-    * because store ouput may be optimized out.
+    * R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store output,
+    * because store output may be optimized out.
     */
    unsigned format =
       ac_get_spi_shader_z_format(outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH),
@@ -352,7 +352,7 @@ emit_ps_color_export(nir_builder *b, lower_ps_state *s, gl_frag_result slot, uns
       pack_op = nir_op_pack_snorm_2x16;
       break;
    default:
-      unreachable("unsupport color export format");
+      unreachable("unsupported color export format");
       break;
    }
 
@@ -192,7 +192,7 @@ static void ac_sqtt_fill_cpu_info(struct sqtt_file_chunk_cpu_info *chunk)
    if (os_get_total_physical_memory(&system_ram_size))
       chunk->system_ram_size = system_ram_size / (1024 * 1024);
 
-   /* Parse cpuinfo to get more detailled information. */
+   /* Parse cpuinfo to get more detailed information. */
    f = fopen("/proc/cpuinfo", "r");
    if (!f)
       return;
@@ -364,7 +364,7 @@ unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsi
    // Use the regularity properties of the combined format enum.
    //
    // Note: float is incompatible with 8-bit data formats,
-   // [us]{norm,scaled} are incomparible with 32-bit data formats.
+   // [us]{norm,scaled} are incompatible with 32-bit data formats.
    // [us]scaled are not writable.
    switch (nfmt) {
    case V_008F0C_BUF_NUM_FORMAT_UNORM:
@@ -4073,7 +4073,7 @@ void ac_check_shadowed_regs(enum amd_gfx_level gfx_level, enum radeon_family fam
       unsigned end_reg_offset = reg_offset + count * 4;
       unsigned end_range_offset = ranges[i].offset + ranges[i].size;
 
-      /* Test if the ranges interect. */
+      /* Test if the ranges intersect. */
       if (MAX2(ranges[i].offset, reg_offset) < MIN2(end_range_offset, end_reg_offset)) {
          /* Assertion: A register can be listed only once. */
         assert(!found);
@@ -793,7 +793,7 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *
       if (ret == ADDR_OK) {
          /* If the DCC memory isn't properly
           * aligned, the data are interleaved
-          * accross slices.
+          * across slices.
           */
          if (AddrDccOut->dccRamSizeAligned)
             dcc_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
@@ -327,7 +327,7 @@ Waiting for the VMEM/DS instruction to finish, a VALU or export instruction, or
 ### VALUTransUseHazard
 
 Triggered by:
-A VALU instrction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
+A VALU instruction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
 transcendental instructions in-between.
 
 Mitigated by:
@@ -105,7 +105,7 @@ This means that we need to insert `s_waitcnt` instructions (and its variants) so
 #### Resolve hazards and insert NOPs
 
 Some instructions require wait states or other instructions to resolve hazards which are not handled by the hardware.
-This pass makes sure that no known hazards occour.
+This pass makes sure that no known hazards occur.
 
 #### Emit program - Assembler
 
@@ -118,7 +118,7 @@ Which software stage gets executed on which hardware stage depends on what kind
 
 An important difference is that VS is always the first stage to run in SW models,
 whereas HW VS refers to the last HW stage before fragment shading in GCN/RDNA terminology.
-That's why, among other things, the HW VS is no longer used to execute the SW VS when tesselation or geometry shading are used.
+That's why, among other things, the HW VS is no longer used to execute the SW VS when tessellation or geometry shading are used.
 
 #### Glossary of software stages
 
@@ -786,7 +786,7 @@ get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
 
    /* extract a full dword if possible */
    if (tmp.bytes() >= (dword + 1) * 4) {
-      /* if the source is splitted into components, use p_create_vector */
+      /* if the source is split into components, use p_create_vector */
       auto it = ctx->allocated_vec.find(tmp.id());
       if (it != ctx->allocated_vec.end()) {
          unsigned index = dword << 1;
@@ -5549,7 +5549,7 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
    ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
                           alignment, max_fetched_components);
    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
-   /* Adjust bytes needed in case we need to do a smaller load due to aligment.
+   /* Adjust bytes needed in case we need to do a smaller load due to alignment.
     * If a larger format is selected, it's still OK to load a smaller amount from it.
     */
    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
@@ -2037,11 +2037,11 @@ static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
 static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
 static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
 /* pre-GFX9 */
-static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
 static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
 static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
 static constexpr Stage tess_eval_es(HWStage::ES,
-                                    SWStage::TES); /* tesselation evaluation before geometry */
+                                    SWStage::TES); /* tessellation evaluation before geometry */
 static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
 /* Raytracing */
 static constexpr Stage raytracing_cs(HWStage::CS, SWStage::RT);
@@ -71,7 +71,7 @@ aco_opcode
 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
 {
    /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
-    * 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use
+    * 32-bit opcodes (VOP2) which allows to remove the temporary VGPR and to use
     * DPP with the arithmetic instructions. This requires to sign-extend.
     */
    switch (op) {
@@ -718,7 +718,7 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c
 
    for (unsigned i = 0; i < src.size(); i++) {
       if (!identity[i].isConstant() ||
-          identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */
+          identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
          if (ctx->program->gfx_level < GFX10)
            assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
                   identity[i].physReg() == PhysReg{sitmp + i});
@@ -205,7 +205,7 @@ class Opcode(object):
       - name is the name of the opcode (prepend nir_op_ for the enum name)
       - all types are strings that get nir_type_ prepended to them
       - input_types is a list of types
-      - algebraic_properties is a space-seperated string, where nir_op_is_ is
+      - algebraic_properties is a space-separated string, where nir_op_is_ is
         prepended before each entry
       - const_expr is an expression or series of statements that computes the
         constant value of the opcode given the constant values of its inputs.
@@ -1135,7 +1135,7 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
               sel.offset() == 0 &&
               ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
                (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
-      /* The undesireable upper bits are already shifted out. */
+      /* The undesirable upper bits are already shifted out. */
       return;
    } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
               !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
@@ -3567,7 +3567,7 @@ apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
    unsigned sign_ext = extract->operands[3].constantValue();
    unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
 
-   /* TODO: These are doable, but probably don't occour too often. */
+   /* TODO: These are doable, but probably don't occur too often. */
    if (extract_idx || sign_ext || dst_bitsize != 32)
       return false;
 
@@ -694,7 +694,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
          break;
 
       /* don't use LDS/GDS instructions to hide latency since it can
-       * significanly worsen LDS scheduling */
+       * significantly worsen LDS scheduling */
       if (candidate->isDS() || !can_move_down) {
          add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
@@ -881,7 +881,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
          Temp var = phi->operands[i].getTemp();
 
          std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
-         /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+         /* prevent the defining instruction from being DCE'd if it could be rematerialized */
          if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
            ctx.unused_remats.erase(ctx.remat[var].instr);
 
@@ -1001,7 +1001,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
             ctx.renames[pred_idx].find(phi->operands[i].getTemp());
          if (it != ctx.renames[pred_idx].end()) {
             phi->operands[i].setTemp(it->second);
-         /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+         /* prevent the defining instruction from being DCE'd if it could be rematerialized */
          } else {
             auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
             if (remat_it != ctx.remat.end()) {
@@ -1117,7 +1117,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
          tmp = rename;
       } else {
          tmp = pair.first;
-         /* prevent the definining instruction from being DCE'd if it could be rematerialized */
+         /* prevent the defining instruction from being DCE'd if it could be rematerialized */
          if (ctx.remat.count(tmp))
            ctx.unused_remats.erase(ctx.remat[tmp].instr);
       }
@@ -1162,7 +1162,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
    std::vector<aco_ptr<Instruction>> instructions;
    unsigned idx = 0;
 
-   /* phis are handled separetely */
+   /* phis are handled separately */
    while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
           block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
       instructions.emplace_back(std::move(block->instructions[idx++]));
@@ -1191,7 +1191,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
          if (rename_it != ctx.renames[block_idx].end()) {
             op.setTemp(rename_it->second);
          } else {
-            /* prevent its definining instruction from being DCE'd if it could be rematerialized */
+            /* prevent its defining instruction from being DCE'd if it could be rematerialized */
             auto remat_it = ctx.remat.find(op.getTemp());
             if (remat_it != ctx.remat.end()) {
                ctx.unused_remats.erase(remat_it->second.instr);
@@ -459,7 +459,7 @@ public:
       free(buffer);
    }
 
-   /* Delete copy-constructor and -assigment to avoid double free() */
+   /* Delete copy-constructor and -assignment to avoid double free() */
    monotonic_buffer_resource(const monotonic_buffer_resource&) = delete;
    monotonic_buffer_resource& operator=(const monotonic_buffer_resource&) = delete;
 
@@ -6,5 +6,5 @@ The submit ioctl is stubbed out to not execute anything.
 Export `MESA_LOADER_DRIVER_OVERRIDE=r300
 LD_PRELOAD=$prefix/lib/libradeon_noop_drm_shim.so`. (or r600 for r600-class HW)
 
-By default, rv515 is exposed. The chip can be selected an enviornment
+By default, rv515 is exposed. The chip can be selected an environment
 variable like `RADEON_GPU_ID=CAYMAN` or `RADEON_GPU_ID=0x6740`.
@@ -872,7 +872,7 @@ void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_
     *
     * where d is the depth of the texture array and layer
     * comes from the component indicated in the tables below.
-    * Workaroudn for an issue where the layer is taken from a
+    * Workaround for an issue where the layer is taken from a
     * helper invocation which happens to fall on a different
     * layer due to extrapolation."
     *
@@ -1972,7 +1972,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
    if (atomic) {
       data_type = LLVMTypeOf(a->data[0]);
    } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
-      /* Image stores might have been shrinked using the format. */
+      /* Image stores might have been shrunk using the format. */
       data_type = LLVMTypeOf(a->data[0]);
       dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
    } else {
@@ -450,7 +450,7 @@ struct waterfall_context {
  * to implement the body.
  *
  * params:
- *  - ctx is the usal nir context
+ *  - ctx is the usual nir context
  *  - wctx is a temporary struct containing some loop info. Can be left uninitialized.
  *  - value is the possibly divergent value for which we built the loop
  *  - divergent is whether value is actually divergent. If false we just pass
@@ -218,7 +218,7 @@ combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
     * tree depth for internal nodes)
     *
     * Dividing area by both relative costs will make it more likely that we merge nodes with
-    * a hight child cost.
+    * a high child cost.
     */
    float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
    float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
@@ -356,7 +356,7 @@ lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs
    nir_ssa_def *load = loads[0];
 
    /* Extract the channels we actually need when we couldn't skip starting
-    * components or had to emit more than one load instrinsic.
+    * components or had to emit more than one load intrinsic.
     */
    if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
       load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
@@ -207,10 +207,10 @@ radix_sort_vk_destroy(radix_sort_vk_t * rs, //
 //   keyvals_alignment : Alignment of each keyval buffer
 //
 //   internal_size     : Minimum size of internal buffer
-//   internal_aligment : Alignment of the internal buffer
+//   internal_alignment : Alignment of the internal buffer
 //
 //   indirect_size     : Minimum size of indirect buffer
-//   indirect_aligment : Alignment of the indirect buffer
+//   indirect_alignment : Alignment of the indirect buffer
 //
 // .keyvals_even/odd
 // -----------------
@@ -174,7 +174,7 @@ radv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCrea
       size += ycbcr_sampler_count * sizeof(struct vk_ycbcr_conversion_state);
    }
 
-   /* We need to allocate decriptor set layouts off the device allocator with DEVICE scope because
+   /* We need to allocate descriptor set layouts off the device allocator with DEVICE scope because
    * they are reference counted and may not be destroyed when vkDestroyDescriptorSetLayout is
    * called.
    */
@@ -70,7 +70,7 @@ class radv_llvm_per_thread_info {
    struct ac_compiler_passes *passes;
 };
 
-/* we have to store a linked list per thread due to the possiblity of multiple gpus being required */
+/* we have to store a linked list per thread due to the possibility of multiple gpus being required */
 static thread_local std::list<radv_llvm_per_thread_info> radv_llvm_per_thread_list;
 
 bool
@@ -2315,7 +2315,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
    if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
       device->cs_wave_size = 32;
 
-   /* For pixel shaders, wave64 is recommanded. */
+   /* For pixel shaders, wave64 is recommended. */
    if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
       device->ps_wave_size = 32;
 
@@ -2641,7 +2641,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice,
          device->ws->query_value(device->ws, RADEON_GTT_USAGE);
       uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
 
-      /* Compute the total free space that can be allocated for this process accross all heaps. */
+      /* Compute the total free space that can be allocated for this process across all heaps. */
       uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
 
       memoryBudget->heapBudget[vram_vis_heap_idx] = total_free_space + total_internal_usage;
@@ -2673,7 +2673,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice,
 
       uint64_t total_usage = MAX2(total_internal_usage, total_system_usage);
 
-      /* Compute the total free space that can be allocated for this process accross all heaps. */
+      /* Compute the total free space that can be allocated for this process across all heaps. */
       uint64_t total_free_space = total_heap_size - MIN2(total_heap_size, total_usage);
 
       /* Compute the remaining visible VRAM size for this process. */
@@ -4106,7 +4106,7 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
    if (enable_mrt_compaction) {
       blend.spi_shader_col_format = radv_compact_spi_shader_col_format(ps, &blend);
 
-      /* In presense of MRT holes (ie. the FS exports MRT1 but not MRT0), the compiler will remap
+      /* In presence of MRT holes (ie. the FS exports MRT1 but not MRT0), the compiler will remap
       * them, so that only MRT0 is exported and the driver will compact SPI_SHADER_COL_FORMAT to
       * match what the FS actually exports. Though, to make sure the hw remapping works as
       * expected, we should also clear color attachments without exports in CB_SHADER_MASK.
@@ -1789,7 +1789,7 @@ struct radv_cmd_buffer {
    } ace_internal;
 
    /**
-    * Whether a query pool has been resetted and we have to flush caches.
+    * Whether a query pool has been reset and we have to flush caches.
     */
    bool pending_reset_query;
 
@@ -1466,7 +1466,7 @@ emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
    if (cmd_buffer->pending_reset_query) {
       if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
          /* Only need to flush caches if the query pool size is
-          * large enough to be resetted using the compute shader
+          * large enough to be reset using the compute shader
           * path. Small pools don't need any cache flushes
           * because we use a CP dma clear.
           */
@@ -1242,7 +1242,7 @@ radv_rra_dump_trace(VkQueue vk_queue, char *filename)
 
       rra_dump_chunk_description(accel_struct_offsets[i],
                                  sizeof(struct rra_accel_struct_chunk_header), accel_struct_size,
-                                 "RawAccelStruc", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
+                                 "RawAccelStruct", RADV_RRA_CHUNK_ID_ACCEL_STRUCT, file);
    }
 
    uint64_t file_end = (uint64_t)ftell(file);
@@ -108,7 +108,7 @@ intersect_ray_amd_software_box(struct radv_device *device, nir_builder *b, nir_s
    nir_store_var(b, child_indices,
                  nir_imm_ivec4(b, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu), 0xf);
 
-   /* Need to remove infinities here because otherwise we get nasty NaN propogation
+   /* Need to remove infinities here because otherwise we get nasty NaN propagation
     * if the direction has 0s in it. */
    /* inv_dir = clamp(inv_dir, -FLT_MAX, FLT_MAX); */
    inv_dir = nir_fclamp(b, inv_dir, nir_imm_float(b, -FLT_MAX), nir_imm_float(b, FLT_MAX));
@@ -238,7 +238,7 @@ intersect_ray_amd_software_tri(struct radv_device *device, nir_builder *b, nir_s
    nir_ssa_def *k_indices[3] = {kx, ky, kz};
    nir_ssa_def *k = nir_vec(b, k_indices, 3);
 
-   /* Swap kx and ky dimensions to preseve winding order */
+   /* Swap kx and ky dimensions to preserve winding order */
    unsigned swap_xy_swizzle[4] = {1, 0, 2, 3};
    k = nir_bcsel(b, nir_flt(b, nir_vector_extract(b, dir, kz), nir_imm_float(b, 0.0f)),
                  nir_swizzle(b, k, swap_xy_swizzle, 3), k);
@@ -2080,7 +2080,7 @@ radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config,
 
    size += code_dw * sizeof(uint32_t) + sizeof(struct radv_shader_binary_legacy);
 
-   /* We need to calloc to prevent unintialized data because this will be used
+   /* We need to calloc to prevent uninitialized data because this will be used
     * directly for the disk cache. Uninitialized data can appear because of
     * padding in the struct or because legacy_binary->data can be at an offset
     * from the start less than sizeof(radv_shader_binary_legacy). */
@@ -307,7 +307,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu
    if (pdevice->rad_info.gfx_level >= GFX11) {
       /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
        * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
-       * substract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
+       * subtract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
        *
        * 1) get the current buffer base address for this SE
        * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned