amd,zink: remove options.varying_estimate_instr_cost callbacks

They are a maintenance burden since they would need changes to support
any new instruction types that nir_opt_varyings becomes able to move
between shaders, and they are almost identical to
default_varying_estimate_instr_cost, so just use that.

The cost threshold is adjusted for AMD because
default_varying_estimate_instr_cost is slightly different.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32424>
Marek Olšák
2024-11-30 22:55:41 -05:00
committed by Marge Bot
parent c0de78f120
commit d8468d5463
5 changed files with 1 addition and 207 deletions
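
For context on what these callbacks feed into: nir_opt_varyings estimates the cost of an expression it could move from the consumer into the producer by summing a per-instruction estimate, and only moves the code if the total stays within the driver's varying_expression_max_cost threshold. The sketch below is purely illustrative, not the actual pass code; the helper names (try_move_expression, move_into_producer) and the exact comparison are assumptions, while the two option fields and default_varying_estimate_instr_cost come from this commit. It shows why dropping the driver estimator in favour of the default one also means retuning the threshold.

/* Illustrative sketch only: helper names and control flow are made up and
 * do not match the real nir_opt_varyings implementation. */
static bool
try_move_expression(nir_shader *producer, nir_shader *consumer,
                    const nir_shader_compiler_options *opts,
                    nir_instr **expr, unsigned num_instrs)
{
   unsigned max_cost = opts->varying_expression_max_cost(producer, consumer);
   unsigned cost = 0;

   for (unsigned i = 0; i < num_instrs; i++) {
      /* After this commit, AMD and zink leave varying_estimate_instr_cost
       * NULL, so the pass falls back to its built-in default estimator. */
      cost += opts->varying_estimate_instr_cost ?
              opts->varying_estimate_instr_cost(expr[i]) :
              default_varying_estimate_instr_cost(expr[i]);
   }

   if (cost > max_cost)
      return false;

   move_into_producer(expr, num_instrs); /* hypothetical helper */
   return true;
}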

@@ -1601,112 +1601,13 @@ ac_nir_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer)
/* TCS->TES and VS->TES (OpenGL only) */
case MESA_SHADER_FRAGMENT:
/* Up to 3 uniforms and 5 ALUs. */
return 14;
return 12;
default:
unreachable("unexpected shader stage");
}
}
unsigned
ac_nir_varying_estimate_instr_cost(nir_instr *instr)
{
unsigned dst_bit_size, src_bit_size, num_dst_dwords;
nir_op alu_op;
/* This is a very loose approximation based on gfx10. */
switch (instr->type) {
case nir_instr_type_alu:
dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
alu_op = nir_instr_as_alu(instr)->op;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (alu_op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec5:
case nir_op_vec8:
case nir_op_vec16:
case nir_op_fabs:
case nir_op_fneg:
case nir_op_fsat:
return 0;
case nir_op_imul:
case nir_op_umul_low:
return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;
case nir_op_imul_high:
case nir_op_umul_high:
case nir_op_imul_2x32_64:
case nir_op_umul_2x32_64:
return 4;
case nir_op_fexp2:
case nir_op_flog2:
case nir_op_frcp:
case nir_op_frsq:
case nir_op_fsqrt:
case nir_op_fsin:
case nir_op_fcos:
case nir_op_fsin_amd:
case nir_op_fcos_amd:
return 4; /* FP16 & FP32. */
case nir_op_fpow:
return 4 + 1 + 4; /* log2 + mul + exp2 */
case nir_op_fsign:
return dst_bit_size == 64 ? 4 : 3; /* See ac_build_fsign. */
case nir_op_idiv:
case nir_op_udiv:
case nir_op_imod:
case nir_op_umod:
case nir_op_irem:
return dst_bit_size == 64 ? 80 : 40;
case nir_op_fdiv:
return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */
case nir_op_fmod:
case nir_op_frem:
return dst_bit_size == 64 ? 80 : 8;
default:
/* Double opcodes. Comparisons have always full performance. */
if ((dst_bit_size == 64 &&
nir_op_infos[alu_op].output_type & nir_type_float) ||
(dst_bit_size >= 8 && src_bit_size == 64 &&
nir_op_infos[alu_op].input_types[0] & nir_type_float))
return 16;
return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
}
case nir_instr_type_intrinsic:
dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_deref:
/* Uniform or UBO load.
* Set a low cost to balance the number of scalar loads and ALUs.
*/
return 3 * num_dst_dwords;
default:
unreachable("unexpected intrinsic");
}
default:
unreachable("unexpected instr type");
}
}
typedef struct {
enum amd_gfx_level gfx_level;
bool use_llvm;

@@ -317,9 +317,6 @@ ac_nir_opt_pack_half(nir_shader *shader, enum amd_gfx_level gfx_level);
unsigned
ac_nir_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer);
unsigned
ac_nir_varying_estimate_instr_cost(nir_instr *instr);
bool
ac_nir_opt_shared_append(nir_shader *shader);

@@ -64,7 +64,6 @@ get_nir_options_for_stage(struct radv_physical_device *pdev, gl_shader_stage sta
options->max_unroll_iterations_aggressive = 128;
options->lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv;
options->io_options |= nir_io_mediump_is_32bit;
options->varying_estimate_instr_cost = ac_nir_varying_estimate_instr_cost;
options->varying_expression_max_cost = ac_nir_varying_expression_max_cost;
}

@@ -1595,5 +1595,4 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
BITFIELD_BIT(MESA_SHADER_TESS_EVAL);
options->support_indirect_outputs = BITFIELD_BIT(MESA_SHADER_TESS_CTRL);
options->varying_expression_max_cost = si_varying_expression_max_cost;
options->varying_estimate_instr_cost = ac_nir_varying_estimate_instr_cost;
}
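
As the radv and radeonsi hunks above show, drivers now wire up only the threshold callback and leave the estimator slot alone. A minimal sketch of the resulting setup: the surrounding function name is illustrative, while the two option fields and ac_nir_varying_expression_max_cost are taken from the diff.

/* Illustrative driver-side setup after this commit. */
static void
init_varying_cost_options(nir_shader_compiler_options *options)
{
   /* Keep a driver-specific threshold for movable varying expressions... */
   options->varying_expression_max_cost = ac_nir_varying_expression_max_cost;
   /* ...but install no custom estimator: leaving this NULL makes
    * nir_opt_varyings use default_varying_estimate_instr_cost instead. */
   options->varying_estimate_instr_cost = NULL;
}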

@@ -1265,106 +1265,6 @@ amd_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer)
}
}
/* from radeonsi */
static unsigned
amd_varying_estimate_instr_cost(nir_instr *instr)
{
unsigned dst_bit_size, src_bit_size, num_dst_dwords;
nir_op alu_op;
/* This is a very loose approximation based on gfx10. */
switch (instr->type) {
case nir_instr_type_alu:
dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
alu_op = nir_instr_as_alu(instr)->op;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (alu_op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec5:
case nir_op_vec8:
case nir_op_vec16:
case nir_op_fabs:
case nir_op_fneg:
case nir_op_fsat:
return 0;
case nir_op_imul:
case nir_op_umul_low:
return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;
case nir_op_imul_high:
case nir_op_umul_high:
case nir_op_imul_2x32_64:
case nir_op_umul_2x32_64:
return 4;
case nir_op_fexp2:
case nir_op_flog2:
case nir_op_frcp:
case nir_op_frsq:
case nir_op_fsqrt:
case nir_op_fsin:
case nir_op_fcos:
case nir_op_fsin_amd:
case nir_op_fcos_amd:
return 4; /* FP16 & FP32. */
case nir_op_fpow:
return 4 + 1 + 4; /* log2 + mul + exp2 */
case nir_op_fsign:
return dst_bit_size == 64 ? 4 : 3; /* See ac_build_fsign. */
case nir_op_idiv:
case nir_op_udiv:
case nir_op_imod:
case nir_op_umod:
case nir_op_irem:
return dst_bit_size == 64 ? 80 : 40;
case nir_op_fdiv:
return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */
case nir_op_fmod:
case nir_op_frem:
return dst_bit_size == 64 ? 80 : 8;
default:
/* Double opcodes. Comparisons have always full performance. */
if ((dst_bit_size == 64 &&
nir_op_infos[alu_op].output_type & nir_type_float) ||
(dst_bit_size >= 8 && src_bit_size == 64 &&
nir_op_infos[alu_op].input_types[0] & nir_type_float))
return 16;
return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
}
case nir_instr_type_intrinsic:
dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_deref:
/* Uniform or UBO load.
* Set a low cost to balance the number of scalar loads and ALUs.
*/
return 3 * num_dst_dwords;
default:
unreachable("unexpected intrinsic");
}
default:
unreachable("unexpected instr type");
}
}
void
zink_screen_init_compiler(struct zink_screen *screen)
{
@@ -1438,12 +1338,10 @@ zink_screen_init_compiler(struct zink_screen *screen)
case VK_DRIVER_ID_AMD_OPEN_SOURCE:
case VK_DRIVER_ID_AMD_PROPRIETARY:
screen->nir_options.varying_expression_max_cost = amd_varying_expression_max_cost;
screen->nir_options.varying_estimate_instr_cost = amd_varying_estimate_instr_cost;
break;
default:
mesa_logw("zink: instruction costs not implemented for this implementation!");
screen->nir_options.varying_expression_max_cost = amd_varying_expression_max_cost;
screen->nir_options.varying_estimate_instr_cost = amd_varying_estimate_instr_cost;
}
} else {
screen->nir_options.io_options |= nir_io_dont_optimize;