radv/aco,aco: set lower_fmod
This simplifies ACO and allows the lowered code to be optimized (in particular, constant folded).

Totals from affected shaders:
SGPRS: 1776 -> 1776 (0.00 %)
VGPRS: 1436 -> 1436 (0.00 %)
Spilled SGPRs: 0 -> 0 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 203452 -> 203564 (0.06 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 103 -> 103 (0.00 %)

At least some of the code size increase seems to be from literals being applied to instructions as a result of constant folding.

v2: remove fmod/frem handling in init_context()

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
@@ -1225,35 +1225,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_op_fmod:
|
||||
case nir_op_frem: {
|
||||
if (dst.size() == 1) {
|
||||
Temp rcp = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]));
|
||||
Temp mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]), rcp);
|
||||
|
||||
aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f32 : aco_opcode::v_trunc_f32;
|
||||
Temp floor = bld.vop1(op, bld.def(v1), mul);
|
||||
|
||||
mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]), floor);
|
||||
bld.vop2(aco_opcode::v_sub_f32, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
|
||||
} else if (dst.size() == 2) {
|
||||
Temp rcp = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]));
|
||||
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[0]), rcp);
|
||||
|
||||
aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f64 : aco_opcode::v_trunc_f64;
|
||||
Temp floor = bld.vop1(op, bld.def(v1), mul);
|
||||
|
||||
mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]), floor);
|
||||
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
|
||||
VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
|
||||
sub->neg[1] = true;
|
||||
} else {
|
||||
fprintf(stderr, "Unimplemented NIR instr bit size: ");
|
||||
nir_print_instr(&instr->instr, stderr);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_op_fmax: {
|
||||
if (dst.size() == 1) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
|
||||
|
@@ -201,8 +201,6 @@ void init_context(isel_context *ctx, nir_shader *shader)
|
||||
case nir_op_fmax3:
|
||||
case nir_op_fmin3:
|
||||
case nir_op_fmed3:
|
||||
case nir_op_fmod:
|
||||
case nir_op_frem:
|
||||
case nir_op_fneg:
|
||||
case nir_op_fabs:
|
||||
case nir_op_fsat:
|
||||
|
@@ -91,6 +91,7 @@ static const struct nir_shader_compiler_options nir_options_aco = {
|
||||
.lower_flrp64 = true,
|
||||
.lower_device_index_to_zero = true,
|
||||
.lower_fdiv = true,
|
||||
.lower_fmod = true,
|
||||
.lower_bitfield_insert_to_bitfield_select = true,
|
||||
.lower_bitfield_extract = true,
|
||||
.lower_pack_snorm_2x16 = true,
|
||||
|
Reference in New Issue
Block a user