From e5bfce6f46e65aebf5ef34e7796833e0d9445185 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Tue, 30 Jan 2024 08:35:37 +0100
Subject: [PATCH] broadcom/compiler: support subgroup reduction operations
 from fragment shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In fragment shaders these instructions consider a lane active when any
lane in the same quad is active, which is not what we want. To fix this,
we include the current sample mask in the condition mask used with these
instructions, limiting lane selection to the lanes that are really
active.

Reviewed-by: Alejandro Piñeiro
Part-of:
---
 src/broadcom/compiler/nir_to_vir.c | 128 +++++++++++++----------------
 1 file changed, 56 insertions(+), 72 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 00120340175..e6ed160523a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -3280,6 +3280,49 @@ emit_load_local_invocation_index(struct v3d_compile *c)
                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
 }
 
+/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf)
+ * in fragment shaders, a lane is considered active if sample flags are set
+ * for *any* lane in the same quad. This is not what we want. To fix this we
+ * emit MSF to get the active lanes and produce a condition that we can
+ * then use with these operations to limit execution to only the lanes
+ * that are really active in each quad. Further, we also need to exclude
+ * lanes that may be disabled by non-uniform control flow.
+ */
+static enum v3d_qpu_cond
+setup_subgroup_reduction_condition(struct v3d_compile *c)
+{
+        assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
+               c->s->info.stage == MESA_SHADER_COMPUTE);
+
+        enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
+
+        /* Produce condition for 'lane is active' from current sample flags.
+         * Only required for fragment shaders.
+         */
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), vir_MSF(c)),
+                           V3D_QPU_PF_PUSHZ);
+                cond = V3D_QPU_COND_IFNA;
+        }
+
+        /* If we are in non-uniform control flow, update the condition to
+         * also limit lanes to those in the current execution mask.
+         */
+        if (vir_in_nonuniform_control_flow(c)) {
+                if (cond == V3D_QPU_COND_IFNA) {
+                        vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                                   V3D_QPU_UF_NORNZ);
+                } else {
+                        assert(cond == V3D_QPU_COND_NONE);
+                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                                   V3D_QPU_PF_PUSHZ);
+                }
+                cond = V3D_QPU_COND_IFA;
+        }
+
+        return cond;
+}
+
 static void
 emit_compute_barrier(struct v3d_compile *c)
 {
@@ -3840,21 +3883,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_ballot: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
                 struct qreg res = vir_get_temp(c);
-                if (vir_in_nonuniform_control_flow(c)) {
-                        /* Ballot uses the MSF mask and the condition mask to
-                         * identify active lanes. Particularly, it uses the
-                         * condition mask to filter out lanes disabled by
-                         * control flow.
-                         */
-                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                        vir_set_cond(vir_BALLOT_dest(c, res, value),
-                                     V3D_QPU_COND_IFA);
-                } else {
-                        vir_BALLOT_dest(c, res, value);
-                }
-
+                vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
                 break;
         }
@@ -3871,21 +3902,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_read_first_invocation: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
                 struct qreg res = vir_get_temp(c);
-                if (vir_in_nonuniform_control_flow(c)) {
-                        /* Bcastf uses the MSF mask and the condition mask to
-                         * identify active lanes. Particularly, it uses the
-                         * condition mask to filter out lanes disabled by
-                         * control flow.
-                         */
-                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                        vir_set_cond(vir_BCASTF_dest(c, res, value),
-                                     V3D_QPU_COND_IFA);
-                } else {
-                        vir_BCASTF_dest(c, res, value);
-                }
-
+                vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
                 break;
         }
@@ -3903,25 +3922,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_ieq: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
                 struct qreg res = vir_get_temp(c);
-                if (vir_in_nonuniform_control_flow(c)) {
-                        /* Alleq uses the MSF mask and the condition mask to
-                         * identify active lanes. Particularly, it uses the
-                         * condition mask to filter out lanes disabled by
-                         * control flow.
-                         */
-                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                        vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
-                                     vir_ALLEQ_dest(c, res, value) :
-                                     vir_ALLFEQ_dest(c, res, value),
-                                     V3D_QPU_COND_IFA);
-                } else {
-                        if (instr->intrinsic == nir_intrinsic_vote_ieq)
-                                vir_ALLEQ_dest(c, res, value);
-                        else
-                                vir_ALLFEQ_dest(c, res, value);
-                }
+                vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
+                             vir_ALLEQ_dest(c, res, value) :
+                             vir_ALLFEQ_dest(c, res, value),
+                             cond);
 
                 /* Produce boolean result */
                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
@@ -3934,20 +3940,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_all: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
                 struct qreg res = vir_get_temp(c);
-                if (vir_in_nonuniform_control_flow(c)) {
-                        /* Alleq uses the MSF mask and the condition mask to
-                         * identify active lanes. Particularly, it uses the
-                         * condition mask to filter out lanes disabled by
-                         * control flow.
-                         */
-                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                        vir_set_cond(vir_ALLEQ_dest(c, res, value),
-                                     V3D_QPU_COND_IFA);
-                } else {
-                        vir_ALLEQ_dest(c, res, value);
-                }
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
 
                 /* We want to check if 'all lanes are equal (alleq != 0) and
                  * their value is True (value != 0)'.
@@ -3969,20 +3964,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_any: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
+                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
                 struct qreg res = vir_get_temp(c);
-                if (vir_in_nonuniform_control_flow(c)) {
-                        /* Alleq uses the MSF mask and the condition mask to
-                         * identify active lanes. Particularly, it uses the
-                         * condition mask to filter out lanes disabled by
-                         * control flow.
-                         */
-                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                        vir_set_cond(vir_ALLEQ_dest(c, res, value),
-                                     V3D_QPU_COND_IFA);
-                } else {
-                        vir_ALLEQ_dest(c, res, value);
-                }
+                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
 
                 /* We want to check 'not (all lanes are equal (alleq != 0)'
                  * and their value is False (value == 0))'.
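
Editor's note, not part of the patch: below is a minimal, standalone C model of
the lane-selection problem the new helper addresses. Everything in it is made
up for illustration (LANES, quad_lane_active, fixed_lane_active, the enabled[]
array); none of it is Mesa or VIR API. It simplifies MSF to a byte of sample
flags per lane and the execute mask to a per-lane bool; in the real helper the
execute-mask sense is inverted (in non-uniform control flow a lane is active
when c->execute is zero, which is why the patch uses PUSHZ/NORNZ), but the
selection logic being modeled is the same.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LANES 16 /* one 16-lane subgroup: 4 quads of 4 lanes */

/* Hardware view for reductions before the fix: a lane counts as active
 * if sample flags are set for *any* lane in its quad.
 */
static bool
quad_lane_active(const uint8_t msf[LANES], int lane)
{
        int quad = lane & ~3;
        return (msf[quad] | msf[quad + 1] |
                msf[quad + 2] | msf[quad + 3]) != 0;
}

/* Desired view after the fix: a lane participates only if its own
 * sample flags are set and control flow has not disabled it.
 */
static bool
fixed_lane_active(const uint8_t msf[LANES],
                  const bool enabled[LANES], int lane)
{
        return msf[lane] != 0 && enabled[lane];
}

int
main(void)
{
        /* Lanes 0-2 of the first quad have live samples, lane 3 does
         * not (e.g. a helper invocation), and lane 2 has been disabled
         * by non-uniform control flow.
         */
        uint8_t msf[LANES] = { 0x1, 0x1, 0x1, 0x0 };
        bool enabled[LANES];
        for (int i = 0; i < LANES; i++)
                enabled[i] = (i != 2);

        for (int lane = 0; lane < 4; lane++) {
                printf("lane %d: quad-granular=%d fixed=%d\n", lane,
                       quad_lane_active(msf, lane),
                       fixed_lane_active(msf, enabled, lane));
        }
        return 0;
}

Running this prints quad-granular=1 for all four lanes but fixed=1 only for
lanes 0 and 1: lane 3 has no live samples and lane 2 was disabled by control
flow. That discrepancy is exactly what the condition produced by
setup_subgroup_reduction_condition() removes when applied to the ballot,
bcastf, alleq and allfeq operations above.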