nir/lower_idiv: add options to use fp32 for 8-bit division lowering

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10081>
2021-04-07 19:17:46 +01:00
parent 7db8d307bc
commit a2619b97f5
11 changed files with 74 additions and 32 deletions
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3319,7 +3319,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
         /* TODO: Implement nir_op_uadd_sat with LLVM. */
         if (!radv_use_llvm_for_stage(device, i))
            nir_opt_idiv_const(nir[i], 8);
-         nir_lower_idiv(nir[i], nir_lower_idiv_precise);
+
+         nir_lower_idiv(nir[i], &(nir_lower_idiv_options){
+                                   .imprecise_32bit_lowering = false,
+                                   .allow_fp16 = true,
+                                });
         nir_opt_sink(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
         nir_opt_move(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -1397,7 +1397,11 @@ v3d_attempt_compile(struct v3d_compile *c)
        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
-        NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast);
+        nir_lower_idiv_options idiv_options = {
+                .imprecise_32bit_lowering = true,
+                .allow_fp16 = true,
+        };
+        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
        if (c->key->robust_buffer_access) {
           /* v3d_nir_lower_robust_buffer_access assumes constant buffer
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4855,19 +4855,26 @@ enum nir_lower_non_uniform_access_type {
 bool nir_lower_non_uniform_access(nir_shader *shader,
                                  enum nir_lower_non_uniform_access_type);
-enum nir_lower_idiv_path {
+typedef struct {
-   /* This path is based on NV50LegalizeSSA::handleDIV(). It is the faster of
+   /* If true, a 32-bit division lowering based on NV50LegalizeSSA::handleDIV()
-    * the two but it is not exact in some cases (for example, 1091317713u /
+    * is used. It is the faster of the two but it is not exact in some cases
-    * 1034u gives 5209173 instead of 1055432) */
+    * (for example, 1091317713u / 1034u gives 5209173 instead of 1055432).
-   nir_lower_idiv_fast,
+    *
-   /* This path is based on AMDGPUTargetLowering::LowerUDIVREM() and
+    * If false, a lowering based on AMDGPUTargetLowering::LowerUDIVREM() and
-    * AMDGPUTargetLowering::LowerSDIVREM(). It requires more instructions than
+    * AMDGPUTargetLowering::LowerSDIVREM() is used. It requires more
-    * the nv50 path and many of them are integer multiplications, so it is
+    * instructions than the nv50 path and many of them are integer
-    * probably slower. It should always return the correct result, though. */
+    * multiplications, so it is probably slower. It should always return the
-   nir_lower_idiv_precise,
+    * correct result, though.
-};
+    */
+   bool imprecise_32bit_lowering;
-bool nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path);
+   /* Whether 16-bit floating point arithmetic should be allowed in 8-bit
+    * division lowering
+    */
+   bool allow_fp16;
+} nir_lower_idiv_options;
+bool nir_lower_idiv(nir_shader *shader, const nir_lower_idiv_options *options);
 typedef struct nir_input_attachment_options {
   bool use_fragcoord_sysval;
--- a/src/compiler/nir/nir_lower_idiv.c
+++ b/src/compiler/nir/nir_lower_idiv.c
@@ -200,11 +200,12 @@ convert_instr_precise(nir_builder *bld, nir_op op,
 static nir_ssa_def *
 convert_instr_small(nir_builder *b, nir_op op,
-      nir_ssa_def *numer, nir_ssa_def *denom)
+      nir_ssa_def *numer, nir_ssa_def *denom,
+      const nir_lower_idiv_options *options)
 {
   unsigned sz = numer->bit_size;
   nir_alu_type int_type = nir_op_infos[op].output_type | sz;
-   nir_alu_type float_type = nir_type_float | (sz * 2);
+   nir_alu_type float_type = nir_type_float | (options->allow_fp16 ? sz * 2 : 32);
   nir_ssa_def *p = nir_type_convert(b, numer, int_type, float_type);
   nir_ssa_def *q = nir_type_convert(b, denom, int_type, float_type);
@@ -240,18 +241,18 @@ convert_instr_small(nir_builder *b, nir_op op,
 static nir_ssa_def *
 lower_idiv(nir_builder *b, nir_instr *instr, void *_data)
 {
-   enum nir_lower_idiv_path *path = _data;
+   const nir_lower_idiv_options *options = _data;
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   nir_ssa_def *numer = nir_ssa_for_alu_src(b, alu, 0);
   nir_ssa_def *denom = nir_ssa_for_alu_src(b, alu, 1);
   if (numer->bit_size < 32)
-      return convert_instr_small(b, alu->op, numer, denom);
+      return convert_instr_small(b, alu->op, numer, denom, options);
-   else if (*path == nir_lower_idiv_precise)
+   else if (options->imprecise_32bit_lowering)
-      return convert_instr_precise(b, alu->op, numer, denom);
-   else
      return convert_instr(b, alu->op, numer, denom);
+   else
+      return convert_instr_precise(b, alu->op, numer, denom);
 }
 static bool
@@ -278,10 +279,10 @@ inst_is_idiv(const nir_instr *instr, UNUSED const void *_state)
 }
 bool
-nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
+nir_lower_idiv(nir_shader *shader, const nir_lower_idiv_options *options)
 {
   return nir_shader_lower_instructions(shader,
         inst_is_idiv,
         lower_idiv,
-         &path);
+         (void *)options);
 }
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -320,7 +320,11 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 	/* do idiv lowering after first opt loop to get a chance to propagate
 	 * constants for divide by immed power-of-two:
 	 */
-	const bool idiv_progress = OPT(s, nir_lower_idiv, nir_lower_idiv_fast);
+	nir_lower_idiv_options idiv_options = {
+		.imprecise_32bit_lowering = true,
+		.allow_fp16 = true,
+	};
+	const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);
 	if (idiv_progress)
 		ir3_optimize_loop(s);
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
@@ -1113,7 +1113,11 @@ etna_compile_shader_nir(struct etna_shader_variant *v)
   NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_all, UINT32_MAX);
   NIR_PASS_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u });
   NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
-   NIR_PASS_V(s, nir_lower_idiv, nir_lower_idiv_fast);
+   nir_lower_idiv_options idiv_options = {
+      .imprecise_32bit_lowering = true,
+      .allow_fp16 = true,
+   };
+   NIR_PASS_V(s, nir_lower_idiv, &idiv_options);
   etna_optimize_loop(s);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -3139,7 +3139,11 @@ Converter::run()
   /*TODO: improve this lowering/optimisation loop so that we can use
    *      nir_opt_idiv_const effectively before this.
    */
-   NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_precise);
+   nir_lower_idiv_options idiv_options = {
+      .imprecise_32bit_lowering = false,
+      .allow_fp16 = true,
+   };
+   NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
   do {
      progress = false;
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -863,9 +863,11 @@ int r600_shader_from_nir(struct r600_context *rctx,
   NIR_PASS_V(sel->nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
-   NIR_PASS_V(sel->nir, nir_lower_idiv,
+   nir_lower_idiv_options idiv_options = {
-              sel->nir->info.stage == MESA_SHADER_COMPUTE ?
+      .imprecise_32bit_lowering = sel->nir->info.stage != MESA_SHADER_COMPUTE,
-                 nir_lower_idiv_precise : nir_lower_idiv_fast);
+      .allow_fp16 = true,
+   };
+   NIR_PASS_V(sel->nir, nir_lower_idiv, &idiv_options);
   NIR_PASS_V(sel->nir, r600_lower_alu);
   NIR_PASS_V(sel->nir, nir_lower_phis_to_scalar);
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2316,7 +2316,11 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
        NIR_PASS_V(c->s, vc4_nir_lower_io, c);
        NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
-        NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast);
+        nir_lower_idiv_options idiv_options = {
+                .imprecise_32bit_lowering = true,
+                .allow_fp16 = true,
+        };
+        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
        vc4_optimize_nir(c->s);
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -2834,7 +2834,11 @@ bi_optimize_nir(nir_shader *nir)
        NIR_PASS(progress, nir, nir_lower_int64);
-        NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast);
+        nir_lower_idiv_options idiv_options = {
+                .imprecise_32bit_lowering = true,
+                .allow_fp16 = true,
+        };
+        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -296,7 +296,11 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
                (nir->options->lower_flrp64 ? 64 : 0);
        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
-        NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast);
+        nir_lower_idiv_options idiv_options = {
+                .imprecise_32bit_lowering = true,
+                .allow_fp16 = true,
+        };
+        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
        nir_lower_tex_options lower_tex_options = {
                .lower_txs_lod = true,