nir/lower_idiv: Remove imprecise_32bit_lowering
NIR has two implementations of lower_idiv, keyed on the imprecise_32bit_lowering flag. This flag is misleading: the results when setting this flag "imprecise", they're completely wrong for some values. If a backend has a native implementation of umul_high, the correct path isn't that much more expensive. If it doesn't, it's substantially slower for highp integer divison... but in practice, non-constant highp integer division is pretty rare. After a painful migration of the tree, this code path has no more users. Remove it so nobody else gets the bright idea of using it again. Closes: #6555 Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Reviewed-by: Emma Anholt <emma@anholt.net> Reviewed-by: Christian Gmeiner <christian.gmeiner@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19303>
This commit is contained in:

committed by
Marge Bot

parent
37bbcc2e4a
commit
941c37c085
@@ -3875,7 +3875,6 @@ radv_postprocess_nir(struct radv_pipeline *pipeline,
|
|||||||
|
|
||||||
NIR_PASS(_, stage->nir, nir_lower_idiv,
|
NIR_PASS(_, stage->nir, nir_lower_idiv,
|
||||||
&(nir_lower_idiv_options){
|
&(nir_lower_idiv_options){
|
||||||
.imprecise_32bit_lowering = false,
|
|
||||||
.allow_fp16 = gfx_level >= GFX9,
|
.allow_fp16 = gfx_level >= GFX9,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@@ -5254,18 +5254,6 @@ bool nir_lower_non_uniform_access(nir_shader *shader,
|
|||||||
const nir_lower_non_uniform_access_options *options);
|
const nir_lower_non_uniform_access_options *options);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* If true, a 32-bit division lowering based on NV50LegalizeSSA::handleDIV()
|
|
||||||
* is used. It is the faster of the two but it is not exact in some cases
|
|
||||||
* (for example, 1091317713u / 1034u gives 5209173 instead of 1055432).
|
|
||||||
*
|
|
||||||
* If false, a lowering based on AMDGPUTargetLowering::LowerUDIVREM() and
|
|
||||||
* AMDGPUTargetLowering::LowerSDIVREM() is used. It requires more
|
|
||||||
* instructions than the nv50 path and many of them are integer
|
|
||||||
* multiplications, so it is probably slower. It should always return the
|
|
||||||
* correct result, though.
|
|
||||||
*/
|
|
||||||
bool imprecise_32bit_lowering;
|
|
||||||
|
|
||||||
/* Whether 16-bit floating point arithmetic should be allowed in 8-bit
|
/* Whether 16-bit floating point arithmetic should be allowed in 8-bit
|
||||||
* division lowering
|
* division lowering
|
||||||
*/
|
*/
|
||||||
|
@@ -27,98 +27,6 @@
|
|||||||
#include "nir.h"
|
#include "nir.h"
|
||||||
#include "nir_builder.h"
|
#include "nir_builder.h"
|
||||||
|
|
||||||
/* Has two paths
|
|
||||||
* One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on
|
|
||||||
* NV50LegalizeSSA::handleDIV()
|
|
||||||
*
|
|
||||||
* Note that this path probably does not have not enough precision for
|
|
||||||
* compute shaders. Perhaps we want a second higher precision (looping)
|
|
||||||
* version of this? Or perhaps we assume if you can do compute shaders you
|
|
||||||
* can also branch out to a pre-optimized shader library routine..
|
|
||||||
*
|
|
||||||
* The other path (nir_lower_idiv_precise) is based off of code used by LLVM's
|
|
||||||
* AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
|
|
||||||
*/
|
|
||||||
|
|
||||||
static nir_ssa_def *
|
|
||||||
convert_instr(nir_builder *bld, nir_op op,
|
|
||||||
nir_ssa_def *numer, nir_ssa_def *denom)
|
|
||||||
{
|
|
||||||
nir_ssa_def *af, *bf, *a, *b, *q, *r, *rt;
|
|
||||||
bool is_signed;
|
|
||||||
|
|
||||||
is_signed = (op == nir_op_idiv ||
|
|
||||||
op == nir_op_imod ||
|
|
||||||
op == nir_op_irem);
|
|
||||||
|
|
||||||
if (is_signed) {
|
|
||||||
af = nir_i2f32(bld, numer);
|
|
||||||
bf = nir_i2f32(bld, denom);
|
|
||||||
af = nir_fabs(bld, af);
|
|
||||||
bf = nir_fabs(bld, bf);
|
|
||||||
a = nir_iabs(bld, numer);
|
|
||||||
b = nir_iabs(bld, denom);
|
|
||||||
} else {
|
|
||||||
af = nir_u2f32(bld, numer);
|
|
||||||
bf = nir_u2f32(bld, denom);
|
|
||||||
a = numer;
|
|
||||||
b = denom;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get first result: */
|
|
||||||
bf = nir_frcp(bld, bf);
|
|
||||||
bf = nir_isub(bld, bf, nir_imm_int(bld, 2)); /* yes, really */
|
|
||||||
q = nir_fmul(bld, af, bf);
|
|
||||||
|
|
||||||
if (is_signed) {
|
|
||||||
q = nir_f2i32(bld, q);
|
|
||||||
} else {
|
|
||||||
q = nir_f2u32(bld, q);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get error of first result: */
|
|
||||||
r = nir_imul(bld, q, b);
|
|
||||||
r = nir_isub(bld, a, r);
|
|
||||||
r = nir_u2f32(bld, r);
|
|
||||||
r = nir_fmul(bld, r, bf);
|
|
||||||
r = nir_f2u32(bld, r);
|
|
||||||
|
|
||||||
/* add quotients: */
|
|
||||||
q = nir_iadd(bld, q, r);
|
|
||||||
|
|
||||||
/* correction: if modulus >= divisor, add 1 */
|
|
||||||
r = nir_imul(bld, q, b);
|
|
||||||
r = nir_isub(bld, a, r);
|
|
||||||
rt = nir_uge(bld, r, b);
|
|
||||||
|
|
||||||
if (op == nir_op_umod) {
|
|
||||||
q = nir_bcsel(bld, rt, nir_isub(bld, r, b), r);
|
|
||||||
} else {
|
|
||||||
r = nir_b2i32(bld, rt);
|
|
||||||
|
|
||||||
q = nir_iadd(bld, q, r);
|
|
||||||
if (is_signed) {
|
|
||||||
/* fix the sign: */
|
|
||||||
r = nir_ixor(bld, numer, denom);
|
|
||||||
r = nir_ilt(bld, r, nir_imm_int(bld, 0));
|
|
||||||
b = nir_ineg(bld, q);
|
|
||||||
q = nir_bcsel(bld, r, b, q);
|
|
||||||
|
|
||||||
if (op == nir_op_imod || op == nir_op_irem) {
|
|
||||||
q = nir_imul(bld, q, denom);
|
|
||||||
q = nir_isub(bld, numer, q);
|
|
||||||
if (op == nir_op_imod) {
|
|
||||||
q = nir_bcsel(bld, nir_ieq_imm(bld, q, 0),
|
|
||||||
nir_imm_int(bld, 0),
|
|
||||||
nir_bcsel(bld, r, nir_iadd(bld, q, denom), q));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */
|
/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */
|
||||||
static nir_ssa_def *
|
static nir_ssa_def *
|
||||||
emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
|
emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
|
||||||
@@ -245,8 +153,6 @@ lower_idiv(nir_builder *b, nir_instr *instr, void *_data)
|
|||||||
|
|
||||||
if (numer->bit_size < 32)
|
if (numer->bit_size < 32)
|
||||||
return convert_instr_small(b, alu->op, numer, denom, options);
|
return convert_instr_small(b, alu->op, numer, denom, options);
|
||||||
else if (options->imprecise_32bit_lowering)
|
|
||||||
return convert_instr(b, alu->op, numer, denom);
|
|
||||||
else
|
else
|
||||||
return convert_instr_precise(b, alu->op, numer, denom);
|
return convert_instr_precise(b, alu->op, numer, denom);
|
||||||
}
|
}
|
||||||
|
@@ -1254,7 +1254,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||||||
/* Lower integer division by constants before nir_lower_idiv. */
|
/* Lower integer division by constants before nir_lower_idiv. */
|
||||||
OPT(nir_opt_idiv_const, 32);
|
OPT(nir_opt_idiv_const, 32);
|
||||||
const nir_lower_idiv_options options = {
|
const nir_lower_idiv_options options = {
|
||||||
.imprecise_32bit_lowering = false,
|
|
||||||
.allow_fp16 = false
|
.allow_fp16 = false
|
||||||
};
|
};
|
||||||
OPT(nir_lower_idiv, &options);
|
OPT(nir_lower_idiv, &options);
|
||||||
|
@@ -3356,7 +3356,6 @@ Converter::run()
|
|||||||
* nir_opt_idiv_const effectively before this.
|
* nir_opt_idiv_const effectively before this.
|
||||||
*/
|
*/
|
||||||
nir_lower_idiv_options idiv_options = {
|
nir_lower_idiv_options idiv_options = {
|
||||||
.imprecise_32bit_lowering = false,
|
|
||||||
.allow_fp16 = true,
|
.allow_fp16 = true,
|
||||||
};
|
};
|
||||||
NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
|
NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
|
||||||
|
Reference in New Issue
Block a user