From a257e2daad983204abf9ba47856f9ace0bc79b05 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Thu, 10 Aug 2023 14:12:37 -0400
Subject: [PATCH] nir: Lower fquantize2f16

Passes dEQP-VK.spirv_assembly.*opquantize*.

Unlike the DXIL lowering, this should correctly handle NaNs. (I believe Dozen has
a bug here that is masked by running constant folding early and poor CTS
coverage.) It is also faster than the DXIL lowering for hardware that supports
f2f16 conversions natively. It is not as good as a backend implementation that
could flush-to-zero in hardware... but for a debug instruction it should be more
than good enough.

It might be slightly better to multiply with 0.0 to get the appropriate zero,
but NIR really likes optimizing that out ...

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24616>
---
 src/compiler/nir/nir.h                |  3 +++
 src/compiler/nir/nir_opt_algebraic.py | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 6b5414dca49..31bc0f68518 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3847,6 +3847,9 @@ typedef struct nir_shader_compiler_options {
     *  type casts (e.g. f2f16).
     */
    bool preserve_mediump;
+
+   /** lowers fquantize2f16 to alu ops. */
+   bool lower_fquantize2f16;
 } nir_shader_compiler_options;
 
 typedef struct nir_shader {
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 374ddb68126..a430aa1bca1 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -2729,6 +2729,21 @@ optimizations.extend([
     (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
     ])
 
+
+"""
+  if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16)
+     return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */;
+  else
+     return f2f32(f2f16(val));
+"""
+optimizations.extend([
+    (('fquantize2f16', 'a@32'),
+     ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
+               ('iand', a, 1 << 31),
+               ('!f2f32', ('!f2f16_rtne', a))),
+     'options->lower_fquantize2f16')
+    ])
+
 for s in range(0, 31):
     mask = 0xffffffff << s