nir: port fp16 casting code from dxil
This moves the dxil pass to common code and makes dxil use the new code. Acked-by: Adam Jackson <ajax@redhat.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Jesse Natalie <jenatali@microsoft.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9643>
This commit is contained in:
@@ -147,6 +147,7 @@ files_libnir = files(
|
|||||||
'nir_lower_fb_read.c',
|
'nir_lower_fb_read.c',
|
||||||
'nir_lower_flatshade.c',
|
'nir_lower_flatshade.c',
|
||||||
'nir_lower_flrp.c',
|
'nir_lower_flrp.c',
|
||||||
|
'nir_lower_fp16_conv.c',
|
||||||
'nir_lower_fragcoord_wtrans.c',
|
'nir_lower_fragcoord_wtrans.c',
|
||||||
'nir_lower_fragcolor.c',
|
'nir_lower_fragcolor.c',
|
||||||
'nir_lower_frexp.c',
|
'nir_lower_frexp.c',
|
||||||
|
@@ -5004,6 +5004,7 @@ bool nir_shader_uses_view_index(nir_shader *shader);
|
|||||||
bool nir_can_lower_multiview(nir_shader *shader);
|
bool nir_can_lower_multiview(nir_shader *shader);
|
||||||
bool nir_lower_multiview(nir_shader *shader, uint32_t view_mask);
|
bool nir_lower_multiview(nir_shader *shader, uint32_t view_mask);
|
||||||
|
|
||||||
|
bool nir_lower_fp16_casts(nir_shader *shader);
|
||||||
bool nir_normalize_cubemap_coords(nir_shader *shader);
|
bool nir_normalize_cubemap_coords(nir_shader *shader);
|
||||||
|
|
||||||
void nir_live_ssa_defs_impl(nir_function_impl *impl);
|
void nir_live_ssa_defs_impl(nir_function_impl *impl);
|
||||||
|
235
src/compiler/nir/nir_lower_fp16_conv.c
Normal file
235
src/compiler/nir/nir_lower_fp16_conv.c
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
/*
|
||||||
|
* Copyright © Microsoft Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "nir_builder.h"
|
||||||
|
|
||||||
|
/* The following float-to-half conversion routines are based on the "half" library:
|
||||||
|
* https://sourceforge.net/projects/half/
|
||||||
|
*
|
||||||
|
* half - IEEE 754-based half-precision floating-point library.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2012-2019 Christian Rau <rauy@users.sourceforge.net>
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
|
||||||
|
* modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||||
|
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*
|
||||||
|
* Version 2.1.0
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
lower_fp16_casts_filter(const nir_instr *instr, const void *data)
|
||||||
|
{
|
||||||
|
if (instr->type == nir_instr_type_alu) {
|
||||||
|
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
||||||
|
switch (alu->op) {
|
||||||
|
case nir_op_f2f16:
|
||||||
|
case nir_op_f2f16_rtne:
|
||||||
|
case nir_op_f2f16_rtz:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else if (instr->type == nir_instr_type_intrinsic) {
|
||||||
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||||
|
return intrin->intrinsic == nir_intrinsic_convert_alu_types &&
|
||||||
|
nir_intrinsic_dest_type(intrin) == nir_type_float16;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
half_rounded(nir_builder *b, nir_ssa_def *value, nir_ssa_def *guard, nir_ssa_def *sticky,
|
||||||
|
nir_ssa_def *sign, nir_rounding_mode mode)
|
||||||
|
{
|
||||||
|
switch (mode) {
|
||||||
|
case nir_rounding_mode_rtne:
|
||||||
|
return nir_iadd(b, value, nir_iand(b, guard, nir_ior(b, sticky, value)));
|
||||||
|
case nir_rounding_mode_ru:
|
||||||
|
sign = nir_ushr(b, sign, nir_imm_int(b, 31));
|
||||||
|
return nir_iadd(b, value, nir_iand(b, nir_inot(b, sign),
|
||||||
|
nir_ior(b, guard, sticky)));
|
||||||
|
case nir_rounding_mode_rd:
|
||||||
|
sign = nir_ushr(b, sign, nir_imm_int(b, 31));
|
||||||
|
return nir_iadd(b, value, nir_iand(b, sign,
|
||||||
|
nir_ior(b, guard, sticky)));
|
||||||
|
default:
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
float_to_half_impl(nir_builder *b, nir_ssa_def *src, nir_rounding_mode mode)
|
||||||
|
{
|
||||||
|
nir_ssa_def *f32infinity = nir_imm_int(b, 255 << 23);
|
||||||
|
nir_ssa_def *f16max = nir_imm_int(b, (127 + 16) << 23);
|
||||||
|
nir_ssa_def *sign = nir_iand(b, src, nir_imm_int(b, 0x80000000));
|
||||||
|
nir_ssa_def *one = nir_imm_int(b, 1);
|
||||||
|
|
||||||
|
nir_ssa_def *abs = nir_iand(b, src, nir_imm_int(b, 0x7FFFFFFF));
|
||||||
|
/* NaN or INF. For rtne, overflow also becomes INF, so combine the comparisons */
|
||||||
|
nir_push_if(b, nir_ige(b, abs, mode == nir_rounding_mode_rtne ? f16max : f32infinity));
|
||||||
|
nir_ssa_def *inf_nanfp16 = nir_bcsel(b,
|
||||||
|
nir_ilt(b, f32infinity, abs),
|
||||||
|
nir_imm_int(b, 0x7E00),
|
||||||
|
nir_imm_int(b, 0x7C00));
|
||||||
|
nir_push_else(b, NULL);
|
||||||
|
|
||||||
|
nir_ssa_def *overflowed_fp16 = NULL;
|
||||||
|
if (mode != nir_rounding_mode_rtne) {
|
||||||
|
/* Handle overflow */
|
||||||
|
nir_push_if(b, nir_ige(b, abs, f16max));
|
||||||
|
switch (mode) {
|
||||||
|
case nir_rounding_mode_rtz:
|
||||||
|
overflowed_fp16 = nir_imm_int(b, 0x7BFF);
|
||||||
|
break;
|
||||||
|
case nir_rounding_mode_ru:
|
||||||
|
/* Negative becomes max float, positive becomes inf */
|
||||||
|
overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7BFF), nir_imm_int(b, 0x7C00));
|
||||||
|
break;
|
||||||
|
case nir_rounding_mode_rd:
|
||||||
|
/* Negative becomes inf, positive becomes max float */
|
||||||
|
overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7C00), nir_imm_int(b, 0x7BFF));
|
||||||
|
break;
|
||||||
|
default: unreachable("Should've been handled already");
|
||||||
|
}
|
||||||
|
nir_push_else(b, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 113 << 23)));
|
||||||
|
|
||||||
|
/* FP16 will be normal */
|
||||||
|
nir_ssa_def *zero = nir_imm_int(b, 0);
|
||||||
|
nir_ssa_def *value = nir_ior(b,
|
||||||
|
nir_ishl(b,
|
||||||
|
nir_isub(b,
|
||||||
|
nir_ushr(b, abs, nir_imm_int(b, 23)),
|
||||||
|
nir_imm_int(b, 112)),
|
||||||
|
nir_imm_int(b, 10)),
|
||||||
|
nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 13)), nir_imm_int(b, 0x3FFF)));
|
||||||
|
nir_ssa_def *guard = nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 12)), one);
|
||||||
|
nir_ssa_def *sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, abs, nir_imm_int(b, 0xFFF)), zero), one, zero);
|
||||||
|
nir_ssa_def *normal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
|
||||||
|
|
||||||
|
nir_push_else(b, NULL);
|
||||||
|
nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 102 << 23)));
|
||||||
|
|
||||||
|
/* FP16 will be denormal */
|
||||||
|
nir_ssa_def *i = nir_isub(b, nir_imm_int(b, 125), nir_ushr(b, abs, nir_imm_int(b, 23)));
|
||||||
|
nir_ssa_def *masked = nir_ior(b, nir_iand(b, abs, nir_imm_int(b, 0x7FFFFF)), nir_imm_int(b, 0x800000));
|
||||||
|
value = nir_ushr(b, masked, nir_iadd(b, i, one));
|
||||||
|
guard = nir_iand(b, nir_ushr(b, masked, i), one);
|
||||||
|
sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, masked, nir_isub(b, nir_ishl(b, one, i), one)), zero), one, zero);
|
||||||
|
nir_ssa_def *denormal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
|
||||||
|
|
||||||
|
nir_push_else(b, NULL);
|
||||||
|
|
||||||
|
/* Handle underflow. Nonzero values need to shift up or down for round-up or round-down */
|
||||||
|
nir_ssa_def *underflowed_fp16 = zero;
|
||||||
|
if (mode == nir_rounding_mode_ru ||
|
||||||
|
mode == nir_rounding_mode_rd) {
|
||||||
|
nir_push_if(b, nir_i2b1(b, abs));
|
||||||
|
|
||||||
|
if (mode == nir_rounding_mode_ru)
|
||||||
|
underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), zero, one);
|
||||||
|
else
|
||||||
|
underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), one, zero);
|
||||||
|
|
||||||
|
nir_push_else(b, NULL);
|
||||||
|
nir_pop_if(b, NULL);
|
||||||
|
underflowed_fp16 = nir_if_phi(b, underflowed_fp16, zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_pop_if(b, NULL);
|
||||||
|
nir_ssa_def *underflowed_or_denorm_fp16 = nir_if_phi(b, denormal_fp16, underflowed_fp16);
|
||||||
|
|
||||||
|
nir_pop_if(b, NULL);
|
||||||
|
nir_ssa_def *finite_fp16 = nir_if_phi(b, normal_fp16, underflowed_or_denorm_fp16);
|
||||||
|
|
||||||
|
nir_ssa_def *finite_or_overflowed_fp16 = finite_fp16;
|
||||||
|
if (mode != nir_rounding_mode_rtne) {
|
||||||
|
nir_pop_if(b, NULL);
|
||||||
|
finite_or_overflowed_fp16 = nir_if_phi(b, overflowed_fp16, finite_fp16);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_pop_if(b, NULL);
|
||||||
|
nir_ssa_def *fp16 = nir_if_phi(b, inf_nanfp16, finite_or_overflowed_fp16);
|
||||||
|
|
||||||
|
return nir_u2u16(b, nir_ior(b, fp16, nir_ushr(b, sign, nir_imm_int(b, 16))));
|
||||||
|
}
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
lower_fp16_cast_impl(nir_builder *b, nir_instr *instr, void *data)
|
||||||
|
{
|
||||||
|
nir_ssa_def *src, *dst;
|
||||||
|
uint8_t *swizzle = NULL;
|
||||||
|
nir_rounding_mode mode = nir_rounding_mode_rtne;
|
||||||
|
|
||||||
|
if (instr->type == nir_instr_type_alu) {
|
||||||
|
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
||||||
|
src = alu->src[0].src.ssa;
|
||||||
|
swizzle = alu->src[0].swizzle;
|
||||||
|
dst = &alu->dest.dest.ssa;
|
||||||
|
assert(src->bit_size == 32);
|
||||||
|
switch (alu->op) {
|
||||||
|
case nir_op_f2f16:
|
||||||
|
case nir_op_f2f16_rtne:
|
||||||
|
break;
|
||||||
|
case nir_op_f2f16_rtz:
|
||||||
|
mode = nir_rounding_mode_rtz;
|
||||||
|
break;
|
||||||
|
default: unreachable("Should've been filtered");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||||
|
assert(nir_intrinsic_src_type(intrin) == nir_type_float32);
|
||||||
|
src = intrin->src[0].ssa;
|
||||||
|
dst = &intrin->dest.ssa;
|
||||||
|
mode = nir_intrinsic_rounding_mode(intrin);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_ssa_def *rets[NIR_MAX_VEC_COMPONENTS] = { NULL };
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < dst->num_components; i++) {
|
||||||
|
nir_ssa_def *comp = nir_channel(b, src, swizzle ? swizzle[i] : i);
|
||||||
|
rets[i] = float_to_half_impl(b, comp, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return nir_vec(b, rets, dst->num_components);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
nir_lower_fp16_casts(nir_shader *shader)
|
||||||
|
{
|
||||||
|
return nir_shader_lower_instructions(shader,
|
||||||
|
lower_fp16_casts_filter,
|
||||||
|
lower_fp16_cast_impl,
|
||||||
|
NULL);
|
||||||
|
}
|
@@ -1361,7 +1361,7 @@ clc_to_dxil(struct clc_context *ctx,
|
|||||||
NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil);
|
NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil);
|
||||||
NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs);
|
NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs);
|
||||||
NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil);
|
NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil);
|
||||||
NIR_PASS_V(nir, dxil_nir_lower_fp16_casts);
|
NIR_PASS_V(nir, nir_lower_fp16_casts);
|
||||||
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
|
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
|
||||||
|
|
||||||
// Convert pack to pack_split
|
// Convert pack to pack_split
|
||||||
|
@@ -1130,220 +1130,3 @@ dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
|
|||||||
|
|
||||||
return progress;
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The following float-to-half conversion routines are based on the "half" library:
|
|
||||||
* https://sourceforge.net/projects/half/
|
|
||||||
*
|
|
||||||
* half - IEEE 754-based half-precision floating-point library.
|
|
||||||
*
|
|
||||||
* Copyright (c) 2012-2019 Christian Rau <rauy@users.sourceforge.net>
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
|
|
||||||
* modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
||||||
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
||||||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
||||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*
|
|
||||||
* Version 2.1.0
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
static bool
|
|
||||||
lower_fp16_casts_filter(const nir_instr *instr, const void *data)
|
|
||||||
{
|
|
||||||
if (instr->type == nir_instr_type_alu) {
|
|
||||||
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
|
||||||
/* TODO: DXIL has instructions for f2f16_rtz. For CL, it's not precise enough
|
|
||||||
* due to denorm handling. If the f2f16 instruction has undef rounding mode,
|
|
||||||
* we could map that too, but for CL, f2f16 is implied to mean rtne.
|
|
||||||
*/
|
|
||||||
switch (alu->op) {
|
|
||||||
case nir_op_f2f16:
|
|
||||||
case nir_op_f2f16_rtne:
|
|
||||||
case nir_op_f2f16_rtz:
|
|
||||||
return true;
|
|
||||||
default:
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} else if (instr->type == nir_instr_type_intrinsic) {
|
|
||||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
||||||
return intrin->intrinsic == nir_intrinsic_convert_alu_types &&
|
|
||||||
nir_intrinsic_dest_type(intrin) == nir_type_float16;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static nir_ssa_def *
|
|
||||||
half_rounded(nir_builder *b, nir_ssa_def *value, nir_ssa_def *guard, nir_ssa_def *sticky,
|
|
||||||
nir_ssa_def *sign, nir_rounding_mode mode)
|
|
||||||
{
|
|
||||||
switch (mode) {
|
|
||||||
case nir_rounding_mode_rtne:
|
|
||||||
return nir_iadd(b, value, nir_iand(b, guard, nir_ior(b, sticky, value)));
|
|
||||||
case nir_rounding_mode_ru:
|
|
||||||
sign = nir_ushr(b, sign, nir_imm_int(b, 31));
|
|
||||||
return nir_iadd(b, value, nir_iand(b, nir_inot(b, sign),
|
|
||||||
nir_ior(b, guard, sticky)));
|
|
||||||
case nir_rounding_mode_rd:
|
|
||||||
sign = nir_ushr(b, sign, nir_imm_int(b, 31));
|
|
||||||
return nir_iadd(b, value, nir_iand(b, sign,
|
|
||||||
nir_ior(b, guard, sticky)));
|
|
||||||
default:
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static nir_ssa_def *
|
|
||||||
float_to_half_impl(nir_builder *b, nir_ssa_def *src, nir_rounding_mode mode)
|
|
||||||
{
|
|
||||||
nir_ssa_def *f32infinity = nir_imm_int(b, 255 << 23);
|
|
||||||
nir_ssa_def *f16max = nir_imm_int(b, (127 + 16) << 23);
|
|
||||||
nir_ssa_def *sign = nir_iand(b, src, nir_imm_int(b, 0x80000000));
|
|
||||||
nir_ssa_def *one = nir_imm_int(b, 1);
|
|
||||||
|
|
||||||
nir_ssa_def *abs = nir_iand(b, src, nir_imm_int(b, 0x7FFFFFFF));
|
|
||||||
/* NaN or INF. For rtne, overflow also becomes INF, so combine the comparisons */
|
|
||||||
nir_push_if(b, nir_ige(b, abs, mode == nir_rounding_mode_rtne ? f16max : f32infinity));
|
|
||||||
nir_ssa_def *inf_nanfp16 = nir_bcsel(b,
|
|
||||||
nir_ilt(b, f32infinity, abs),
|
|
||||||
nir_imm_int(b, 0x7E00),
|
|
||||||
nir_imm_int(b, 0x7C00));
|
|
||||||
nir_push_else(b, NULL);
|
|
||||||
|
|
||||||
nir_ssa_def *overflowed_fp16 = NULL;
|
|
||||||
if (mode != nir_rounding_mode_rtne) {
|
|
||||||
/* Handle overflow */
|
|
||||||
nir_push_if(b, nir_ige(b, abs, f16max));
|
|
||||||
switch (mode) {
|
|
||||||
case nir_rounding_mode_rtz:
|
|
||||||
overflowed_fp16 = nir_imm_int(b, 0x7BFF);
|
|
||||||
break;
|
|
||||||
case nir_rounding_mode_ru:
|
|
||||||
/* Negative becomes max float, positive becomes inf */
|
|
||||||
overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7BFF), nir_imm_int(b, 0x7C00));
|
|
||||||
break;
|
|
||||||
case nir_rounding_mode_rd:
|
|
||||||
/* Negative becomes inf, positive becomes max float */
|
|
||||||
overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7C00), nir_imm_int(b, 0x7BFF));
|
|
||||||
break;
|
|
||||||
default: unreachable("Should've been handled already");
|
|
||||||
}
|
|
||||||
nir_push_else(b, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 113 << 23)));
|
|
||||||
|
|
||||||
/* FP16 will be normal */
|
|
||||||
nir_ssa_def *zero = nir_imm_int(b, 0);
|
|
||||||
nir_ssa_def *value = nir_ior(b,
|
|
||||||
nir_ishl(b,
|
|
||||||
nir_isub(b,
|
|
||||||
nir_ushr(b, abs, nir_imm_int(b, 23)),
|
|
||||||
nir_imm_int(b, 112)),
|
|
||||||
nir_imm_int(b, 10)),
|
|
||||||
nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 13)), nir_imm_int(b, 0x3FFF)));
|
|
||||||
nir_ssa_def *guard = nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 12)), one);
|
|
||||||
nir_ssa_def *sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, abs, nir_imm_int(b, 0xFFF)), zero), one, zero);
|
|
||||||
nir_ssa_def *normal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
|
|
||||||
|
|
||||||
nir_push_else(b, NULL);
|
|
||||||
nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 102 << 23)));
|
|
||||||
|
|
||||||
/* FP16 will be denormal */
|
|
||||||
nir_ssa_def *i = nir_isub(b, nir_imm_int(b, 125), nir_ushr(b, abs, nir_imm_int(b, 23)));
|
|
||||||
nir_ssa_def *masked = nir_ior(b, nir_iand(b, abs, nir_imm_int(b, 0x7FFFFF)), nir_imm_int(b, 0x800000));
|
|
||||||
value = nir_ushr(b, masked, nir_iadd(b, i, one));
|
|
||||||
guard = nir_iand(b, nir_ushr(b, masked, i), one);
|
|
||||||
sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, masked, nir_isub(b, nir_ishl(b, one, i), one)), zero), one, zero);
|
|
||||||
nir_ssa_def *denormal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
|
|
||||||
|
|
||||||
nir_push_else(b, NULL);
|
|
||||||
|
|
||||||
/* Handle underflow. Nonzero values need to shift up or down for round-up or round-down */
|
|
||||||
nir_ssa_def *underflowed_fp16 = zero;
|
|
||||||
if (mode == nir_rounding_mode_ru ||
|
|
||||||
mode == nir_rounding_mode_rd) {
|
|
||||||
nir_push_if(b, nir_i2b1(b, abs));
|
|
||||||
|
|
||||||
if (mode == nir_rounding_mode_ru)
|
|
||||||
underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), zero, one);
|
|
||||||
else
|
|
||||||
underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), one, zero);
|
|
||||||
|
|
||||||
nir_push_else(b, NULL);
|
|
||||||
nir_pop_if(b, NULL);
|
|
||||||
underflowed_fp16 = nir_if_phi(b, underflowed_fp16, zero);
|
|
||||||
}
|
|
||||||
|
|
||||||
nir_pop_if(b, NULL);
|
|
||||||
nir_ssa_def *underflowed_or_denorm_fp16 = nir_if_phi(b, denormal_fp16, underflowed_fp16);
|
|
||||||
|
|
||||||
nir_pop_if(b, NULL);
|
|
||||||
nir_ssa_def *finite_fp16 = nir_if_phi(b, normal_fp16, underflowed_or_denorm_fp16);
|
|
||||||
|
|
||||||
nir_ssa_def *finite_or_overflowed_fp16 = finite_fp16;
|
|
||||||
if (mode != nir_rounding_mode_rtne) {
|
|
||||||
nir_pop_if(b, NULL);
|
|
||||||
finite_or_overflowed_fp16 = nir_if_phi(b, overflowed_fp16, finite_fp16);
|
|
||||||
}
|
|
||||||
|
|
||||||
nir_pop_if(b, NULL);
|
|
||||||
nir_ssa_def *fp16 = nir_if_phi(b, inf_nanfp16, finite_or_overflowed_fp16);
|
|
||||||
|
|
||||||
return nir_u2u16(b, nir_ior(b, fp16, nir_ushr(b, sign, nir_imm_int(b, 16))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static nir_ssa_def *
|
|
||||||
lower_fp16_cast_impl(nir_builder *b, nir_instr *instr, void *data)
|
|
||||||
{
|
|
||||||
nir_ssa_def *src, *dst;
|
|
||||||
uint8_t *swizzle = NULL;
|
|
||||||
nir_rounding_mode mode = nir_rounding_mode_rtne;
|
|
||||||
|
|
||||||
if (instr->type == nir_instr_type_alu) {
|
|
||||||
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
|
||||||
src = alu->src[0].src.ssa;
|
|
||||||
swizzle = alu->src[0].swizzle;
|
|
||||||
dst = &alu->dest.dest.ssa;
|
|
||||||
assert(src->bit_size == 32);
|
|
||||||
switch (alu->op) {
|
|
||||||
case nir_op_f2f16:
|
|
||||||
case nir_op_f2f16_rtne:
|
|
||||||
break;
|
|
||||||
case nir_op_f2f16_rtz:
|
|
||||||
mode = nir_rounding_mode_rtz;
|
|
||||||
break;
|
|
||||||
default: unreachable("Should've been filtered");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
||||||
assert(nir_intrinsic_src_type(intrin) == nir_type_float32);
|
|
||||||
src = intrin->src[0].ssa;
|
|
||||||
dst = &intrin->dest.ssa;
|
|
||||||
mode = nir_intrinsic_rounding_mode(intrin);
|
|
||||||
}
|
|
||||||
|
|
||||||
nir_ssa_def *rets[NIR_MAX_VEC_COMPONENTS] = { NULL };
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < dst->num_components; i++) {
|
|
||||||
nir_ssa_def *comp = nir_channel(b, src, swizzle ? swizzle[i] : i);
|
|
||||||
rets[i] = float_to_half_impl(b, comp, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
return nir_vec(b, rets, dst->num_components);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
dxil_nir_lower_fp16_casts(nir_shader *shader)
|
|
||||||
{
|
|
||||||
return nir_shader_lower_instructions(shader,
|
|
||||||
lower_fp16_casts_filter,
|
|
||||||
lower_fp16_cast_impl,
|
|
||||||
NULL);
|
|
||||||
}
|
|
||||||
|
Reference in New Issue
Block a user