Files
third_party_mesa3d/src/compiler/glsl/astc_decoder.glsl
Tapani Pälli 977bc760fa mesa: add astc decoder shader template (glsl es version)
This shader originates from the Granite 3D engine and has been adapted
for use with OpenGL, including some GLSL ES specifics.

GLSL ES adaptation:

- remove Vulkan specifics: EXT_samplerless_texture_functions usage,
  specialization constants, push constant usage
- inline bitextract.h
- always DECODE_8BIT and hardcode error color (for now)
- port to GLSL ES, required some type changes, explicit type
  conversions and setting up precisions for types

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Nanley Chery <nanley.g.chery@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19886>
2023-02-17 07:57:12 +00:00

1308 lines
39 KiB
GLSL

#version 320 es
/* Default precision qualifiers: highp is requested for floats, ints and
 * every sampler/image type named here. */
precision highp float;
precision highp int;
precision highp usamplerBuffer;
precision highp usampler2D;
precision highp image2D;
precision highp uimage2D;
/* Copyright (c) 2020-2022 Hans-Kristian Arntzen
* Copyright (c) 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* Work-group size. The x and y sizes are printf-style placeholders that have
 * to be substituted before this template is compiled (presumably with the
 * ASTC block width and height - confirm against the caller). z is fixed at 4;
 * build_coord() uses the z invocation index to pick one block out of a 2x2
 * group of blocks. */
layout(local_size_x = %u, local_size_y = %u, local_size_z = 4) in;
/* Map the texture type names used by the declarations below onto GLSL ES
 * sampler types. */
#define utextureBuffer usamplerBuffer
#define utexture2D usampler2D
/* Lookup tables, fetched with texelFetch() by the decode functions below. */
layout(binding = 0) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer;
layout(binding = 1) uniform utextureBuffer LUTEndpointUnquantize;
layout(binding = 2) uniform utextureBuffer LUTWeightQuantizer;
layout(binding = 3) uniform utextureBuffer LUTWeightUnquantize;
layout(binding = 4) uniform utextureBuffer LUTTritQuintDecode;
layout(binding = 5) uniform utexture2D LUTPartitionTable;
/* Compressed input: main() fetches one uvec4 (128 bits) per block coordinate. */
layout(binding = 6) uniform utexture2D PayloadInput;
/* Decoded output, written with imageStore(). */
layout(rgba8ui, binding = 7) writeonly uniform uimage2D OutputImage;
/* Always decode to 8 bits per channel. */
const bool DECODE_8BIT = true;
/* Values of the decode_mode variable produced by the endpoint decoders. */
const int MODE_LDR = 0;
const int MODE_HDR = 1;
const int MODE_HDR_LDR_ALPHA = 2;
/* Colour stored by emit_decode_error() for blocks that fail to decode. */
const uvec4 error_color = uvec4(255, 0, 255, 255);
/* bitextract.h */
/* Read `bits` bits starting at bit `offset` of the 128-bit payload
 * (component x holds bits 0..31, y bits 32..63, and so on) and return them
 * zero-extended. A non-positive `bits` yields 0. */
int extract_bits(uvec4 payload, int offset, int bits)
{
    if (bits <= 0)
        return 0;

    int word = offset >> 5;
    int shift = offset & 31;
    int end_word = (offset + bits - 1) >> 5;

    if (end_word == word)
        return int(bitfieldExtract(payload[word], shift, bits));

    // The field straddles two 32-bit words: take the top of the first word,
    // then append the remaining bits from the bottom of the next one.
    int low_count = 32 - shift;
    int low = int(bitfieldExtract(payload[word], shift, low_count));
    int high = int(bitfieldExtract(payload[word + 1], 0, bits - low_count));
    return low | (high << low_count);
}
/* bitextract.h */
/* Same as extract_bits(), but the extracted field is sign-extended from its
 * most significant bit. A non-positive `bits` yields 0. */
int extract_bits_sign(uvec4 payload, int offset, int bits)
{
    if (bits <= 0)
        return 0;

    int word = offset >> 5;
    int shift = offset & 31;
    int end_word = (offset + bits - 1) >> 5;

    if (end_word == word)
        return bitfieldExtract(int(payload[word]), shift, bits);

    // Split field: the low part is taken unsigned, only the high part (which
    // carries the sign bit) uses the sign-extending extract.
    int low_count = 32 - shift;
    int low = int(bitfieldExtract(payload[word], shift, low_count));
    int high = bitfieldExtract(int(payload[word + 1]), 0, bits - low_count);
    return low | (high << low_count);
}
/* bitextract.h */
/* Read `bits` bits starting at bit `offset` of the payload and return them
 * with their bit order reversed (the first bit read becomes the most
 * significant bit of the result). A non-positive `bits` yields 0. */
int extract_bits_reverse(uvec4 payload, int offset, int bits)
{
    if (bits <= 0)
        return 0;

    int word = offset >> 5;
    int shift = offset & 31;
    int end_word = (offset + bits - 1) >> 5;

    uint field;
    if (end_word == word)
    {
        field = bitfieldExtract(payload[word], shift, bits);
    }
    else
    {
        // Field straddles two words: stitch both halves together first.
        int low_count = 32 - shift;
        uint low = bitfieldExtract(payload[word], shift, low_count);
        uint high = bitfieldExtract(payload[word + 1], 0, bits - low_count);
        field = low | (high << low_count);
    }

    // Reverse all 32 bits, then shift the field back down to bit 0.
    return int(bitfieldReverse(field) >> (32 - bits));
}
/* Exchange the values of a and b. */
void swap(inout int a, inout int b)
{
    int old_b = b;
    b = a;
    a = old_b;
}
/* Compute the coordinates handled by this invocation.
 * Returns ivec4(texel.xy, block.xy): xy is the output texel coordinate and zw
 * is the coordinate of the compressed block it belongs to. Each work group
 * covers a 2x2 group of blocks; the z invocation index selects the block
 * (bit 0 = x offset, bit 1 = y offset). */
ivec4 build_coord()
{
    int sub_block = int(gl_LocalInvocationID.z);
    ivec2 sub_offset = ivec2(sub_block & 1, (sub_block >> 1) & 1);
    ivec2 block = ivec2(gl_WorkGroupID.xy) * 2 + sub_offset;
    ivec2 texel = block * ivec2(gl_WorkGroupSize.xy) + ivec2(gl_LocalInvocationID.xy);
    return ivec4(texel, block);
}
/* Expand both endpoints to 16-bit range according to decode_mode and blend
 * them per channel with the 0..64 weights:
 * (lo * (64 - w) + hi * w + 32) >> 6. */
ivec4 interpolate_endpoint(ivec4 ep0, ivec4 ep1, ivec4 weight, int decode_mode)
{
    ivec4 lo = ep0;
    ivec4 hi = ep1;

    if (decode_mode == MODE_HDR)
    {
        lo = lo << 4;
        hi = hi << 4;
    }
    else if (decode_mode == MODE_HDR_LDR_ALPHA)
    {
        // HDR colour channels, LDR alpha.
        lo = ivec4(lo.rgb << 4, lo.a * 0x101);
        hi = ivec4(hi.rgb << 4, hi.a * 0x101);
    }
    else if (DECODE_8BIT)
    {
        // This isn't quite right in all cases.
        // In normal ASTC with sRGB, the alpha channel is supposed to be
        // decoded as FP16, even when the color components are sRGB 8-bit.
        // This is correct if decode_unorm8 mode is used though; for sanity
        // we assume unorm8 decoding mode is implied when using sRGB.
        lo = (lo << 8) | ivec4(0x80);
        hi = (hi << 8) | ivec4(0x80);
    }
    else
    {
        lo = lo * 0x101;
        hi = hi * 0x101;
    }

    return (lo * (64 - weight) + hi * weight + 32) >> 6;
}
/* Component-wise logical OR of two boolean vectors. */
bvec4 bvec_or(bvec4 a, bvec4 b)
{
    return bvec4(a.x || b.x, a.y || b.y, a.z || b.z, a.w || b.w);
}
/* Convert one decoded 16-bit value to FP16 bits.
 * ASTC has a very peculiar way of doing this: 0xffff maps to 1.0 (0x3c00),
 * everything else is color / 0x10000 rounded down to FP16. */
uint round_down_quantize_fp16(int color)
{
    if (color == 0xffff)
        return 0x3c00u;

    int top_bit = findMSB(color);
    int exponent = top_bit - 1;

    // Exponent below 1: emit the value directly in the mantissa field.
    if (exponent < 1)
        return uint(color << 8);

    int mantissa = ((color << 10) >> top_bit) & 0x3ff;
    return uint(mantissa | (exponent << 10));
}
/* Vector form of the conversion above: each channel of `color` is turned
 * into FP16 bits. 0xffff maps to 1.0 (0x3c00), everything else is
 * color / 0x10000 rounded down to FP16. */
uvec4 round_down_quantize_fp16(ivec4 color)
{
    ivec4 top_bit = findMSB(color);
    ivec4 exponent = top_bit - 1;
    ivec4 mantissa = ((color << 10) >> top_bit) & 0x3ff;

    uvec4 normal = uvec4(mantissa | (exponent << 10));
    uvec4 small = uvec4(color << 8);

    // Channels whose exponent is below 1 use the direct encoding instead.
    uvec4 result = mix(normal, small, lessThan(exponent, ivec4(1)));
    return mix(result, uvec4(0x3c00), equal(color, ivec4(0xffff)));
}
/* Turn an interpolated 16-bit colour into FP16 bit patterns.
 * LDR values go through round_down_quantize_fp16(); HDR values are treated
 * as an 5-bit exponent plus 11-bit mantissa and remapped piecewise. In
 * MODE_HDR_LDR_ALPHA the alpha channel uses the LDR conversion. */
uvec4 decode_fp16(ivec4 color, int decode_mode)
{
    if (decode_mode == MODE_LDR)
        return round_down_quantize_fp16(color);

    // Interpret the value as FP16, but with some extra fixups along the way
    // to make the interpolation more logarithmic (apparently).
    ivec4 exponent = color >> 11;
    ivec4 mantissa = color & 0x7ff;

    ivec4 low_segment = 3 * mantissa;
    ivec4 mid_segment = 4 * mantissa - 512;
    ivec4 high_segment = 5 * mantissa - 2048;
    ivec4 mt = mix(mid_segment, low_segment, lessThan(mantissa, ivec4(512)));
    mt = mix(mt, high_segment, greaterThanEqual(mantissa, ivec4(1536)));

    ivec4 fp16 = (exponent << 10) + (mt >> 3);

    // +Inf or NaN are decoded to 0x7bff (max finite value).
    bvec4 above_inf = greaterThan(fp16 & 0x7fff, ivec4(0x7c00));
    bvec4 is_inf = equal(fp16, ivec4(0x7c00));
    fp16 = mix(fp16, ivec4(0x7bff), bvec_or(above_inf, is_inf));

    if (decode_mode == MODE_HDR_LDR_ALPHA)
        fp16.a = int(round_down_quantize_fp16(color.a));

    return uvec4(fp16);
}
/* Block configuration produced by decode_block_mode(). */
struct BlockMode
{
// Dimensions of the stored weight grid.
ivec2 weight_grid_size;
// Index used to fetch the weight quantizer from LUTWeightQuantizer.
int weight_mode_index;
// Number of partitions, 1..4.
int num_partitions;
// Partition seed; main() uses it to address LUTPartitionTable.
int seed;
// Raw colour endpoint mode field (4 bits for one partition, 6 bits otherwise).
int cem;
// Number of configuration bits in the block (includes the 2 CCS bits when dual_plane).
int config_bits;
// Bit offset at which main() starts decoding endpoint data.
int primary_config_bits;
// True when the block carries two weight planes.
bool dual_plane;
// True for void-extent (constant colour) blocks; other fields are then not filled in.
bool void_extent;
};
/* Set to true by any decode stage that detects an invalid block; checked via CHECK_DECODE_ERROR(). */
bool decode_error = false;
/* Decode the block-mode header held in the low bits of payload.x.
 * Fills in the weight grid size, weight quantizer index, partition count,
 * seed, colour endpoint mode field and configuration bit counts.
 * Sets the global decode_error for reserved encodings, for weight grids
 * larger than the work-group size, and for dual-plane blocks with more than
 * three partitions. For void-extent blocks only mode.void_extent is set. */
BlockMode decode_block_mode(uvec4 payload)
{
BlockMode mode;
// Void-extent blocks are identified by the low 9 bits being 0x1fc.
mode.void_extent = (payload.x & 0x1ffu) == 0x1fcu;
if (mode.void_extent)
return mode;
// Bit 10 selects dual-plane mode.
mode.dual_plane = (payload.x & (1u << 10u)) != 0u;
// Bits 2..3 and bits 0..1 select the layout of the remaining mode bits.
uint higher = (payload.x >> 2u) & 3u;
uint lower = payload.x & 3u;
if (lower != 0u)
{
// Weight mode index: bit 4 -> bit 0, bits 0..1 -> bits 1..2, bit 9 -> bit 3.
mode.weight_mode_index = int((payload.x >> 4u) & 1u);
mode.weight_mode_index |= int((payload.x << 1u) & 6u);
mode.weight_mode_index |= int((payload.x >> 6u) & 8u);
if (higher < 2u)
{
mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 2) + 4u + 4u * higher);
mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u);
}
else if (higher == 2u)
{
mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u);
mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 2) + 8u);
}
else
{
// higher == 3: bit 8 picks between the two remaining layouts.
if ((payload.x & (1u << 8u)) != 0u)
{
mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 1) + 2u);
mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u);
}
else
{
mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u);
mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 1) + 6u);
}
}
}
else
{
// Low two bits are zero: the layout is selected by bits 7..8 instead.
int p3 = int(bitfieldExtract(payload.x, 9, 1));
int hi = int(bitfieldExtract(payload.x, 7, 2));
int lo = int(bitfieldExtract(payload.x, 5, 2));
if (hi == 0)
{
mode.weight_grid_size.x = 12;
mode.weight_grid_size.y = lo + 2;
}
else if (hi == 1)
{
mode.weight_grid_size.x = lo + 2;
mode.weight_grid_size.y = 12;
}
else if (hi == 2)
{
// In this layout bits 9..10 hold part of the grid height, so dual-plane
// and the top weight-mode bit are forced off.
mode.dual_plane = false;
p3 = 0;
mode.weight_grid_size.x = lo + 6;
mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 9, 2) + 6u);
}
else
{
if (lo == 0)
mode.weight_grid_size = ivec2(6, 10);
else if (lo == 1)
mode.weight_grid_size = ivec2(10, 6);
else
decode_error = true;
}
// Weight mode index bits come from bits 4, 2, 3 and (unless cleared above) 9.
int p0 = int(bitfieldExtract(payload.x, 4, 1));
int p1 = int(bitfieldExtract(payload.x, 2, 1));
int p2 = int(bitfieldExtract(payload.x, 3, 1));
mode.weight_mode_index = p0 + (p1 << 1) + (p2 << 2) + (p3 << 3);
}
// 11 bits for block mode.
// 2 bits for partition select
// If partitions > 1:
// 4 bits CEM selector
// If dual_plane:
// 2 bits of CCS
// else:
// 10 for partition seed
// 2 bits for CEM main selector
// If CEM[1:0] = 00:
// 4 bits for CEM extra selector if all same type.
// else:
// (1 + 2) * num_partitions if different types.
// First 4 bits are encoded next to CEM[1:0], otherwise, packed before weights.
// If dual_plane:
// 2 bits of CCS before extra CEM bits.
const int CONFIG_BITS_BLOCK = 11;
const int CONFIG_BITS_PARTITION_MODE = 2;
const int CONFIG_BITS_SEED = 10;
const int CONFIG_BITS_PRIMARY_MULTI_CEM = 2;
const int CONFIG_BITS_CEM = 4;
const int CONFIG_BITS_EXTRA_CEM_PER_PARTITION = 3;
const int CONFIG_BITS_CCS = 2;
// The partition field stores the partition count minus one.
mode.num_partitions = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK, CONFIG_BITS_PARTITION_MODE)) + 1;
if (mode.num_partitions > 1)
{
mode.seed = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_SEED));
// 6-bit CEM field: 2 selector bits followed by 4 more bits.
mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED,
CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM));
}
else
mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_CEM));
int config_bits;
if (mode.num_partitions > 1)
{
// Low two CEM bits equal to zero means every partition shares one endpoint mode.
bool single_cem = (mode.cem & 3) == 0;
if (single_cem)
{
config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE +
CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM;
}
else
{
config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE +
CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM +
CONFIG_BITS_EXTRA_CEM_PER_PARTITION * mode.num_partitions;
}
}
else
{
config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_CEM;
}
// Other config bits are packed before the weights.
int primary_config_bits;
if (mode.num_partitions > 1)
{
primary_config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED +
CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM;
}
else
primary_config_bits = config_bits;
if (mode.dual_plane)
config_bits += CONFIG_BITS_CCS;
// This is not allowed.
if (any(greaterThan(mode.weight_grid_size, ivec2(gl_WorkGroupSize.xy))))
decode_error = true;
if (mode.dual_plane && mode.num_partitions > 3)
decode_error = true;
mode.config_bits = config_bits;
mode.primary_config_bits = primary_config_bits;
return mode;
}
/* Approximate floor(v / 3) with a 16.16 fixed-point reciprocal multiply. */
int idiv3_floor(int v)
{
    int scaled = v * 0x5556;
    return scaled >> 16;
}
/* Approximate ceil(v / 3): the floor division by 3 of (v + 2), using the
 * same 16.16 fixed-point reciprocal multiply as idiv3_floor(). */
int idiv3_ceil(int v)
{
    int biased = v + 2;
    return (biased * 0x5556) >> 16;
}
/* Approximate floor(v / 5) with a 16.16 fixed-point reciprocal multiply. */
int idiv5_floor(int v)
{
    int scaled = v * 0x3334;
    return scaled >> 16;
}
/* Approximate ceil(v / 5): the floor division by 5 of (v + 4), using the
 * same 16.16 fixed-point reciprocal multiply as idiv5_floor(). */
int idiv5_ceil(int v)
{
    int biased = v + 4;
    return (biased * 0x3334) >> 16;
}
/* Build a 128-bit mask (one 32-bit word per component, x lowest) with the
 * low `bits` bits set. */
uvec4 build_bitmask(int bits)
{
    // Number of mask bits that fall into each word, limited to 0..31 so the
    // shift stays in range; completely covered words are patched in below.
    ivec4 word_base = ivec4(0, 32, 64, 96);
    ivec4 shift = clamp(ivec4(bits) - word_base, ivec4(0), ivec4(31));
    uvec4 partial = (uvec4(1) << shift) - uvec4(1);

    bvec4 word_full = greaterThanEqual(uvec4(bits), uvec4(32, 64, 96, 128));
    return mix(partial, uvec4(0xffffffffu), word_full);
}
/* Decode element `index` of an integer sequence that starts at bit
 * `start_bit` of the payload.
 * quant.x is the number of plain bits per element; a non-zero quant.y selects
 * trit packing (groups of 5 elements sharing 8 extra bits), otherwise a
 * non-zero quant.z selects quint packing (groups of 3 elements sharing 7
 * extra bits). With neither set, elements are plain quant.x-bit fields. */
int decode_integer_sequence(uvec4 payload, int start_bit, int index, ivec3 quant)
{
    int bits = quant.x;

    if (quant.y != 0)
    {
        // Trit-decoding.
        int group = idiv5_floor(index);
        int slot = index - 5 * group;
        int base = start_bit + group * (5 * bits + 8);

        // Gather the 8 shared bits, which are interleaved with the plain bits.
        int packed = extract_bits(payload, base + bits, 2);
        packed |= extract_bits(payload, base + 2 * bits + 2, 2) << 2;
        packed |= extract_bits(payload, base + 3 * bits + 4, 1) << 4;
        packed |= extract_bits(payload, base + 4 * bits + 5, 2) << 5;
        packed |= extract_bits(payload, base + 5 * bits + 7, 1) << 7;

        // The table entry holds one 3-bit field per element of the group.
        int trit = (int(texelFetch(LUTTritQuintDecode, packed).x) >> (3 * slot)) & 7;
        if (bits == 0)
            return trit;

        int low_offset = base + slot * bits + idiv5_ceil(slot * 8);
        return (trit << bits) | extract_bits(payload, low_offset, bits);
    }

    if (quant.z != 0)
    {
        // Quint-decoding
        int group = idiv3_floor(index);
        int slot = index - 3 * group;
        int base = start_bit + group * (3 * bits + 7);

        int packed = extract_bits(payload, base + bits, 3);
        packed |= extract_bits(payload, base + 2 * bits + 3, 2) << 3;
        packed |= extract_bits(payload, base + 3 * bits + 5, 2) << 5;

        // Quint entries follow the 256 trit entries in the same table.
        int quint = (int(texelFetch(LUTTritQuintDecode, 256 + packed).x) >> (3 * slot)) & 7;
        if (bits == 0)
            return quint;

        int low_offset = base + slot * bits + idiv3_ceil(slot * 7);
        return (quint << bits) | extract_bits(payload, low_offset, bits);
    }

    // Plain binary encoding.
    return extract_bits(payload, start_bit + index * bits, bits);
}
/* Scale a texel position inside the block by a per-axis factor derived from
 * the work-group (block) size: (1024 + size / 2 + 0.5) / (size - 1),
 * truncated to an integer. */
ivec2 normalize_coord(ivec2 pixel_coord)
{
    // All of this resolves to compile-time constants.
    const vec2 numerator = vec2(1024 + ivec2(gl_WorkGroupSize.xy >> 1u)) + 0.5;
    const vec2 denominator = vec2(gl_WorkGroupSize.xy - 1u);
    const ivec2 scale = ivec2(numerator / denominator);
    return scale * pixel_coord;
}
/* Decode weight number `weight_index` from the (already bit-reversed)
 * payload and unquantize it through LUTWeightUnquantize. quant.xyz is the
 * bits/trits/quints description, quant.w the base offset into the table. */
int decode_weight(uvec4 payload, int weight_index, ivec4 quant)
{
    int quantized = decode_integer_sequence(payload, 0, weight_index, quant.xyz);
    return int(texelFetch(LUTWeightUnquantize, quantized + quant.w).x);
}
/* Bilinearly interpolate the weight grid at `coord` plus a 4-bit fraction.
 * weight_resolution is the grid width; stride and offset select the plane
 * when two planes are interleaved. Neighbouring weights are only decoded
 * when the corresponding fraction is non-zero. */
int decode_weight_bilinear(uvec4 payload, ivec2 coord, int weight_resolution,
                           int stride, int offset, ivec2 fractional, ivec4 quant)
{
    int base = coord.y * weight_resolution + coord.x;
    bool need_x = fractional.x != 0;
    bool need_y = fractional.y != 0;

    int p00 = decode_weight(payload, stride * base + offset, quant);

    int p10 = p00;
    if (need_x)
        p10 = decode_weight(payload, stride * (base + 1) + offset, quant);

    int p01 = p00;
    int p11 = p10;
    if (need_y)
    {
        p01 = decode_weight(payload, stride * (base + weight_resolution) + offset, quant);
        p11 = p01;
        if (need_x)
            p11 = decode_weight(payload, stride * (base + weight_resolution + 1) + offset, quant);
    }

    // Interpolation factors; the four of them always add up to 16.
    int w11 = (fractional.x * fractional.y + 8) >> 4;
    int w10 = fractional.x - w11;
    int w01 = fractional.y - w11;
    int w00 = 16 - fractional.x - fractional.y + w11;

    return (p00 * w00 + p10 * w10 + p01 * w01 + p11 * w11 + 8) >> 4;
}
/* Decode the interpolation weights for one texel.
 * Returns one weight per colour channel; with dual_plane the channel chosen
 * by the 2-bit component selector gets the second-plane weight.
 * weight_cost_bits receives the number of payload bits used by the weights.
 * On an invalid weight configuration decode_error is set and ivec4(0) is
 * returned. */
ivec4 decode_weights(uvec4 payload, BlockMode mode, ivec2 normalized_pixel, out int weight_cost_bits)
{
    ivec4 quant = ivec4(texelFetch(LUTWeightQuantizer, mode.weight_mode_index));
    int planes = 1 << int(mode.dual_plane);
    int num_weights = (mode.weight_grid_size.x * mode.weight_grid_size.y) << int(mode.dual_plane);

    weight_cost_bits = quant.x * num_weights
                     + idiv5_ceil(num_weights * 8 * quant.y)
                     + idiv3_ceil(num_weights * 7 * quant.z);

    // Decoders must deal with error conditions and return the correct error color.
    bool bad_cost = weight_cost_bits < 24 || weight_cost_bits > 96;
    if (bad_cost || num_weights > 64)
    {
        decode_error = true;
        return ivec4(0);
    }

    // The component selector sits below the weights (and below any extra CEM
    // bits); read it before the payload is reversed.
    int ccs;
    if (mode.dual_plane)
    {
        int extra_cem_bits = 0;
        if ((mode.cem & 3) != 0)
            extra_cem_bits = max(mode.num_partitions * 3 - 4, 0);
        ccs = extract_bits(payload, 126 - weight_cost_bits - extra_cem_bits, 2);
    }

    // Weights are stored from the top of the block downwards: reverse the
    // whole 128-bit payload so they start at bit 0, then keep only them.
    payload = bitfieldReverse(payload).wzyx;
    payload &= build_bitmask(weight_cost_bits);

    // Scale the normalized coordinate to weight grid (4 fractional bits).
    ivec2 grid_pos = (normalized_pixel * (mode.weight_grid_size - 1) + 32) >> 6;
    ivec2 cell = grid_pos >> 4;
    ivec2 frac = grid_pos & 0xf;

    int primary_weight = decode_weight_bilinear(payload, cell, mode.weight_grid_size.x,
                                                planes, 0, frac, quant);
    if (!mode.dual_plane)
        return ivec4(primary_weight);

    int secondary_weight = decode_weight_bilinear(payload, cell, mode.weight_grid_size.x,
                                                  2, 1, frac, quant);
    return mix(ivec4(primary_weight), ivec4(secondary_weight), equal(ivec4(ccs), ivec4(0, 1, 2, 3)));
}
/* LDR luminance, direct: v0 and v1 are replicated to RGB, alpha is 0xff. */
void decode_endpoint_ldr_luma_direct(out ivec4 ep0, out ivec4 ep1,
                                     int v0, int v1)
{
    ep0 = ivec4(v0, v0, v0, 0xff);
    ep1 = ivec4(v1, v1, v1, 0xff);
}
/* HDR luminance, large range: both values are scaled by 16. When v1 < v0
 * the roles are swapped and the results are nudged by +8 / -8. Alpha is
 * 0x780 for both endpoints. */
void decode_endpoint_hdr_luma_direct(out ivec4 ep0, out ivec4 ep1,
                                     int v0, int v1)
{
    bool ordered = v1 >= v0;
    int y0 = ordered ? (v0 << 4) : ((v1 << 4) + 8);
    int y1 = ordered ? (v1 << 4) : ((v0 << 4) - 8);
    ep0 = ivec4(y0, y0, y0, 0x780);
    ep1 = ivec4(y1, y1, y1, 0x780);
}
/* HDR luminance, small range: a base value plus a non-negative delta. Bit 7
 * of v0 selects how the bits of v0/v1 are split between base and delta. The
 * second endpoint is limited to 0xfff; alpha is 0x780. */
void decode_endpoint_hdr_luma_direct_small_range(out ivec4 ep0, out ivec4 ep1,
                                                 int v0, int v1)
{
    bool fine_step = (v0 & 0x80) != 0;
    int low7 = v0 & 0x7f;

    int base;
    int delta;
    if (fine_step)
    {
        base = ((v1 & 0xe0) << 4) | (low7 << 2);
        delta = (v1 & 0x1f) << 2;
    }
    else
    {
        base = ((v1 & 0xf0) << 4) | (low7 << 1);
        delta = (v1 & 0x0f) << 1;
    }

    int top = min(base + delta, 0xfff);
    ep0 = ivec4(base, base, base, 0x780);
    ep1 = ivec4(top, top, top, 0x780);
}
/* LDR luminance, base + offset: the base is built from the top six bits of
 * v0 and the top two bits of v1, the offset is the low six bits of v1. The
 * second endpoint saturates at 0xff; alpha is 0xff. */
void decode_endpoint_ldr_luma_base_offset(out ivec4 ep0, out ivec4 ep1,
                                          int v0, int v1)
{
    int base = (v1 & 0xc0) | (v0 >> 2);
    int top = min(base + (v1 & 0x3f), 0xff);
    ep0 = ivec4(base, base, base, 0xff);
    ep1 = ivec4(top, top, top, 0xff);
}
/* LDR luminance + alpha, direct: v0/v1 are the luminances, v2/v3 the alphas. */
void decode_endpoint_ldr_luma_alpha_direct(out ivec4 ep0, out ivec4 ep1,
                                           int v0, int v1, int v2, int v3)
{
    ep0 = ivec4(v0, v0, v0, v2);
    ep1 = ivec4(v1, v1, v1, v3);
}
/* Blue contraction: red and green are averaged with blue (rounding down);
 * blue and alpha pass through unchanged. */
ivec4 blue_contract(int r, int g, int b, int a)
{
    return ivec4((r + b) >> 1, (g + b) >> 1, b, a);
}
/* Move bit 7 of a into bit 7 of b (after halving b), then turn the
 * remaining bits 1..6 of a into a signed 6-bit value. */
void bit_transfer_signed(inout int a, inout int b)
{
    int transferred = a & 0x80;
    b = (b >> 1) | transferred;
    // The signed bitfieldExtract keeps the low six bits and sign-extends them.
    a = bitfieldExtract(a >> 1, 0, 6);
}
/* LDR luminance + alpha, base + signed offset. v0/v2 are the bases and
 * v1/v3 the offsets (after bit transfer); all results are clamped to
 * 0..0xff. */
void decode_endpoint_ldr_luma_alpha_base_offset(out ivec4 ep0, out ivec4 ep1,
                                                int v0, int v1, int v2, int v3)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);

    int luma0 = clamp(v0, 0, 0xff);
    int luma1 = clamp(v0 + v1, 0, 0xff);
    int alpha0 = clamp(v2, 0, 0xff);
    int alpha1 = clamp(v2 + v3, 0, 0xff);

    ep0 = ivec4(luma0, luma0, luma0, alpha0);
    ep1 = ivec4(luma1, luma1, luma1, alpha1);
}
/* LDR RGB, base + scale: the second endpoint is the base colour, the first
 * is the base multiplied by v3 / 256. Alpha is 0xff. */
void decode_endpoint_ldr_rgb_base_scale(out ivec4 ep0, out ivec4 ep1,
                                        int v0, int v1, int v2, int v3)
{
    ivec3 base = ivec3(v0, v1, v2);
    ep0 = ivec4((base * v3) >> 8, 0xff);
    ep1 = ivec4(base, 0xff);
}
/* LDR RGB base + scale with two explicit alphas: like the base + scale mode,
 * but v4 and v5 supply the alpha of the first and second endpoint. */
void decode_endpoint_ldr_rgb_base_scale_two_a(out ivec4 ep0, out ivec4 ep1,
                                              int v0, int v1, int v2, int v3, int v4, int v5)
{
    ivec3 base = ivec3(v0, v1, v2);
    ep0 = ivec4((base * v3) >> 8, v4);
    ep1 = ivec4(base, v5);
}
/* LDR RGB, direct. Even-numbered values form one colour, odd-numbered ones
 * the other. If the odd colour's component sum is smaller than the even
 * one's, the endpoints are swapped and blue-contracted. Alpha is 0xff. */
void decode_endpoint_ldr_rgb_direct(out ivec4 ep0, out ivec4 ep1,
                                    int v0, int v1, int v2, int v3, int v4, int v5)
{
    int sum_even = v0 + v2 + v4;
    int sum_odd = v1 + v3 + v5;

    if (sum_odd < sum_even)
    {
        ep0 = blue_contract(v1, v3, v5, 0xff);
        ep1 = blue_contract(v0, v2, v4, 0xff);
        return;
    }

    ep0 = ivec4(v0, v2, v4, 0xff);
    ep1 = ivec4(v1, v3, v5, 0xff);
}
/* HDR RGB, base + scale. Unpacks red/green/blue/scale fields whose widths
 * depend on a mode assembled from the top bits of v0..v2, then produces
 * ep1 = colour and ep0 = colour - scale, both clamped to 0..0xfff, with
 * alpha fixed at 0x780. */
void decode_endpoint_hdr_rgb_scale(out ivec4 ep0, out ivec4 ep1,
int v0, int v1, int v2, int v3)
{
// Mind-numbing weird format, just copy from spec ...
// 4-bit mode value: v0 bits 6..7, v1 bit 7, v2 bit 7.
int mode_value = ((v0 & 0xc0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4);
int major_component;
int mode;
if ((mode_value & 0xc) != 0xc)
{
major_component = mode_value >> 2;
mode = mode_value & 3;
}
else if (mode_value != 0xf)
{
major_component = mode_value & 3;
mode = 4;
}
else
{
major_component = 0;
mode = 5;
}
// Fixed low bits of each field.
int red = v0 & 0x3f;
int green = v1 & 0x1f;
int blue = v2 & 0x1f;
int scale = v3 & 0x1f;
// Spare bits whose destination depends on the mode.
int x0 = (v1 >> 6) & 1;
int x1 = (v1 >> 5) & 1;
int x2 = (v2 >> 6) & 1;
int x3 = (v2 >> 5) & 1;
int x4 = (v3 >> 7) & 1;
int x5 = (v3 >> 6) & 1;
int x6 = (v3 >> 5) & 1;
// One-hot mode, so each mask below lists the modes a bit assignment applies to.
int ohm = 1 << mode;
if ((ohm & 0x30) != 0) green |= x0 << 6;
if ((ohm & 0x3a) != 0) green |= x1 << 5;
if ((ohm & 0x30) != 0) blue |= x2 << 6;
if ((ohm & 0x3a) != 0) blue |= x3 << 5;
if ((ohm & 0x3d) != 0) scale |= x6 << 5;
if ((ohm & 0x2d) != 0) scale |= x5 << 6;
if ((ohm & 0x04) != 0) scale |= x4 << 7;
if ((ohm & 0x3b) != 0) red |= x4 << 6;
if ((ohm & 0x04) != 0) red |= x3 << 6;
if ((ohm & 0x10) != 0) red |= x5 << 7;
if ((ohm & 0x0f) != 0) red |= x2 << 7;
if ((ohm & 0x05) != 0) red |= x1 << 8;
if ((ohm & 0x0a) != 0) red |= x0 << 8;
if ((ohm & 0x05) != 0) red |= x0 << 9;
if ((ohm & 0x02) != 0) red |= x6 << 9;
if ((ohm & 0x01) != 0) red |= x3 << 10;
if ((ohm & 0x02) != 0) red |= x5 << 10;
// Shift amount is the mode number, but at least 1.
int shamt = max(mode, 1);
red <<= shamt;
green <<= shamt;
blue <<= shamt;
scale <<= shamt;
// Except in mode 5, green and blue are stored as differences from red.
if (mode != 5)
{
green = red - green;
blue = red - blue;
}
// Move the major component into the channel it belongs to.
if (major_component == 1)
swap(red, green);
else if (major_component == 2)
swap(red, blue);
ep1 = ivec4(clamp(ivec3(red, green, blue), ivec3(0), ivec3(0xfff)), 0x780);
ep0 = ivec4(clamp(ivec3(red, green, blue) - scale, ivec3(0), ivec3(0xfff)), 0x780);
}
/* HDR RGB, direct. Unpacks the a/b0/b1/c/d0/d1 fields, whose widths depend
 * on a 3-bit mode taken from the top bits of v1..v3, and builds both
 * endpoints from them, clamped to 0..0xfff with alpha fixed at 0x780.
 * A major component of 3 selects a simpler direct encoding instead. */
void decode_endpoint_hdr_rgb_direct(out ivec4 ep0, out ivec4 ep1,
int v0, int v1, int v2, int v3, int v4, int v5)
{
// Major component: bit 7 of v4 and bit 7 of v5.
int major_component = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6);
if (major_component == 3)
{
// Direct encoding: red/green use 8 bits, blue uses the low 7 bits of v4/v5.
ep0 = ivec4(v0 << 4, v2 << 4, (v4 & 0x7f) << 5, 0x780);
ep1 = ivec4(v1 << 4, v3 << 4, (v5 & 0x7f) << 5, 0x780);
return;
}
// 3-bit mode: bit 7 of v1, v2 and v3.
int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5);
int va = v0 | ((v1 & 0x40) << 2);
int vb0 = v2 & 0x3f;
int vb1 = v3 & 0x3f;
int vc = v1 & 0x3f;
int vd0 = v4 & 0x7f;
int vd1 = v5 & 0x7f;
// Width of the signed d fields: 7 or 6 bits, two fewer for modes 4 and 6.
int d_bits = 7 - (mode & 1);
if ((mode & 5) == 4)
d_bits -= 2;
// Signed extract: sign-extends d0/d1 from d_bits bits.
vd0 = bitfieldExtract(vd0, 0, d_bits);
vd1 = bitfieldExtract(vd1, 0, d_bits);
// Spare bits whose destination depends on the mode.
int x0 = (v2 >> 6) & 1;
int x1 = (v3 >> 6) & 1;
int x2 = (v4 >> 6) & 1;
int x3 = (v5 >> 6) & 1;
int x4 = (v4 >> 5) & 1;
int x5 = (v5 >> 5) & 1;
// One-hot mode, so each mask below lists the modes a bit assignment applies to.
int ohm = 1 << mode;
if ((ohm & 0xa4) != 0) va |= x0 << 9;
if ((ohm & 0x08) != 0) va |= x2 << 9;
if ((ohm & 0x50) != 0) va |= x4 << 9;
if ((ohm & 0x50) != 0) va |= x5 << 10;
if ((ohm & 0xa0) != 0) va |= x1 << 10;
if ((ohm & 0xc0) != 0) va |= x2 << 11;
if ((ohm & 0x04) != 0) vc |= x1 << 6;
if ((ohm & 0xe8) != 0) vc |= x3 << 6;
if ((ohm & 0x20) != 0) vc |= x2 << 7;
if ((ohm & 0x5b) != 0) vb0 |= x0 << 6;
if ((ohm & 0x5b) != 0) vb1 |= x1 << 6;
if ((ohm & 0x12) != 0) vb0 |= x2 << 7;
if ((ohm & 0x12) != 0) vb1 |= x3 << 7;
// Shift is 3, 2, 1 or 0 for mode pairs 0-1, 2-3, 4-5 and 6-7.
int shamt = (mode >> 1) ^ 3;
va <<= shamt;
vb0 <<= shamt;
vb1 <<= shamt;
vc <<= shamt;
vd0 <<= shamt;
vd1 <<= shamt;
ep1 = ivec4(clamp(ivec3(va, va - vb0, va - vb1), ivec3(0), ivec3(0xfff)), 0x780);
ep0 = ivec4(clamp(ivec3(va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1), ivec3(0), ivec3(0xfff)), 0x780);
// The major component was decoded into the red slot; move it into place.
if (major_component == 1)
{
swap(ep0.r, ep0.g);
swap(ep1.r, ep1.g);
}
else if (major_component == 2)
{
swap(ep0.r, ep0.b);
swap(ep1.r, ep1.b);
}
}
/* LDR RGB, base + signed offset. v0/v2/v4 are the base colour and v1/v3/v5
 * the offsets (after bit transfer). A negative offset sum swaps the
 * endpoints and applies blue contraction. RGB is clamped to 0..0xff and
 * alpha is 0xff. */
void decode_endpoint_ldr_rgb_base_offset(out ivec4 ep0, out ivec4 ep1,
                                         int v0, int v1, int v2, int v3, int v4, int v5)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);
    bit_transfer_signed(v5, v4);

    ivec3 base = ivec3(v0, v2, v4);
    ivec3 shifted = base + ivec3(v1, v3, v5);

    if (v1 + v3 + v5 < 0)
    {
        ep0 = blue_contract(shifted.r, shifted.g, shifted.b, 0xff);
        ep1 = blue_contract(base.r, base.g, base.b, 0xff);
    }
    else
    {
        ep0 = ivec4(base, 0xff);
        ep1 = ivec4(shifted, 0xff);
    }

    ep0.rgb = clamp(ep0.rgb, ivec3(0), ivec3(0xff));
    ep1.rgb = clamp(ep1.rgb, ivec3(0), ivec3(0xff));
}
/* LDR RGBA, direct. Even-numbered values form one colour, odd-numbered ones
 * the other. If the odd colour's RGB sum is smaller than the even one's, the
 * endpoints are swapped and blue-contracted. */
void decode_endpoint_ldr_rgba_direct(out ivec4 ep0, out ivec4 ep1,
                                     int v0, int v1, int v2, int v3,
                                     int v4, int v5, int v6, int v7)
{
    int sum_even = v0 + v2 + v4;
    int sum_odd = v1 + v3 + v5;

    if (sum_odd < sum_even)
    {
        ep0 = blue_contract(v1, v3, v5, v7);
        ep1 = blue_contract(v0, v2, v4, v6);
        return;
    }

    ep0 = ivec4(v0, v2, v4, v6);
    ep1 = ivec4(v1, v3, v5, v7);
}
/* LDR RGBA, base + signed offset. v0/v2/v4/v6 are the base colour and
 * v1/v3/v5/v7 the offsets (after bit transfer). A negative RGB offset sum
 * swaps the endpoints and applies blue contraction. All four channels are
 * clamped to 0..0xff. */
void decode_endpoint_ldr_rgba_base_offset(out ivec4 ep0, out ivec4 ep1,
                                          int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);
    bit_transfer_signed(v5, v4);
    bit_transfer_signed(v7, v6);

    ivec4 base = ivec4(v0, v2, v4, v6);
    ivec4 shifted = base + ivec4(v1, v3, v5, v7);

    if (v1 + v3 + v5 < 0)
    {
        ep0 = blue_contract(shifted.r, shifted.g, shifted.b, shifted.a);
        ep1 = blue_contract(base.r, base.g, base.b, base.a);
    }
    else
    {
        ep0 = base;
        ep1 = shifted;
    }

    ep0 = clamp(ep0, ivec4(0), ivec4(0xff));
    ep1 = clamp(ep1, ivec4(0), ivec4(0xff));
}
/* HDR alpha. The 2-bit mode comes from bit 7 of v6 and bit 7 of v7.
 * Mode 3 stores both alphas directly (7 bits, shifted left by 5). The other
 * modes store a base plus a signed delta whose widths depend on the mode;
 * the second endpoint is clamped to 0..0xfff. */
void decode_endpoint_hdr_alpha(out int ep0, out int ep1, int v6, int v7)
{
    int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
    int base = v6 & 0x7f;
    int delta = v7 & 0x7f;

    if (mode == 3)
    {
        ep0 = base << 5;
        ep1 = delta << 5;
        return;
    }

    int sign_bit = 0x20 >> mode;
    int scale = 4 - mode;

    // The upper bits of v7 extend the base; the rest is the delta.
    base |= (delta << (mode + 1)) & 0x780;
    delta &= 0x3f >> mode;
    // Sign-extend the delta from its top remaining bit.
    delta = (delta ^ sign_bit) - sign_bit;

    base <<= scale;
    delta <<= scale;

    ep0 = base;
    ep1 = clamp(base + delta, 0, 0xfff);
}
/* Decode the pair of colour endpoints for one partition.
 *
 * ep0, ep1            - decoded endpoints.
 * decode_mode         - MODE_LDR, MODE_HDR or MODE_HDR_LDR_ALPHA, depending
 *                       on the endpoint mode.
 * payload             - the 128-bit block.
 * bit_offset          - bit position where the endpoint values start.
 * quant               - xyz: bits/trits/quints description of the endpoint
 *                       values, w: base offset into LUTEndpointUnquantize.
 * ep_mode             - colour endpoint mode, 0..15.
 * base_endpoint_index - index of this partition's first endpoint value.
 * num_endpoint_bits   - number of payload bits occupied by endpoint values.
 *
 * Sets decode_error when DECODE_8BIT is enabled and the mode is not LDR, or
 * when ep_mode is outside 0..15. */
void decode_endpoint(out ivec4 ep0, out ivec4 ep1, out int decode_mode,
                     uvec4 payload, int bit_offset, ivec4 quant, int ep_mode,
                     int base_endpoint_index, int num_endpoint_bits)
{
    // Drop everything above the endpoint data so stray bits read as zero.
    num_endpoint_bits += bit_offset;
    payload &= build_bitmask(num_endpoint_bits);

    // Could of course use an array, but that doesn't lower nicely to indexed registers on all GPUs.
    // Values a mode does not decode stay at zero instead of being undefined.
    int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0, v6 = 0, v7 = 0;

#define DECODE_EP(i) \
    int(texelFetch(LUTEndpointUnquantize, quant.w + decode_integer_sequence(payload, bit_offset, i + base_endpoint_index, quant.xyz)).x)

    // The top two bits of the mode give the number of value pairs minus one.
    int hi_bits = ep_mode >> 2;
    v0 = DECODE_EP(0);
    v1 = DECODE_EP(1);
    if (hi_bits >= 1)
    {
        v2 = DECODE_EP(2);
        v3 = DECODE_EP(3);
    }
    if (hi_bits >= 2)
    {
        v4 = DECODE_EP(4);
        v5 = DECODE_EP(5);
    }
    if (hi_bits >= 3)
    {
        v6 = DECODE_EP(6);
        v7 = DECODE_EP(7);
    }

    switch (ep_mode)
    {
    case 0:
        decode_endpoint_ldr_luma_direct(ep0, ep1,
                                        v0, v1);
        decode_mode = MODE_LDR;
        break;
    case 1:
        decode_endpoint_ldr_luma_base_offset(ep0, ep1,
                                             v0, v1);
        decode_mode = MODE_LDR;
        break;
    case 2:
        decode_endpoint_hdr_luma_direct(ep0, ep1,
                                        v0, v1);
        decode_mode = MODE_HDR;
        break;
    case 3:
        decode_endpoint_hdr_luma_direct_small_range(ep0, ep1,
                                                    v0, v1);
        decode_mode = MODE_HDR;
        break;
    case 4:
        decode_endpoint_ldr_luma_alpha_direct(ep0, ep1,
                                              v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;
    case 5:
        decode_endpoint_ldr_luma_alpha_base_offset(ep0, ep1,
                                                   v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;
    case 6:
        decode_endpoint_ldr_rgb_base_scale(ep0, ep1,
                                           v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;
    case 7:
        decode_endpoint_hdr_rgb_scale(ep0, ep1,
                                      v0, v1, v2, v3);
        decode_mode = MODE_HDR;
        break;
    case 8:
        decode_endpoint_ldr_rgb_direct(ep0, ep1,
                                       v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;
    case 9:
        decode_endpoint_ldr_rgb_base_offset(ep0, ep1,
                                            v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;
    case 10:
        decode_endpoint_ldr_rgb_base_scale_two_a(ep0, ep1,
                                                 v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;
    case 11:
    case 14:
    case 15:
        // All three share the HDR RGB decode; 14 and 15 add an alpha pair.
        decode_endpoint_hdr_rgb_direct(ep0, ep1,
                                       v0, v1, v2, v3, v4, v5);
        if (ep_mode == 14)
        {
            ep0.a = v6;
            ep1.a = v7;
            decode_mode = MODE_HDR_LDR_ALPHA;
        }
        else if (ep_mode == 15)
        {
            decode_endpoint_hdr_alpha(ep0.a, ep1.a, v6, v7);
            decode_mode = MODE_HDR;
        }
        else
            decode_mode = MODE_HDR;
        break;
    case 12:
        decode_endpoint_ldr_rgba_direct(ep0, ep1,
                                        v0, v1, v2, v3, v4, v5, v6, v7);
        decode_mode = MODE_LDR;
        break;
    case 13:
        decode_endpoint_ldr_rgba_base_offset(ep0, ep1,
                                             v0, v1, v2, v3, v4, v5, v6, v7);
        decode_mode = MODE_LDR;
        break;
    default:
        // Not a valid endpoint mode: give the outputs defined values and
        // report the block as undecodable.
        ep0 = ivec4(0);
        ep1 = ivec4(0);
        decode_mode = MODE_LDR;
        decode_error = true;
        break;
    }

    // HDR endpoint modes cannot be represented in the 8-bit output.
    if (DECODE_8BIT && decode_mode != MODE_LDR)
        decode_error = true;
}
/* If decode_error has been set, store the error colour at coord.xy and
 * return from the enclosing function. Expects a variable named `coord` to be
 * in scope at the point of use and must be used inside a void function. */
#define CHECK_DECODE_ERROR() do { \
if (decode_error) \
{ \
emit_decode_error(coord.xy); \
return; \
} \
} while(false)
/* Write the error colour to the output image at `coord`. */
void emit_decode_error(ivec2 coord)
{
    ivec2 texel = coord;
    imageStore(OutputImage, texel, error_color);
}
/* Number of endpoint value pairs stored in the block, summed over all
 * partitions, derived from the raw CEM field. */
int compute_num_endpoint_pairs(int num_partitions, int cem)
{
    // Single partition: cem is the 4-bit endpoint mode, its class is cem >> 2.
    if (num_partitions <= 1)
        return (cem >> 2) + 1;

    int selector = cem & 3;

    // Selector 0: all partitions share the mode stored in cem bits 2..5.
    if (selector == 0)
        return ((cem >> 4) + 1) * num_partitions;

    // Otherwise every partition has `selector` pairs, plus one more for each
    // partition whose class-offset bit (cem bits 2 and up) is set.
    int extra = bitCount(bitfieldExtract(uint(cem), 2, num_partitions));
    return selector * num_partitions + extra;
}
/* Resolve the endpoint mode of one partition and locate its endpoint values.
 * On entry `cem` is the raw CEM field from the block mode; on return, for
 * multi-partition blocks, it is the 4-bit endpoint mode of partition
 * `partition_index`. base_endpoint_index receives the index of the
 * partition's first endpoint value. Single-partition blocks leave `cem`
 * untouched and start at index 0. */
void decode_cem_base_endpoint(uvec4 payload, int weight_cost_bits, inout int cem, out int base_endpoint_index,
                              int num_partitions, int partition_index)
{
    if (num_partitions <= 1)
    {
        base_endpoint_index = 0;
        return;
    }

    int selector = cem & 3;
    int first_pair;

    if (selector == 0)
    {
        // Every partition uses the same mode, stored in bits 2..5.
        cem >>= 2;
        first_pair = ((cem >> 2) + 1) * partition_index;
    }
    else
    {
        // Pairs used by the preceding partitions: `selector` each, plus one
        // for every preceding partition with its class-offset bit set.
        first_pair = 0;
        if (partition_index != 0)
            first_pair = selector * partition_index + bitCount(bitfieldExtract(uint(cem), 2, partition_index));

        int base_class = selector - 1;

        // The CEM bits that did not fit next to the selector are stored just
        // below the weights; append them above the four bits we already have.
        int extra_cem_bits = num_partitions * 3 - 4;
        int extra_bits = extract_bits(payload, 128 - weight_cost_bits - extra_cem_bits, extra_cem_bits);
        int combined = (extra_bits << 4) | (cem >> 2);

        // Layout: one class-offset bit per partition, then two mode bits per partition.
        int class_offset_bit = (combined >> partition_index) & 1;
        int ep_bits = (combined >> (num_partitions + 2 * partition_index)) & 3;
        cem = 4 * (base_class + class_offset_bit) + ep_bits;
    }

    // Two values per pair.
    base_endpoint_index = first_pair * 2;
}
/* Decode a void-extent (constant colour) block.
 *
 * Returns the four 16-bit colour channels stored in the upper 64 bits of the
 * payload and reports through decode_mode whether they are LDR or HDR
 * (bit 9 of the block).
 *
 * Sets decode_error and returns ivec4(0) when the two reserved bits are not
 * both set, when the extent coordinates are inconsistent, or when the block
 * is HDR while DECODE_8BIT is enabled: the stored values are then FP16 bit
 * patterns that cannot be turned into 8-bit output by shifting, and
 * decode_endpoint() rejects non-LDR modes in the same situation. */
ivec4 void_extent_color(uvec4 payload, out int decode_mode)
{
    // Assign the out parameter first so it is defined on every return path.
    decode_mode = (payload.x & (1u << 9)) != 0u ? MODE_HDR : MODE_LDR;

    int reserved = extract_bits(payload, 10, 2);
    if (reserved != 3)
    {
        decode_error = true;
        return ivec4(0);
    }

    // Four 13-bit extent coordinates follow the 12 header bits.
    int min_s = extract_bits(payload, 12, 13);
    int max_s = extract_bits(payload, 12 + 13, 13);
    int min_t = extract_bits(payload, 12 + 2 * 13, 13);
    int max_t = extract_bits(payload, 12 + 3 * 13, 13);

    // All-ones coordinates are accepted as is; otherwise each minimum has to
    // be strictly below its maximum.
    if (!all(equal(ivec4(min_s, max_s, min_t, max_t), ivec4((1 << 13) - 1))))
    {
        if (any(greaterThanEqual(ivec2(min_s, min_t), ivec2(max_s, max_t))))
        {
            decode_error = true;
            return ivec4(0);
        }
    }

    // An HDR constant colour cannot be represented in the 8-bit output.
    if (DECODE_8BIT && decode_mode != MODE_LDR)
    {
        decode_error = true;
        return ivec4(0);
    }

    int r = extract_bits(payload, 64, 16);
    int g = extract_bits(payload, 64 + 16, 16);
    int b = extract_bits(payload, 64 + 32, 16);
    int a = extract_bits(payload, 64 + 48, 16);
    return ivec4(r, g, b, a);
}
/* Entry point: each invocation decodes one texel of one ASTC block and
 * stores it in OutputImage. Blocks that fail to decode are written with the
 * error colour. */
void main()
{
    // xy: output texel, zw: coordinate of the compressed block.
    ivec4 coord = build_coord();
    if (any(greaterThanEqual(coord.xy, imageSize(OutputImage))))
        return;

    ivec2 pixel_coord = ivec2(gl_LocalInvocationID.xy);
    uvec4 payload = texelFetch(PayloadInput, coord.zw, 0);

    BlockMode block_mode = decode_block_mode(payload);
    CHECK_DECODE_ERROR();

    ivec4 final_color;
    int decode_mode;
    if (block_mode.void_extent)
    {
        final_color = void_extent_color(payload, decode_mode);
        CHECK_DECODE_ERROR();
    }
    else
    {
        int weight_cost_bits;
        ivec4 weights = decode_weights(payload, block_mode, normalize_coord(pixel_coord), weight_cost_bits);
        // Stop right away on an invalid weight configuration: the stages
        // below compute payload bit offsets from weight_cost_bits, and an
        // out-of-range value would make them index outside the payload.
        CHECK_DECODE_ERROR();

        int partition_index = 0;
        if (block_mode.num_partitions > 1)
        {
            // The table is tiled by seed: 32 tiles per row, one tile per block footprint.
            int lut_x = pixel_coord.x + int(gl_WorkGroupSize.x) * (block_mode.seed & 31);
            int lut_y = pixel_coord.y + int(gl_WorkGroupSize.y) * (block_mode.seed >> 5);
            partition_index = int(texelFetch(LUTPartitionTable, ivec2(lut_x, lut_y), 0).x);
            // Each texel packs the 2-, 3- and 4-partition assignments two bits apart.
            partition_index = (partition_index >> (2 * block_mode.num_partitions - 4)) & 3;
        }

        int available_endpoint_bits = max(128 - block_mode.config_bits - weight_cost_bits, 0);

        // In multi-partition mode, the 6-bit CEM field is encoded as
        // First two bits tell if all CEM field are the same, if not we specify a class offset, and N bits
        // after that will offset the class by 1.
        int num_endpoint_pairs = compute_num_endpoint_pairs(block_mode.num_partitions, block_mode.cem);

        // Error color must be emitted if we need more than 18 integer sequence encoded values of color.
        if (num_endpoint_pairs > 9)
        {
            decode_error = true;
            emit_decode_error(coord.xy);
            return;
        }

        ivec4 endpoint_quant = ivec4(texelFetch(LUTRemainingBitsToEndpointQuantizer,
                                                128 * (num_endpoint_pairs - 1) + available_endpoint_bits));

        // Only read the bits we need for endpoints.
        int num_endpoint_values = num_endpoint_pairs * 2;
        available_endpoint_bits =
            endpoint_quant.x * num_endpoint_values +
            idiv5_ceil(endpoint_quant.y * 8 * num_endpoint_values) +
            idiv3_ceil(endpoint_quant.z * 7 * num_endpoint_values);

        // No space left for color endpoints.
        if (all(equal(endpoint_quant.xyz, ivec3(0))))
        {
            decode_error = true;
            emit_decode_error(coord.xy);
            return;
        }

        int endpoint_bit_offset = block_mode.primary_config_bits;
        ivec4 ep0, ep1;

        // Decode CEM for multi-partition schemes.
        int cem = block_mode.cem;
        int base_endpoint_index;
        decode_cem_base_endpoint(payload, weight_cost_bits, cem, base_endpoint_index,
                                 block_mode.num_partitions, partition_index);

        decode_endpoint(ep0, ep1, decode_mode, payload, endpoint_bit_offset, endpoint_quant,
                        cem, base_endpoint_index, available_endpoint_bits);
        CHECK_DECODE_ERROR();

        final_color = interpolate_endpoint(ep0, ep1, weights, decode_mode);
    }

    if (DECODE_8BIT)
    {
        // Keep the top 8 bits of each 16-bit channel.
        imageStore(OutputImage, coord.xy, uvec4(final_color >> 8));
    }
    else
    {
        uvec4 encoded;
        // HDR void-extent colours are already stored as FP16 bit patterns.
        if (block_mode.void_extent && decode_mode == MODE_HDR)
            encoded = uvec4(final_color);
        else
            encoded = decode_fp16(final_color, decode_mode);
        imageStore(OutputImage, coord.xy, encoded);
    }
}