/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2020 Collabora Ltd.
 * Copyright 2016 Broadcom
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_builtin_builder.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_internal_formats.h"
#include "agx_nir.h"
#include "libagx_shaders.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"

static nir_def *
texture_descriptor_ptr_for_handle(nir_builder *b, nir_def *handle)
{
   /* Bindless handles are a vec2, where the first source is the (constant)
    * uniform register number and the second source is the byte offset.
    */
   nir_scalar uniform = nir_scalar_resolved(handle, 0);
   unsigned uniform_idx = nir_scalar_as_uint(uniform);

   nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
   nir_def *offset = nir_u2u64(b, nir_channel(b, handle, 1));

   return nir_iadd(b, base, offset);
}

static nir_def *
texture_descriptor_ptr(nir_builder *b, nir_tex_instr *tex)
{
   int handle_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
   assert(handle_idx >= 0 && "must be bindless");

   return texture_descriptor_ptr_for_handle(b, tex->src[handle_idx].src.ssa);
}

static bool
lower_txs(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   b->cursor = nir_before_instr(instr);

   if (tex->op != nir_texop_txs)
      return false;

   nir_def *ptr = texture_descriptor_ptr(b, tex);
   unsigned nr_comps = tex->def.num_components;
   assert(nr_comps <= 3);

   int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
   nir_def *lod = lod_idx >= 0 ? nir_u2u16(b, tex->src[lod_idx].src.ssa)
                               : nir_imm_intN_t(b, 0, 16);

   nir_def *res = libagx_txs(
      b, ptr, lod, nir_imm_int(b, nr_comps),
      nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_BUF),
      nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_1D),
      nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_2D),
      nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE),
      nir_imm_bool(b, tex->is_array));

   nir_def_rewrite_uses(&tex->def, nir_trim_vector(b, res, nr_comps));
   nir_instr_remove(instr);
   return true;
}

/*
 * Given a 1D buffer texture coordinate, calculate the 2D coordinate vector
 * that will be used to access the linear 2D texture bound to the buffer.
 */
static nir_def *
coords_for_buffer_texture(nir_builder *b, nir_def *coord)
{
   return nir_vec2(b, nir_iand_imm(b, coord, BITFIELD_MASK(10)),
                   nir_ushr_imm(b, coord, 10));
}

/*
 * Buffer textures are lowered to 2D (1024xN) textures in the driver to access
 * more storage. When lowering, we need to fix up the coordinate accordingly.
 *
 * Furthermore, RGB32 formats are emulated by lowering to global memory access,
 * so to read a buffer texture we generate code that looks like:
 *
 *    if (descriptor->format == RGB32)
 *       return ((uint32_t *) descriptor->address)[x];
 *    else
 *       return txf(texture_as_2d, vec2(x % 1024, x / 1024));
 */
static bool
lower_buffer_texture(nir_builder *b, nir_tex_instr *tex)
{
   nir_def *coord = nir_steal_tex_src(tex, nir_tex_src_coord);

   /* The OpenGL ES 3.2 specification says on page 187:
    *
    *    When a buffer texture is accessed in a shader, the results of a texel
    *    fetch are undefined if the specified texel coordinate is negative, or
    *    greater than or equal to the clamped number of texels in the texture
    *    image.
    *
    * However, faulting would be undesirable for robustness, so clamp.
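    *
    * Clamping requires the texture size, so nir_get_texture_size generates a
    * txs here; that txs is itself lowered later in this pass by lower_txs.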
    */
   nir_def *size = nir_get_texture_size(b, tex);
   coord = nir_umin(b, coord, nir_iadd_imm(b, size, -1));

   nir_def *desc = texture_descriptor_ptr(b, tex);
   bool is_float = nir_alu_type_get_base_type(tex->dest_type) == nir_type_float;

   /* Lower RGB32 reads if the format requires */
   nir_if *nif = nir_push_if(b, libagx_texture_is_rgb32(b, desc));

   nir_def *rgb32 = nir_trim_vector(
      b, libagx_texture_load_rgb32(b, desc, coord, nir_imm_bool(b, is_float)),
      nir_tex_instr_dest_size(tex));

   nir_push_else(b, nif);

   /* Otherwise, lower the texture instruction to read from 2D */
   assert(coord->num_components == 1 && "buffer textures are 1D");
   tex->sampler_dim = GLSL_SAMPLER_DIM_2D;

   nir_def *coord2d = coords_for_buffer_texture(b, coord);

   nir_instr_remove(&tex->instr);
   nir_builder_instr_insert(b, &tex->instr);
   nir_tex_instr_add_src(tex, nir_tex_src_backend1, coord2d);
   nir_block *else_block = nir_cursor_current_block(b->cursor);
   nir_pop_if(b, nif);

   /* Put it together with a phi */
   nir_def *phi = nir_if_phi(b, rgb32, &tex->def);

   nir_def_rewrite_uses(&tex->def, phi);

   nir_phi_instr *phi_instr = nir_instr_as_phi(phi->parent_instr);
   nir_phi_src *else_src = nir_phi_get_src_from_block(phi_instr, else_block);
   nir_src_rewrite(&else_src->src, &tex->def);

   return true;
}

/*
 * Given a 1D texture coordinate, calculate the 2D coordinate vector that
 * will be used to access the linear 2D texture bound to the 1D texture.
 */
static nir_def *
coords_for_1d_texture(nir_builder *b, nir_def *coord, bool is_array)
{
   /* Add a zero Y component to the coordinate */
   if (is_array) {
      assert(coord->num_components >= 2);
      return nir_vec3(b, nir_channel(b, coord, 0),
                      nir_imm_intN_t(b, 0, coord->bit_size),
                      nir_channel(b, coord, 1));
   } else {
      assert(coord->num_components >= 1);
      return nir_vec2(b, coord, nir_imm_intN_t(b, 0, coord->bit_size));
   }
}

/*
 * NIR indexes into array textures with unclamped floats (integer for txf). AGX
 * requires the index to be a clamped integer. Lower tex_src_coord into
 * tex_src_backend1 for array textures by type-converting and clamping.
 */
static bool
lower_regular_texture(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   b->cursor = nir_before_instr(instr);

   if (nir_tex_instr_is_query(tex))
      return false;
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      return lower_buffer_texture(b, tex);

   /* Don't lower twice */
   if (nir_tex_instr_src_index(tex, nir_tex_src_backend1) >= 0)
      return false;

   /* Get the coordinates */
   nir_def *coord = nir_steal_tex_src(tex, nir_tex_src_coord);
   nir_def *ms_idx = nir_steal_tex_src(tex, nir_tex_src_ms_index);

   /* It's unclear if mipmapped 1D textures work in the hardware. For now, we
    * always lower to 2D.
    */
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_1D) {
      coord = coords_for_1d_texture(b, coord, tex->is_array);

      /* Add a zero Y component to other sources */
      nir_tex_src_type other_srcs[] = {
         nir_tex_src_ddx,
         nir_tex_src_ddy,
         nir_tex_src_offset,
      };

      for (unsigned i = 0; i < ARRAY_SIZE(other_srcs); ++i) {
         nir_def *src = nir_steal_tex_src(tex, other_srcs[i]);
         if (!src)
            continue;

         assert(src->num_components == 1);
         src = nir_vec2(b, src, nir_imm_intN_t(b, 0, src->bit_size));
         nir_tex_instr_add_src(tex, other_srcs[i], src);
      }

      tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
      tex->coord_components++;
   }

   /* The layer is always the last component of the NIR coordinate, split it
    * off because we'll need to swizzle.
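    *
    * For example, a 2D array coordinate (u, v, layer) becomes a vec2 (u, v)
    * here, with the layer clamped, converted to 16-bit, and re-packed below.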
    */
   nir_def *layer = NULL;

   if (tex->is_array) {
      unsigned lidx = coord->num_components - 1;
      nir_def *unclamped_layer = nir_channel(b, coord, lidx);
      coord = nir_trim_vector(b, coord, lidx);

      /* Round layer to nearest even */
      if (tex->op != nir_texop_txf && tex->op != nir_texop_txf_ms)
         unclamped_layer = nir_f2u32(b, nir_fround_even(b, unclamped_layer));

      /* For a cube array, the layer is zero-indexed component 3 of the
       * coordinate but the number of layers is component 2 of the txs result.
       */
      if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
         assert(lidx == 3 && "4 components");
         lidx = 2;
      }

      /* Clamp to max layer = (# of layers - 1) for out-of-bounds handling.
       * Layer must be 16-bits for the hardware, drop top bits after clamping.
       */
      if (!(tex->backend_flags & AGX_TEXTURE_FLAG_NO_CLAMP)) {
         nir_def *txs = nir_get_texture_size(b, tex);
         nir_def *nr_layers = nir_channel(b, txs, lidx);
         nir_def *max_layer = nir_iadd_imm(b, nr_layers, -1);
         layer = nir_umin(b, unclamped_layer, max_layer);
      } else {
         layer = unclamped_layer;
      }

      layer = nir_u2u16(b, layer);
   }

   /* Combine layer and multisample index into 32-bit so we don't need a vec5
    * or vec6 16-bit coordinate tuple, which would be inconvenient in NIR for
    * little benefit (a minor optimization, I guess).
    */
   nir_def *sample_array =
      (ms_idx && layer) ? nir_pack_32_2x16_split(b, ms_idx, layer)
      : ms_idx          ? nir_u2u32(b, ms_idx)
      : layer           ? nir_u2u32(b, layer)
                        : NULL;

   /* Combine into the final 32-bit tuple */
   if (sample_array != NULL) {
      unsigned end = coord->num_components;
      coord = nir_pad_vector(b, coord, end + 1);
      coord = nir_vector_insert_imm(b, coord, sample_array, end);
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend1, coord);

   /* Furthermore, if there is an offset vector, it must be packed */
   nir_def *offset = nir_steal_tex_src(tex, nir_tex_src_offset);

   if (offset != NULL) {
      nir_def *packed = NULL;

      for (unsigned c = 0; c < offset->num_components; ++c) {
         nir_def *nibble = nir_iand_imm(b, nir_channel(b, offset, c), 0xF);
         nir_def *shifted = nir_ishl_imm(b, nibble, 4 * c);

         if (packed != NULL)
            packed = nir_ior(b, packed, shifted);
         else
            packed = shifted;
      }

      nir_tex_instr_add_src(tex, nir_tex_src_backend2, packed);
   }

   return true;
}

static nir_def *
bias_for_tex(nir_builder *b, nir_tex_instr *tex)
{
   nir_instr *instr = nir_get_texture_size(b, tex)->parent_instr;
   nir_tex_instr *query = nir_instr_as_tex(instr);

   query->op = nir_texop_lod_bias_agx;
   query->dest_type = nir_type_float16;

   nir_def_init(instr, &query->def, 1, 16);
   return &query->def;
}

static bool
lower_sampler_bias(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   b->cursor = nir_before_instr(instr);

   switch (tex->op) {
   case nir_texop_tex: {
      tex->op = nir_texop_txb;
      nir_tex_instr_add_src(tex, nir_tex_src_bias, bias_for_tex(b, tex));
      return true;
   }

   case nir_texop_txb:
   case nir_texop_txl: {
      nir_tex_src_type src =
         tex->op == nir_texop_txl ? nir_tex_src_lod : nir_tex_src_bias;

      nir_def *orig = nir_steal_tex_src(tex, src);
      assert(orig != NULL && "invalid NIR");

      if (orig->bit_size != 16)
         orig = nir_f2f16(b, orig);

      nir_tex_instr_add_src(tex, src, nir_fadd(b, orig, bias_for_tex(b, tex)));
      return true;
   }

   case nir_texop_txd: {
      /* For txd, the computed level-of-detail is log2(rho) where rho should
       * scale proportionally to all derivatives. So scale derivatives by
       * exp2(bias) to get level-of-detail
       * log2(exp2(bias) * rho) = bias + log2(rho).
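       *
       * The bias query returns a 16-bit float, so widen it to 32-bit before
       * taking the exponential and scaling the derivatives.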
       */
      nir_def *scale = nir_fexp2(b, nir_f2f32(b, bias_for_tex(b, tex)));
      nir_tex_src_type src[] = {nir_tex_src_ddx, nir_tex_src_ddy};

      for (unsigned s = 0; s < ARRAY_SIZE(src); ++s) {
         nir_def *orig = nir_steal_tex_src(tex, src[s]);
         assert(orig != NULL && "invalid");

         nir_def *scaled = nir_fmul(b, nir_f2f32(b, orig), scale);
         nir_tex_instr_add_src(tex, src[s], scaled);
      }

      return true;
   }

   case nir_texop_txf:
   case nir_texop_txf_ms:
   case nir_texop_txs:
   case nir_texop_tg4:
   case nir_texop_texture_samples:
   case nir_texop_samples_identical:
      /* These operations do not use a sampler */
      return false;

   default:
      unreachable("Unhandled texture operation");
   }
}

static bool
legalize_image_lod(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
   nir_src *src;

#define CASE(op, idx)                                                          \
   case nir_intrinsic_##op:                                                    \
   case nir_intrinsic_bindless_##op:                                           \
      src = &intr->src[idx];                                                   \
      break;

   switch (intr->intrinsic) {
      CASE(image_load, 3)
      CASE(image_store, 4)
      CASE(image_size, 1)
   default:
      return false;
   }

#undef CASE

   if (src->ssa->bit_size == 16)
      return false;

   b->cursor = nir_before_instr(&intr->instr);
   nir_src_rewrite(src, nir_i2i16(b, src->ssa));
   return true;
}

static nir_def *
txs_for_image(nir_builder *b, nir_intrinsic_instr *intr,
              unsigned num_components, unsigned bit_size)
{
   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
   tex->op = nir_texop_txs;
   tex->is_array = nir_intrinsic_image_array(intr);
   tex->dest_type = nir_type_uint32;
   tex->sampler_dim = nir_intrinsic_image_dim(intr);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_lod, intr->src[1].ssa);
   tex->src[1] =
      nir_tex_src_for_ssa(nir_tex_src_texture_handle, intr->src[0].ssa);

   nir_def_init(&tex->instr, &tex->def, num_components, bit_size);
   nir_builder_instr_insert(b, &tex->instr);
   return &tex->def;
}

static nir_def *
image_texel_address(nir_builder *b, nir_intrinsic_instr *intr,
                    bool return_index)
{
   /* First, calculate the address of the PBE descriptor */
   nir_def *desc_address =
      texture_descriptor_ptr_for_handle(b, intr->src[0].ssa);

   nir_def *coord = intr->src[1].ssa;

   enum pipe_format format = nir_intrinsic_format(intr);
   nir_def *blocksize_B = nir_imm_int(b, util_format_get_blocksize(format));

   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
   bool layered = nir_intrinsic_image_array(intr) ||
                  (dim == GLSL_SAMPLER_DIM_CUBE) ||
                  (dim == GLSL_SAMPLER_DIM_3D);

   /* The last 8 bytes of the 24-byte PBE descriptor contain either the
    * software-defined atomic descriptor, or (if array image) a pointer to the
    * descriptor. Grab the address.
    */
   nir_def *meta_ptr = nir_iadd_imm(b, desc_address, 16);

   if (layered)
      meta_ptr = nir_load_global_constant(b, meta_ptr, 8, 1, 64);

   if (dim == GLSL_SAMPLER_DIM_BUF && return_index) {
      return nir_channel(b, coord, 0);
   } else if (dim == GLSL_SAMPLER_DIM_BUF) {
      return libagx_buffer_texel_address(b, meta_ptr, coord, blocksize_B);
   } else {
      return libagx_image_texel_address(
         b, meta_ptr, coord, nir_u2u32(b, intr->src[2].ssa), blocksize_B,
         nir_imm_bool(b, dim == GLSL_SAMPLER_DIM_MS), nir_imm_bool(b, layered),
         nir_imm_bool(b, return_index));
   }
}

static void
lower_buffer_image(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_def *coord_vector = intr->src[1].ssa;
   nir_def *coord = nir_channel(b, coord_vector, 0);

   /* Lower the buffer load/store to a 2D image load/store, matching the 2D
    * texture/PBE descriptor the driver supplies for buffer images.
    */
   nir_def *coord2d = coords_for_buffer_texture(b, coord);
   nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord2d, 4));
   nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
}

static void
lower_1d_image(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_def *coord = intr->src[1].ssa;
   bool is_array = nir_intrinsic_image_array(intr);
   nir_def *coord2d = coords_for_1d_texture(b, coord, is_array);

   nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord2d, 4));
   nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
}

/*
 * AGX needs the face and the layer specified separately. This matches how NIR
 * texture instructions work, but not how NIR image intrinsics work. Here we
 * lower by dividing the combined layer-face into separate components which the
 * compiler can consume.
 */
static void
lower_cube_array_image(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_def *x = nir_channel(b, intr->src[1].ssa, 0);
   nir_def *y = nir_channel(b, intr->src[1].ssa, 1);
   nir_def *z = nir_channel(b, intr->src[1].ssa, 2);

   nir_def *face = nir_umod_imm(b, z, 6);
   nir_def *layer = nir_udiv_imm(b, z, 6);

   nir_src_rewrite(&intr->src[1], nir_vec4(b, x, y, face, layer));
}

static bool
lower_images(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
   b->cursor = nir_before_instr(&intr->instr);

   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_store: {
      /* Legalize MSAA index */
      nir_src_rewrite(&intr->src[2], nir_u2u16(b, intr->src[2].ssa));

      switch (nir_intrinsic_image_dim(intr)) {
      case GLSL_SAMPLER_DIM_1D:
         lower_1d_image(b, intr);
         return true;

      case GLSL_SAMPLER_DIM_BUF:
         lower_buffer_image(b, intr);
         return true;

      case GLSL_SAMPLER_DIM_CUBE:
         if (nir_intrinsic_image_array(intr))
            lower_cube_array_image(b, intr);
         return true;

      default:
         return true;
      }
   }

   case nir_intrinsic_bindless_image_size:
      nir_def_rewrite_uses(
         &intr->def,
         txs_for_image(b, intr, intr->def.num_components, intr->def.bit_size));
      return true;

   case nir_intrinsic_bindless_image_texel_address:
      nir_def_rewrite_uses(&intr->def, image_texel_address(b, intr, false));
      return true;

   case nir_intrinsic_image_size:
   case nir_intrinsic_image_texel_address:
      unreachable("should've been lowered");

   default:
      return false;
   }
}

/*
 * Early texture lowering passes, called by the driver before lowering
 * descriptor bindings. That means these passes operate on texture derefs. The
 * purpose is to make descriptor crawls explicit in the NIR, so that the driver
 * can accurately lower descriptors after this pass but before calling
 * agx_preprocess_nir (and hence the full agx_nir_lower_texture).
 */
bool
agx_nir_lower_texture_early(nir_shader *s)
{
   bool progress = false;

   nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_index_to_offset = true,

      /* XXX: Metal seems to handle just like 3D txd, so why doesn't it work?
       * TODO: Stop using this lowering
       */
      .lower_txd_cube_map = true,
   };

   NIR_PASS(progress, s, nir_lower_tex, &lower_tex_options);

   return progress;
}

bool
agx_nir_lower_texture(nir_shader *s, bool support_lod_bias)
{
   bool progress = false;

   nir_tex_src_type_constraints tex_constraints = {
      [nir_tex_src_lod] = {true, 16},
      [nir_tex_src_bias] = {true, 16},
      [nir_tex_src_ms_index] = {true, 16},
      [nir_tex_src_texture_offset] = {true, 16},
      [nir_tex_src_sampler_offset] = {true, 16},
   };

   /* Insert fences before lowering image atomics, since image atomics need
    * different fencing than other image operations.
    */
   NIR_PASS(progress, s, agx_nir_fence_images);

   NIR_PASS(progress, s, nir_lower_image_atomics_to_global);

   /* Lower bias after nir_lower_tex (to get rid of txd) but before
    * lower_regular_texture (which will shuffle around the sources)
    */
   if (support_lod_bias) {
      NIR_PASS(progress, s, nir_shader_instructions_pass, lower_sampler_bias,
               nir_metadata_block_index | nir_metadata_dominance, NULL);
   }

   NIR_PASS(progress, s, nir_shader_intrinsics_pass, legalize_image_lod,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   NIR_PASS(progress, s, nir_shader_intrinsics_pass, lower_images,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   NIR_PASS(progress, s, nir_legalize_16bit_sampler_srcs, tex_constraints);

   /* Lower texture sources after legalizing types (as the lowering depends on
    * 16-bit multisample indices) but before lowering queries (as the lowering
    * generates txs for array textures).
    */
   NIR_PASS(progress, s, nir_shader_instructions_pass, lower_regular_texture,
            nir_metadata_none, NULL);

   NIR_PASS(progress, s, nir_shader_instructions_pass, lower_txs,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   return progress;
}

static bool
lower_multisampled_store(nir_builder *b, nir_intrinsic_instr *intr,
                         UNUSED void *data)
{
   b->cursor = nir_before_instr(&intr->instr);

   if (intr->intrinsic != nir_intrinsic_bindless_image_store)
      return false;

   if (nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_MS)
      return false;

   nir_def *index_px = nir_u2u32(b, image_texel_address(b, intr, true));
   nir_def *coord2d = coords_for_buffer_texture(b, index_px);

   nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord2d, 4));
   nir_src_rewrite(&intr->src[2], nir_imm_int(b, 0));
   nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
   nir_intrinsic_set_image_array(intr, false);
   return true;
}

bool
agx_nir_lower_multisampled_image_store(nir_shader *s)
{
   return nir_shader_intrinsics_pass(
      s, lower_multisampled_store,
      nir_metadata_block_index | nir_metadata_dominance, NULL);
}

/*
 * Given a non-bindless instruction, return whether agx_nir_lower_texture will
 * lower it to something involving a descriptor crawl. This requires the driver
 * to lower the instruction to bindless before calling agx_nir_lower_texture.
 * The implementation just enumerates the cases handled in this file.
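 *
 * For example, image size queries, image atomics, and multisampled image
 * stores crawl the descriptor, while an ordinary 2D texel load does not.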
 */
bool
agx_nir_needs_texture_crawl(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      /* Queries, atomics always become a crawl */
      case nir_intrinsic_image_size:
      case nir_intrinsic_image_deref_size:
      case nir_intrinsic_image_atomic:
      case nir_intrinsic_image_deref_atomic:
      case nir_intrinsic_image_atomic_swap:
      case nir_intrinsic_image_deref_atomic_swap:
         return true;

      /* Multisampled stores need a crawl, others do not */
      case nir_intrinsic_image_store:
      case nir_intrinsic_image_deref_store:
         return nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_MS;

      /* Loads do not need a crawl, even from buffers */
      default:
         return false;
      }
   } else if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      /* Array textures get clamped to their size via txs */
      if (tex->is_array)
         return true;

      switch (tex->op) {
      /* Queries always become a crawl */
      case nir_texop_txs:
         return true;

      /* Buffer textures need their format read */
      default:
         return tex->sampler_dim == GLSL_SAMPLER_DIM_BUF;
      }
   }

   return false;
}