pan/bi: Use write masks on Valhall texture instrs

I noticed a sequence like the following in a scheduled SuperTuxKart shader: TEX_SINGLE.slot0 @r0:r1, .. LD_VAR.wait0 @r2, ... FMA r1, ... Why do we stall waiting for the TEX_SINGLE instruction when it's not actually read? Because its upper channels are *never* read, leading to a write-after-write dependency when the register allocator puts some unrelated ALU destination in there. By appropriately masking the texture instruction's write, that false dependency disappears, avoiding the stall. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20426>
2022-11-30 21:29:43 -05:00
parent 7d9c771b9b
commit bd83e5ddaf
1 changed files with 32 additions and 6 deletions
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -3780,8 +3780,12 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
        image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
        image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));

-        unsigned mask = BI_WRITE_MASK_RGBA;
-        unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
+
+        /* Only write the components that we actually read */
+        unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+        unsigned comps_per_reg = nir_dest_bit_size(instr->dest) == 16 ? 2 : 1;
+        unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
+
        enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
        enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
        bi_index dest = bi_temp(b->shader);
@@ -3810,10 +3814,32 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
                unreachable("Unhandled Valhall texture op");
        }

-        bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
-        bi_emit_split_i32(b, w, dest, res_size);
-        bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
-                        DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
+        /* The hardware will write only what we read, and it will into
+         * contiguous registers without gaps (different from Bifrost). NIR
+         * expects the gaps, so fill in the holes (they'll be copypropped and
+         * DCE'd away later).
+         */
+        bi_index unpacked[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
+
+        bi_emit_cached_split_i32(b, dest, res_size);
+
+        /* Index into the packed component array */
+        unsigned j = 0;
+        unsigned comps[4] = { 0 };
+        unsigned nr_components = nir_dest_num_components(instr->dest);
+
+        for (unsigned i = 0; i < nr_components; ++i) {
+                if (mask & BITFIELD_BIT(i)) {
+                        unpacked[i] = dest;
+                        comps[i] = j++;
+                } else {
+                        unpacked[i] = bi_zero();
+                }
+        }
+
+        bi_make_vec_to(b, bi_dest_index(&instr->dest), unpacked,
+                        comps, nir_dest_num_components(instr->dest),
+                        nir_dest_bit_size(instr->dest));
 }

 /* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube