diff --git a/src/panfrost/bifrost/bi_printer.c.py b/src/panfrost/bifrost/bi_printer.c.py index 6cddcff3011..60ff1116b8a 100644 --- a/src/panfrost/bifrost/bi_printer.c.py +++ b/src/panfrost/bifrost/bi_printer.c.py @@ -77,7 +77,7 @@ static void bi_print_index(FILE *fp, bi_index index) { if (index.discard) - fputs("`", fp); + fputs("^", fp); if (bi_is_null(index)) fprintf(fp, "_"); diff --git a/src/panfrost/bifrost/valhall/asm.py b/src/panfrost/bifrost/valhall/asm.py index 6b7b5ceb1a9..47b345c7f16 100644 --- a/src/panfrost/bifrost/valhall/asm.py +++ b/src/panfrost/bifrost/valhall/asm.py @@ -91,7 +91,7 @@ def parse_int(s, minimum, maximum): return number def encode_source(op, fau): - if op[0] == '`': + if op[0] == '^': die_if(op[1] != 'r', f"Expected register after discard {op}") return parse_int(op[2:], 0, 63) | 0x40 elif op[0] == 'r': diff --git a/src/panfrost/bifrost/valhall/disasm.py b/src/panfrost/bifrost/valhall/disasm.py index 30c98b4a94d..7fb271596fd 100644 --- a/src/panfrost/bifrost/valhall/disasm.py +++ b/src/panfrost/bifrost/valhall/disasm.py @@ -67,7 +67,7 @@ va_print_src(FILE *fp, uint8_t src, unsigned fau_page) fprintf(fp, "u%u", value | (fau_page << 6)); } else { bool discard = (type & 1); - fprintf(fp, "%sr%u", discard ? "`" : "", value); + fprintf(fp, "%sr%u", discard ? "^" : "", value); } } diff --git a/src/panfrost/bifrost/valhall/test/assembler-cases.txt b/src/panfrost/bifrost/valhall/test/assembler-cases.txt index 03400b3cde0..44d21a1225d 100644 --- a/src/panfrost/bifrost/valhall/test/assembler-cases.txt +++ b/src/panfrost/bifrost/valhall/test/assembler-cases.txt @@ -9,7 +9,7 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0 01 02 00 00 10 c0 a4 00 FADD.f32 r0, r1, r2.neg 01 02 00 00 30 c0 a4 00 FADD.f32 r0, r1, r2.neg.abs 01 02 00 00 32 c0 a4 00 FADD.f32.clamp_m1_1 r0, r1, r2.neg.abs -41 03 00 00 00 c0 1f 50 BRANCHZ.reconverge `r1, offset:3 +41 03 00 00 00 c0 1f 50 BRANCHZ.reconverge ^r1, offset:3 01 d0 00 00 00 c0 a4 00 FADD.f32 r0, r1, 0x3F800000 01 d0 00 00 10 c0 a4 00 FADD.f32 r0, r1, 0x3F800000.neg 01 c0 00 00 00 c0 a4 00 FADD.f32 r0, r1, 0x0 @@ -26,37 +26,37 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0 01 c9 00 18 00 c0 a0 00 IADD.u32 r0, r1, 0x7060504.b2 01 02 00 08 20 c0 a1 00 IADD.v2u16 r0, r1, r2 82 3c 27 20 00 c0 a3 01 SHADDX.u64 r0, u2, r60.w0, shift:0x2 -40 00 00 18 82 80 60 08 LOAD.i32.unsigned.slot0.wait0 @r0, `r0, offset:0 -80 7c 47 20 00 c0 a3 01 SHADDX.u64 r0, u0, `r60.w0, shift:0x4 -40 00 00 38 08 44 61 78 STORE.i128.slot0.end @r4:r5:r6:r7, `r0, offset:0 +40 00 00 18 82 80 60 08 LOAD.i32.unsigned.slot0.wait0 @r0, ^r0, offset:0 +80 7c 47 20 00 c0 a3 01 SHADDX.u64 r0, u0, ^r60.w0, shift:0x4 +40 00 00 38 08 44 61 78 STORE.i128.slot0.end @r4:r5:r6:r7, ^r0, offset:0 00 00 00 00 00 c0 00 78 NOP.end -40 c4 c0 9c 01 c1 f0 00 ICMP.u32.gt.m1 r1, `r0, 0x1000000.b3, 0x0 -42 00 00 18 02 40 61 50 STORE.i32.slot0.reconverge @r0, `r2, offset:0 +40 c4 c0 9c 01 c1 f0 00 ICMP.u32.gt.m1 r1, ^r0, 0x1000000.b3, 0x0 +42 00 00 18 02 40 61 50 STORE.i32.slot0.reconverge @r0, ^r2, offset:0 00 c9 8f 12 30 c0 a0 00 CLPER.i32.f1 r0, r0, 0x7060504.b0 00 00 00 30 00 c7 90 00 S8_TO_S32 r7, r0.b3 00 00 00 20 00 c6 90 00 S8_TO_S32 r6, r0.b2 00 00 00 00 00 c4 90 00 S8_TO_S32 r4, r0.b0 -40 00 00 10 00 c5 90 00 S8_TO_S32 r5, `r0.b1 +40 00 00 10 00 c5 90 00 S8_TO_S32 r5, ^r0.b1 00 00 01 30 00 c7 90 00 S8_TO_F32 r7, r0.b3 00 00 0f a0 00 40 90 00 FROUND.v2f16.rtn r0.h0, r0 01 00 0f 90 00 40 90 00 FROUND.v2f16.rtn r0.h0, r1.h10 01 00 0f a0 00 41 90 00 FROUND.v2f16.rtn r1.h0, r1 00 00 0f 90 00 40 90 00 FROUND.v2f16.rtn r0.h0, r0.h10 00 00 0b 00 00 c2 90 00 F16_TO_F32 r2, r0.h0 -40 00 0b 10 00 c3 90 00 F16_TO_F32 r3, `r0.h1 +40 00 0b 10 00 c3 90 00 F16_TO_F32 r3, ^r0.h1 00 00 00 00 00 c0 00 40 NOP.wait0126 -42 43 04 00 00 c0 a5 00 V2F32_TO_V2F16 r0, `r2, `r3 -40 c0 00 28 90 c0 a5 48 FADD.v2f16.wait r0, `r0.abs, 0x0.neg +42 43 04 00 00 c0 a5 00 V2F32_TO_V2F16 r0, ^r2, ^r3 +40 c0 00 28 90 c0 a5 48 FADD.v2f16.wait r0, ^r0.abs, 0x0.neg c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0 3c d0 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0 -40 db 05 04 00 c1 a1 00 MKVEC.v2i16 r1, `r0.h00, 0x3C000000.h10 +40 db 05 04 00 c1 a1 00 MKVEC.v2i16 r1, ^r0.h00, 0x3C000000.h10 f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0 -7b 0d 00 40 04 84 5e 08 LEA_BUF_IMM.slot1.wait0 @r4:r5, `r59, table:0xD, index:0x0 +7b 0d 00 40 04 84 5e 08 LEA_BUF_IMM.slot1.wait0 @r4:r5, ^r59, table:0xD, index:0x0 00 dd c0 08 14 c2 b2 00 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg -41 88 c0 00 04 c1 b2 00 FMA.f32 r1, `r1, u8, 0x0.neg -40 88 c0 00 04 c0 b2 10 FMA.f32.wait1 r0, `r0, u8, 0x0.neg -44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.end @r0:r1:r2, `r4, offset:0 -44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.end @r8:r9:r10:r11, `r4, offset:0 +41 88 c0 00 04 c1 b2 00 FMA.f32 r1, ^r1, u8, 0x0.neg +40 88 c0 00 04 c0 b2 10 FMA.f32.wait1 r0, ^r0, u8, 0x0.neg +44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.end @r0:r1:r2, ^r4, offset:0 +44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.end @r8:r9:r10:r11, ^r4, offset:0 00 00 00 c0 01 c0 45 48 BARRIER.slot7.wait 80 00 00 00 82 82 60 00 LOAD.i8.unsigned.slot0 @r2, u0, offset:0 80 00 00 08 82 82 60 00 LOAD.i16.unsigned.slot0 @r2, u0, offset:0 @@ -76,109 +76,109 @@ f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, 80 00 00 00 c2 82 60 00 LOAD.i8.h0.unsigned.slot0 @r2, u0, offset:0 80 14 00 08 92 82 60 00 LOAD.i16.h1.unsigned.slot0 @r2, u0, offset:20 80 00 00 08 82 82 60 00 LOAD.i16.unsigned.slot0 @r2, u0, offset:0 -42 00 0d 80 40 c2 90 00 FROUND.f32.rtn r2, `r2.neg -42 00 0b 00 40 c2 90 00 F16_TO_F32 r2, `r2.neg.h0 -42 00 0c c0 40 c2 90 00 F32_TO_S32.rtz r2, `r2.neg -42 00 0e e0 40 c2 90 00 V2F16_TO_V2S16.rtz r2, `r2.neg +42 00 0d 80 40 c2 90 00 FROUND.f32.rtn r2, ^r2.neg +42 00 0b 00 40 c2 90 00 F16_TO_F32 r2, ^r2.neg.h0 +42 00 0c c0 40 c2 90 00 F32_TO_S32.rtz r2, ^r2.neg +42 00 0e e0 40 c2 90 00 V2F16_TO_V2S16.rtz r2, ^r2.neg 02 00 0a c0 40 c4 90 00 F16_TO_S32.rtz r4, r2.neg.h00 -42 00 0a d0 40 c5 90 00 F16_TO_S32.rtz r5, `r2.neg.h10 -42 c0 c6 47 48 c2 14 01 FADD_IMM.f32 r2, `r2, #0x4847C6C0 -42 84 67 ac 70 c2 15 01 FADD_IMM.v2f16 r2, `r2, #0x70AC6784 -42 14 13 12 ad c2 12 01 IADD_IMM.v4i8 r2, `r2, #0xAD121314 -42 14 00 13 00 c2 11 01 IADD_IMM.v2i16 r2, `r2, #0x130014 -42 ab 4b 00 00 c2 10 01 IADD_IMM.i32 r2, `r2, #0x4BAB -43 42 c0 84 11 c2 f9 00 ICMP.v2s16.gt.m1 r2, `r3.h10, `r2.h10, 0x0 -43 42 c0 90 01 c2 f5 00 FCMP.v2f16.gt.m1 r2, `r3.h10, `r2.h00, 0x0 -42 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, `r2 +42 00 0a d0 40 c5 90 00 F16_TO_S32.rtz r5, ^r2.neg.h10 +42 c0 c6 47 48 c2 14 01 FADD_IMM.f32 r2, ^r2, #0x4847C6C0 +42 84 67 ac 70 c2 15 01 FADD_IMM.v2f16 r2, ^r2, #0x70AC6784 +42 14 13 12 ad c2 12 01 IADD_IMM.v4i8 r2, ^r2, #0xAD121314 +42 14 00 13 00 c2 11 01 IADD_IMM.v2i16 r2, ^r2, #0x130014 +42 ab 4b 00 00 c2 10 01 IADD_IMM.i32 r2, ^r2, #0x4BAB +43 42 c0 84 11 c2 f9 00 ICMP.v2s16.gt.m1 r2, ^r3.h10, ^r2.h10, 0x0 +43 42 c0 90 01 c2 f5 00 FCMP.v2f16.gt.m1 r2, ^r3.h10, ^r2.h00, 0x0 +42 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, ^r2 00 c0 c0 00 43 c1 f2 00 ICMP.v4u8.ne.i1 r1, r0.b0000, 0x0, 0x0 -41 03 00 00 00 c0 1f 50 BRANCHZ.reconverge `r1, offset:3 +41 03 00 00 00 c0 1f 50 BRANCHZ.reconverge ^r1, offset:3 00 03 00 00 20 c0 1f 50 BRANCHZ.reconverge r0.h0, offset:3 00 03 00 00 40 c0 1f 50 BRANCHZ.reconverge r0.h1, offset:3 00 03 00 00 00 c0 1f 50 BRANCHZ.reconverge r0, offset:3 c0 00 00 00 00 c0 10 01 IADD_IMM.i32 r0, 0x0, #0x0 c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1 80 00 27 20 00 c2 a3 01 SHADDX.u64 r2, u0, r0.w0, shift:0x2 -40 c9 00 10 00 c0 a0 00 IADD.u32 r0, `r0, 0x7060504.b0 +40 c9 00 10 00 c0 a0 00 IADD.u32 r0, ^r0, 0x7060504.b0 00 82 c0 80 03 c1 f0 00 ICMP.u32.ne.m1 r1, r0, u2, 0x0 04 00 00 00 00 c5 91 00 MOV.i32 r5, r4 04 00 00 00 00 c6 91 00 MOV.i32 r6, r4 04 00 00 00 00 c7 91 08 MOV.i32.wait0 r7, r4 -42 00 00 38 08 44 61 00 STORE.i128.slot0 @r4:r5:r6:r7, `r2, offset:0 -41 f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge `r1, offset:-8 -7d c0 00 08 10 bc a1 00 IADD.v2u16 r60.h1, `r61.h10, 0x0 -44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 -44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 -44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 -7c c0 12 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x1 -7c c0 02 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x0 +42 00 00 38 08 44 61 00 STORE.i128.slot0 @r4:r5:r6:r7, ^r2, offset:0 +41 f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge ^r1, offset:-8 +7d c0 00 08 10 bc a1 00 IADD.v2u16 r60.h1, ^r61.h10, 0x0 +44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, ^r4, ^r6, offset:0x0 +44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, ^r4, ^r6, offset:0x0 +44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, ^r4, ^r6, offset:0x0 +7c c0 12 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, ^r60, 0x0, table:0x2, index:0x1 +7c c0 02 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, ^r60, 0x0, table:0x2, index:0x0 82 81 00 28 f4 82 6a 00 LD_BUFFER.i64.unsigned.slot0 @r2:r3, u2, u1 80 81 00 68 f4 80 6a 00 LD_BUFFER.i64.unsigned.slot1 @r0:r1, u0, u1 84 81 00 a8 f4 a6 6a 00 LD_BUFFER.i64.unsigned.slot2 @r38:r39, u4, u1 83 81 00 a8 f4 a4 6a 00 LD_BUFFER.i64.unsigned.slot2 @r36:r37, u3, u1 83 84 00 28 f4 82 6a 00 LD_BUFFER.i64.unsigned.slot0 @r2:r3, u3, u4 -41 82 00 30 e6 82 6a 00 LD_BUFFER.i96.unsigned.slot0 @r2:r3:r4, `r1, u2 -40 83 00 30 e6 86 6a 08 LD_BUFFER.i96.unsigned.slot0.wait0 @r6:r7:r8, `r0, u3 -40 00 00 00 c0 c0 9c 40 FRCP.f32.wait0126 r0, `r0.neg.abs -40 44 80 00 00 c0 b8 00 MUX.i32.neg r0, `r0, `r4, u0 -40 44 80 00 01 c0 b8 00 MUX.i32 r0, `r0, `r4, u0 -40 44 80 00 02 c0 b8 00 MUX.i32.fp_zero r0, `r0, `r4, u0 -40 44 80 00 03 c0 b8 00 MUX.i32.bit r0, `r0, `r4, u0 +41 82 00 30 e6 82 6a 00 LD_BUFFER.i96.unsigned.slot0 @r2:r3:r4, ^r1, u2 +40 83 00 30 e6 86 6a 08 LD_BUFFER.i96.unsigned.slot0.wait0 @r6:r7:r8, ^r0, u3 +40 00 00 00 c0 c0 9c 40 FRCP.f32.wait0126 r0, ^r0.neg.abs +40 44 80 00 00 c0 b8 00 MUX.i32.neg r0, ^r0, ^r4, u0 +40 44 80 00 01 c0 b8 00 MUX.i32 r0, ^r0, ^r4, u0 +40 44 80 00 02 c0 b8 00 MUX.i32.fp_zero r0, ^r0, ^r4, u0 +40 44 80 00 03 c0 b8 00 MUX.i32.bit r0, ^r0, ^r4, u0 00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0 01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1 -40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, `r0 -41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, `r1, `r2, 0x0.neg, `r0 -42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, `r2, `r3, u4, u5 -42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, `r2, `r3, u4, u5 -42 43 84 85 04 c1 58 01 CSEL.s32.lt r1, `r2, `r3, u4, u5 +40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, ^r0 +41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, ^r1, ^r2, 0x0.neg, ^r0 +42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, ^r2, ^r3, u4, u5 +42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, ^r2, ^r3, u4, u5 +42 43 84 85 04 c1 58 01 CSEL.s32.lt r1, ^r2, ^r3, u4, u5 3d 00 00 12 b4 82 56 08 LD_VAR_SPECIAL.v2.f32.sample.clobber.slot0.wait0 @r2:r3, r61, index:0x0 3d 00 00 33 14 82 5d 08 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.center.retrieve.wait0 @r2:r3, r61, index:0x0 3d 00 00 33 84 80 5d 08 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.sample.store.wait0 @r0:r1, r61, index:0x0 3d 00 08 33 44 80 5d 08 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.centroid.store.wait0 @r0:r1, r61, index:0x8 -7c 7d 11 33 04 80 66 00 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, `r60, `r61, index:0x1, table:0x1 -40 3c 03 23 04 84 78 00 LD_TILE.v3.f16.slot0 @r4:r5, `r0, r60, r3 +7c 7d 11 33 04 80 66 00 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, ^r60, ^r61, index:0x1, table:0x1 +40 3c 03 23 04 84 78 00 LD_TILE.v3.f16.slot0 @r4:r5, ^r0, r60, r3 00 c9 00 20 10 81 a1 00 IADD.v2u16 r1.h1, r0.h10, 0x7060504.b11 -40 c0 00 08 10 41 a1 00 IADD.v2u16 r1.h0, `r0.h10, 0x0 +40 c0 00 08 10 41 a1 00 IADD.v2u16 r1.h0, ^r0.h10, 0x0 02 02 00 04 20 42 a1 00 IADD.v2u16 r2.h0, r2, r2.h10 -42 c0 05 00 00 c2 a1 00 MKVEC.v2i16 r2, `r2.h00, 0x0.h00 -77 c0 05 00 00 c2 a1 00 MKVEC.v2i16 r2, `r55.h00, 0x0.h00 -77 c0 05 10 00 c2 a1 00 MKVEC.v2i16 r2, `r55.h10, 0x0.h00 -c0 77 05 00 00 c2 a1 00 MKVEC.v2i16 r2, 0x0.h00, `r55.h00 -c0 77 05 04 00 c2 a1 00 MKVEC.v2i16 r2, 0x0.h00, `r55.h10 -77 00 14 00 00 c2 90 00 U16_TO_U32 r2, `r55.h00 -77 00 14 10 00 c2 90 00 U16_TO_U32 r2, `r55.h10 -77 00 15 00 00 c2 90 00 U16_TO_F32 r2, `r55.h00 -77 00 15 10 00 c2 90 00 U16_TO_F32 r2, `r55.h10 -77 00 04 00 00 c2 90 00 S16_TO_S32 r2, `r55.h00 -77 00 04 10 00 c2 90 00 S16_TO_S32 r2, `r55.h10 -c0 77 01 08 00 c2 a8 00 ISUB.s32 r2, 0x0, `r55.h0 -c0 77 01 0c 00 c2 a8 00 ISUB.s32 r2, 0x0, `r55.h1 +42 c0 05 00 00 c2 a1 00 MKVEC.v2i16 r2, ^r2.h00, 0x0.h00 +77 c0 05 00 00 c2 a1 00 MKVEC.v2i16 r2, ^r55.h00, 0x0.h00 +77 c0 05 10 00 c2 a1 00 MKVEC.v2i16 r2, ^r55.h10, 0x0.h00 +c0 77 05 00 00 c2 a1 00 MKVEC.v2i16 r2, 0x0.h00, ^r55.h00 +c0 77 05 04 00 c2 a1 00 MKVEC.v2i16 r2, 0x0.h00, ^r55.h10 +77 00 14 00 00 c2 90 00 U16_TO_U32 r2, ^r55.h00 +77 00 14 10 00 c2 90 00 U16_TO_U32 r2, ^r55.h10 +77 00 15 00 00 c2 90 00 U16_TO_F32 r2, ^r55.h00 +77 00 15 10 00 c2 90 00 U16_TO_F32 r2, ^r55.h10 +77 00 04 00 00 c2 90 00 S16_TO_S32 r2, ^r55.h00 +77 00 04 10 00 c2 90 00 S16_TO_S32 r2, ^r55.h10 +c0 77 01 08 00 c2 a8 00 ISUB.s32 r2, 0x0, ^r55.h0 +c0 77 01 0c 00 c2 a8 00 ISUB.s32 r2, 0x0, ^r55.h1 00 c0 c0 00 c0 c7 bd 00 MKVEC.v4i8 r7, r0.b3, 0x0.b0, 0x0 00 c0 c0 00 80 c6 bd 00 MKVEC.v4i8 r6, r0.b2, 0x0.b0, 0x0 00 c0 c0 00 00 c4 bd 00 MKVEC.v4i8 r4, r0.b0, 0x0.b0, 0x0 -40 c0 c0 00 40 c5 bd 00 MKVEC.v4i8 r5, `r0.b1, 0x0.b0, 0x0 +40 c0 c0 00 40 c5 bd 00 MKVEC.v4i8 r5, ^r0.b1, 0x0.b0, 0x0 00 00 11 30 00 c7 90 00 U8_TO_F32 r7, r0.b3 00 00 11 20 00 c6 90 00 U8_TO_F32 r6, r0.b2 00 00 11 00 00 c4 90 00 U8_TO_F32 r4, r0.b0 -40 00 11 10 00 c5 90 00 U8_TO_F32 r5, `r0.b1 +40 00 11 10 00 c5 90 00 U8_TO_F32 r5, ^r0.b1 00 00 01 30 00 c7 90 00 S8_TO_F32 r7, r0.b3 00 00 01 20 00 c6 90 00 S8_TO_F32 r6, r0.b2 00 00 01 00 00 c4 90 00 S8_TO_F32 r4, r0.b0 -40 00 01 10 00 c5 90 00 S8_TO_F32 r5, `r0.b1 -42 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, `r2 -42 00 07 00 10 c2 90 00 V2S16_TO_V2F16 r2, `r2.h10 -42 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, `r2.h00 -7d 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, `r61.h00 -77 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, `r55.h00 -77 00 07 00 30 c2 90 00 V2S16_TO_V2F16 r2, `r55.h11 -77 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, `r55 -77 00 07 00 10 c2 90 00 V2S16_TO_V2F16 r2, `r55.h10 -42 00 07 00 c0 c2 90 00 V2S16_TO_V2F16 r2, `r2.b01 -42 00 03 00 40 c2 90 00 V2S8_TO_V2F16 r2, `r2.b01 -42 00 03 00 00 c2 90 00 V2S8_TO_V2F16 r2, `r2.b00 +40 00 01 10 00 c5 90 00 S8_TO_F32 r5, ^r0.b1 +42 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, ^r2 +42 00 07 00 10 c2 90 00 V2S16_TO_V2F16 r2, ^r2.h10 +42 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, ^r2.h00 +7d 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, ^r61.h00 +77 00 07 00 00 c2 90 00 V2S16_TO_V2F16 r2, ^r55.h00 +77 00 07 00 30 c2 90 00 V2S16_TO_V2F16 r2, ^r55.h11 +77 00 07 00 20 c2 90 00 V2S16_TO_V2F16 r2, ^r55 +77 00 07 00 10 c2 90 00 V2S16_TO_V2F16 r2, ^r55.h10 +42 00 07 00 c0 c2 90 00 V2S16_TO_V2F16 r2, ^r2.b01 +42 00 03 00 40 c2 90 00 V2S8_TO_V2F16 r2, ^r2.b01 +42 00 03 00 00 c2 90 00 V2S8_TO_V2F16 r2, ^r2.b00 00 00 03 00 40 c2 90 00 V2S8_TO_V2F16 r2, r0.b01 -40 00 03 00 e0 c3 90 00 V2S8_TO_V2F16 r3, `r0.b23 +40 00 03 00 e0 c3 90 00 V2S8_TO_V2F16 r3, ^r0.b23 00 00 03 00 20 c1 90 00 V2S8_TO_V2F16 r1, r0.b20 -40 00 03 00 60 c0 90 00 V2S8_TO_V2F16 r0, `r0.b21 +40 00 03 00 60 c0 90 00 V2S8_TO_V2F16 r0, ^r0.b21 3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0 3d 00 10 72 18 84 5c 00 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10 @@ -190,26 +190,26 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0 00 00 00 00 00 c0 00 48 NOP.wait 81 0b 80 33 04 8e 78 00 LD_TILE.v4.f16.slot0 @r14:r15, u1, r11, u0 0b 00 04 00 00 cc 91 00 CLZ.u32 r12, r11 -82 4c c0 52 00 cc b4 00 RSHIFT_XOR.i32.not_result r12, u2, `r12.b00, 0x0 -4b c0 4c 10 01 cb b4 08 LSHIFT_AND.i32.wait0 r11, `r11, 0x0.b00, `r12 -4f 49 00 28 00 c9 a5 00 FADD.v2f16 r9, `r15, `r9 -4e 48 00 28 00 c8 a5 00 FADD.v2f16 r8, `r14, `r8 +82 4c c0 52 00 cc b4 00 RSHIFT_XOR.i32.not_result r12, u2, ^r12.b00, 0x0 +4b c0 4c 10 01 cb b4 08 LSHIFT_AND.i32.wait0 r11, ^r11, 0x0.b00, ^r12 +4f 49 00 28 00 c9 a5 00 FADD.v2f16 r9, ^r15, ^r9 +4e 48 00 28 00 c8 a5 00 FADD.v2f16 r8, ^r14, ^r8 0b f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge r11, offset:-8 -4a 00 0c 00 00 fe 91 00 POPCOUNT.i32 r62, `r10 -7e 00 19 00 00 fe 90 00 U32_TO_F32 r62, `r62 -7e 7e 04 00 00 fe a5 00 V2F32_TO_V2F16 r62, `r62, `r62 -7e 00 01 00 00 fe 9c 00 FRCP.f16 r62, `r62.h00 -49 3e c0 22 04 c9 b3 30 FMA.v2f16.wait12 r9, `r9, r62.h00, 0x0.neg -47 43 00 00 00 c3 a4 00 FADD.f32 r3, `r7, `r3 -43 09 00 08 00 c3 a4 40 FADD.f32.wait0126 r3, `r3, r9.h1 +4a 00 0c 00 00 fe 91 00 POPCOUNT.i32 r62, ^r10 +7e 00 19 00 00 fe 90 00 U32_TO_F32 r62, ^r62 +7e 7e 04 00 00 fe a5 00 V2F32_TO_V2F16 r62, ^r62, ^r62 +7e 00 01 00 00 fe 9c 00 FRCP.f16 r62, ^r62.h00 +49 3e c0 22 04 c9 b3 30 FMA.v2f16.wait12 r9, ^r9, r62.h00, 0x0.neg +47 43 00 00 00 c3 a4 00 FADD.f32 r3, ^r7, ^r3 +43 09 00 08 00 c3 a4 40 FADD.f32.wait0126 r3, ^r3, r9.h1 3c 03 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, r3, atest_datum.w0 -46 42 00 00 00 c2 a4 00 FADD.f32 r2, `r6, `r2 -44 40 00 00 00 c0 a4 00 FADD.f32 r0, `r4, `r0 -48 7e c0 22 04 ff b3 00 FMA.v2f16 r63, `r8, `r62.h00, 0x0.neg -45 41 00 00 00 c1 a4 00 FADD.f32 r1, `r5, `r1 -41 3f 00 08 00 c1 a4 00 FADD.f32 r1, `r1, r63.h1 -40 7f 00 04 00 c0 a4 00 FADD.f32 r0, `r0, `r63.h0 -42 49 00 04 00 c2 a4 48 FADD.f32.wait r2, `r2, `r9.h0 +46 42 00 00 00 c2 a4 00 FADD.f32 r2, ^r6, ^r2 +44 40 00 00 00 c0 a4 00 FADD.f32 r0, ^r4, ^r0 +48 7e c0 22 04 ff b3 00 FMA.v2f16 r63, ^r8, ^r62.h00, 0x0.neg +45 41 00 00 00 c1 a4 00 FADD.f32 r1, ^r5, ^r1 +41 3f 00 08 00 c1 a4 00 FADD.f32 r1, ^r1, r63.h1 +40 7f 00 04 00 c0 a4 00 FADD.f32 r0, ^r0, ^r63.h0 +42 49 00 04 00 c2 a4 48 FADD.f32.wait r2, ^r2, ^r9.h0 f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0 c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0 c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1