From 566dc4d74021e7d7a50e30a34a7061b12f65160f Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Tue, 6 Apr 2021 13:14:26 -0700 Subject: [PATCH] intel/eu: Add instruction compaction support on XeHP. This patch includes a number of reworks and fixes squashed in by Nanley Chery, Sagar Ghuge, Jordan Justen and Francisco Jerez. Reviewed-by: Francisco Jerez Part-of: --- src/intel/compiler/brw_eu_compact.c | 192 +++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c index 5ed14651ad5..1841c95bfc1 100644 --- a/src/intel/compiler/brw_eu_compact.c +++ b/src/intel/compiler/brw_eu_compact.c @@ -821,6 +821,44 @@ static const uint16_t gfx12_src1_index_table[16] = { 0b100011010001, /* -r<4;4,1> */ }; +static const uint16_t xehp_src0_index_table[16] = { /* XeHP src0 index table: compaction_state_init() installs it as c->src0_index_table when devinfo->verx10 >= 125. */ + 0b000100000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b000100000010, /* -r<1;1,0> */ + 0b000100000001, /* (abs)r<1;1,0> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001100000000, /* r<4;1,0> */ + 0b000100000011, /* -(abs)r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ + 0b010001100000, /* r<8;8,0> */ + 0b000101000000, /* r<1;4,0> */ + 0b010001001000, /* r<8;4,2> */ + 0b001000000010, /* -r<2;1,0> */ +}; + +static const uint16_t xehp_src1_index_table[16] = { /* XeHP src1 index table: compaction_state_init() installs it as c->src1_index_table when devinfo->verx10 >= 125. */ + 0b000001000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b100001000000, /* -r<1;1,0> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010001000000, /* (abs)r<1;1,0> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011000000, /* r<4;1,0> */ + 0b110001000000, /* -(abs)r<1;1,0> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b000100011000, /* r<8;8,0> */ + 0b100010000000, /* -r<2;1,0> */ + 0b100000001001, /* -r<0;2,1> */ + 
0b100001000100, /* -r[a]<1;1,0> */ +}; + /* This is actually the control index table for Cherryview (26 bits), but the * only difference from Broadwell (24 bits) is that it has two extra 0-bits at * the start. @@ -883,6 +921,41 @@ static const uint64_t gfx12_3src_control_index_table[32] = { 0b000001001000010101010000000000000011, /* (8|M0) (sat)arf<1>:f :f :f :f */ }; +static const uint64_t xehp_3src_control_index_table[32] = { /* XeHP 3-src control index table: each entry is the 37-bit key that set_3src_control_index() packs from the instruction and that set_uncompacted_3src_control_index() scatters back. */ + 0b0000010010100010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b0000010010100010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000010011, /* (W) (8|M8) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000000, /* (W) (1|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b0000010010000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 
0b0000010010000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b0000000100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b */ + 0b0000000000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub {Atomic} */ + 0b0000100100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b {Atomic} */ + 0b0000100000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub {Atomic} */ + 0b0000100100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b */ + 0b0000000000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub */ + 0b0000000100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b {Atomic} */ + 0b0000100000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub */ + 0b0000101101111010101000100000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf {Atomic} */ + 0b0000101101111010101000000000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf */ +}; + static const uint32_t gfx12_3src_source_index_table[32] = { 0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */ 0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */ @@ -918,6 +991,44 @@ static const uint32_t gfx12_3src_source_index_table[32] = { 0b101001100101101000011, /* grf<8;1> arf<8;1> -grf<1> */ }; +static const uint32_t xehp_3src_source_index_table[32] = { /* XeHP 3-src source index table: used in place of gfx12_3src_source_index_table by set_3src_source_index() and set_uncompacted_3src_source_index() when devinfo->verx10 >= 125. */ + 0b100100000001100000000, /* grf<0;0> grf<1;0> grf<0> */ + 0b100100000001000000001, /* arf<1;0> grf<1;0> grf<0> */ + 0b101100000001100000001, /* grf<1;0> grf<1;0> grf<1> */ + 0b100100000001100000001, /* grf<1;0> grf<1;0> grf<0> */ + 0b101100000000100000001, /* grf<1;0> grf<0;0> grf<1> */ + 0b101100000001100001001, /* -grf<1;0> grf<1;0> grf<1> */ + 0b101000000001100000001, /* grf<1;0> arf<1;0> grf<1> */ + 0b101100000001100000000, /* grf<0;0> grf<1;0> grf<1> */ + 0b100000000001100000000, /* grf<0;0> arf<1;0> grf<0> */ + 0b101100000101100000001, /* grf<1;0> grf<1;0> -grf<1> */ + 
0b101100010001100000001, /* grf<1;0> -grf<1;0> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100000000001100000001, /* grf<1;0> arf<1;0> grf<0> */ + 0b100100010001100000000, /* grf<0;0> -grf<1;0> grf<0> */ + 0b100100010001100000001, /* grf<1;0> -grf<1;0> grf<0> */ + 0b100100000001100001001, /* -grf<1;0> grf<1;0> grf<0> */ + 0b100100000000100000001, /* grf<1;0> grf<0;0> grf<0> */ + 0b100100000001100001000, /* -grf<0;0> grf<1;0> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> + * dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:f grf:bf grf:bf + */ + 0b101100010001100000000, /* grf<0;0> -grf<1;0> grf<1> */ + 0b100100000101100000000, /* grf<0;0> grf<1;0> -grf<0> */ + 0b101000000001100000000, /* grf<0;0> arf<1;0> grf<1> */ + 0b100100000101100000001, /* grf<1;0> grf<1;0> -grf<0> */ + 0b101100000101100001001, /* -grf<1;0> grf<1;0> -grf<1> */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ +}; + static const uint32_t gfx12_3src_subreg_table[32] = { 0b00000000000000000000, /* .0 .0 .0 .0 */ 0b00100000000000000000, /* .0 .0 .0 .4 */ @@ -1161,7 +1272,34 @@ set_3src_control_index(const struct gen_device_info *devinfo, { assert(devinfo->ver >= 8); - if (devinfo->ver >= 12) { + if (devinfo->verx10 >= 125) { + uint64_t uncompacted = /* 37b/XeHP+ (the TGL+ key in the next branch is 36b) */ + (brw_inst_bits(src, 95, 92) << 33) | /* 4b */ + (brw_inst_bits(src, 90, 88) << 30) | /* 3b */ + (brw_inst_bits(src, 82, 80) << 27) | /* 3b */ + (brw_inst_bits(src, 50, 
50) << 26) | /* 1b */ + (brw_inst_bits(src, 49, 48) << 24) | /* 2b */ + (brw_inst_bits(src, 42, 40) << 21) | /* 3b */ + (brw_inst_bits(src, 39, 39) << 20) | /* 1b */ + (brw_inst_bits(src, 38, 36) << 17) | /* 3b */ + (brw_inst_bits(src, 34, 34) << 16) | /* 1b */ + (brw_inst_bits(src, 33, 33) << 15) | /* 1b */ + (brw_inst_bits(src, 32, 32) << 14) | /* 1b */ + (brw_inst_bits(src, 31, 31) << 13) | /* 1b */ + (brw_inst_bits(src, 28, 28) << 12) | /* 1b */ + (brw_inst_bits(src, 27, 24) << 8) | /* 4b */ + (brw_inst_bits(src, 23, 23) << 7) | /* 1b */ + (brw_inst_bits(src, 22, 22) << 6) | /* 1b */ + (brw_inst_bits(src, 21, 19) << 3) | /* 3b */ + (brw_inst_bits(src, 18, 16)); /* 3b */ + + for (unsigned i = 0; i < ARRAY_SIZE(xehp_3src_control_index_table); i++) { /* linear search: the position of the matching entry becomes the compacted control index */ + if (xehp_3src_control_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_control_index(devinfo, dst, i); + return true; + } + } + } else if (devinfo->ver >= 12) { uint64_t uncompacted = /* 36b/TGL+ */ (brw_inst_bits(src, 95, 92) << 32) | /* 4b */ (brw_inst_bits(src, 90, 88) << 29) | /* 3b */ @@ -1233,8 +1371,15 @@ set_3src_source_index(const struct gen_device_info *devinfo, (brw_inst_bits(src, 43, 43) << 1) | /* 1b */ (brw_inst_bits(src, 35, 35)); /* 1b */ - for (unsigned i = 0; i < ARRAY_SIZE(gfx12_3src_source_index_table); i++) { - if (gfx12_3src_source_index_table[i] == uncompacted) { + const uint32_t *three_src_source_index_table = + devinfo->verx10 >= 125 ? + xehp_3src_source_index_table : gfx12_3src_source_index_table; + const uint32_t three_src_source_index_table_len = + devinfo->verx10 >= 125 ? 
ARRAY_SIZE(xehp_3src_source_index_table) : + ARRAY_SIZE(gfx12_3src_source_index_table); /* table pointer and length are chosen by the same verx10 >= 125 test, so they always describe the same array */ + + for (unsigned i = 0; i < three_src_source_index_table_len; i++) { + if (three_src_source_index_table[i] == uncompacted) { brw_compact_inst_set_3src_source_index(devinfo, dst, i); return true; } @@ -1885,7 +2030,30 @@ set_uncompacted_3src_control_index(const struct compaction_state *c, const struct gen_device_info *devinfo = c->devinfo; assert(devinfo->ver >= 8); - if (devinfo->ver >= 12) { + if (devinfo->verx10 >= 125) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = xehp_3src_control_index_table[compacted]; /* the writes below scatter the 37-bit entry back into the instruction, using the same bit ranges and shifts that set_3src_control_index() packs for XeHP */ + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 33)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 30) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 27) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 26) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 24) & 0x3); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + + } else if (devinfo->ver >= 12) { uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); uint64_t uncompacted = gfx12_3src_control_index_table[compacted]; @@ -1928,7 +2096,10 @@ set_uncompacted_3src_source_index(const struct 
gen_device_info *devinfo, uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src); if (devinfo->ver >= 12) { - uint32_t uncompacted = gfx12_3src_source_index_table[compacted]; + const uint32_t *three_src_source_index_table = + devinfo->verx10 >= 125 ? + xehp_3src_source_index_table : gfx12_3src_source_index_table; /* XeHP swaps only the lookup table; the bit scatter that follows is shared with Gfx12 */ + uint32_t uncompacted = three_src_source_index_table[compacted]; brw_inst_set_bits(dst, 114, 114, (uncompacted >> 20)); brw_inst_set_bits(dst, 113, 112, (uncompacted >> 18) & 0x3); @@ -2213,6 +2384,8 @@ compaction_state_init(struct compaction_state *c, assert(gfx12_subreg_table[ARRAY_SIZE(gfx12_subreg_table) - 1] != 0); assert(gfx12_src0_index_table[ARRAY_SIZE(gfx12_src0_index_table) - 1] != 0); assert(gfx12_src1_index_table[ARRAY_SIZE(gfx12_src1_index_table) - 1] != 0); + assert(xehp_src0_index_table[ARRAY_SIZE(xehp_src0_index_table) - 1] != 0); + assert(xehp_src1_index_table[ARRAY_SIZE(xehp_src1_index_table) - 1] != 0); c->devinfo = devinfo; switch (devinfo->ver) { @@ -2220,8 +2393,13 @@ compaction_state_init(struct compaction_state *c, c->control_index_table = gfx12_control_index_table;; c->datatype_table = gfx12_datatype_table; c->subreg_table = gfx12_subreg_table; - c->src0_index_table = gfx12_src0_index_table; - c->src1_index_table = gfx12_src1_index_table; + if (devinfo->verx10 >= 125) { /* XeHP overrides only the src0/src1 tables; the gfx12 control, datatype and subreg tables assigned above are kept */ + c->src0_index_table = xehp_src0_index_table; + c->src1_index_table = xehp_src1_index_table; + } else { + c->src0_index_table = gfx12_src0_index_table; + c->src1_index_table = gfx12_src1_index_table; + } break; case 11: c->control_index_table = gfx8_control_index_table;