intel/fs: switch compute push constant loads to LSC
We're now able to load up to 8 GRFs in one send.

v2: Switch to use transpose + vector of up to 64 (Thanks Curro!)

v3: Increase parallelism by not reusing the same register for push constant offset (Curro)

v4: Drop dead ADD() instruction (Curro)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17555>
This commit is contained in:

committed by
Marge Bot

parent
1e7a131fd1
commit
aa65f83203
@@ -1586,67 +1586,56 @@ fs_visitor::assign_curb_setup()
|
|||||||
assert(devinfo->verx10 >= 125);
|
assert(devinfo->verx10 >= 125);
|
||||||
assert(uniform_push_length <= 1);
|
assert(uniform_push_length <= 1);
|
||||||
} else if (is_compute && devinfo->verx10 >= 125) {
|
} else if (is_compute && devinfo->verx10 >= 125) {
|
||||||
fs_builder ubld = bld.exec_all().group(8, 0).at(
|
assert(devinfo->has_lsc);
|
||||||
|
fs_builder ubld = bld.exec_all().group(1, 0).at(
|
||||||
cfg->first_block(), cfg->first_block()->start());
|
cfg->first_block(), cfg->first_block()->start());
|
||||||
|
|
||||||
/* The base address for our push data is passed in as R0.0[31:6]. We
|
/* The base offset for our push data is passed in as R0.0[31:6]. We have
|
||||||
* have to mask off the bottom 6 bits.
|
* to mask off the bottom 6 bits.
|
||||||
*/
|
*/
|
||||||
fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
ubld.group(1, 0).AND(base_addr,
|
ubld.AND(base_addr,
|
||||||
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
|
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
|
||||||
brw_imm_ud(INTEL_MASK(31, 6)));
|
brw_imm_ud(INTEL_MASK(31, 6)));
|
||||||
|
|
||||||
fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
||||||
ubld.MOV(header0, brw_imm_ud(0));
|
|
||||||
ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));
|
|
||||||
|
|
||||||
/* On Gfx12-HP we load constants at the start of the program using A32
|
/* On Gfx12-HP we load constants at the start of the program using A32
|
||||||
* stateless messages.
|
* stateless messages.
|
||||||
*/
|
*/
|
||||||
for (unsigned i = 0; i < uniform_push_length;) {
|
for (unsigned i = 0; i < uniform_push_length;) {
|
||||||
/* Limit ourselves to HW limit of 8 Owords (8 * 16bytes = 128 bytes
|
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
|
||||||
* or 4 registers).
|
unsigned num_regs = MIN2(uniform_push_length - i, 8);
|
||||||
*/
|
|
||||||
unsigned num_regs = MIN2(uniform_push_length - i, 4);
|
|
||||||
assert(num_regs > 0);
|
assert(num_regs > 0);
|
||||||
num_regs = 1 << util_logbase2(num_regs);
|
num_regs = 1 << util_logbase2(num_regs);
|
||||||
|
|
||||||
fs_reg header;
|
fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
if (i == 0) {
|
ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE));
|
||||||
header = header0;
|
|
||||||
} else {
|
|
||||||
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
||||||
ubld.MOV(header, brw_imm_ud(0));
|
|
||||||
ubld.group(1, 0).ADD(component(header, 2),
|
|
||||||
component(header0, 2),
|
|
||||||
brw_imm_ud(i * 2));
|
|
||||||
}
|
|
||||||
|
|
||||||
fs_reg srcs[4] = {
|
fs_reg srcs[4] = {
|
||||||
brw_imm_ud(0), /* desc */
|
brw_imm_ud(0), /* desc */
|
||||||
brw_imm_ud(0), /* ex_desc */
|
brw_imm_ud(0), /* ex_desc */
|
||||||
header, /* payload */
|
addr, /* payload */
|
||||||
fs_reg(), /* payload2 */
|
fs_reg(), /* payload2 */
|
||||||
};
|
};
|
||||||
|
|
||||||
fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
|
fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
|
||||||
BRW_REGISTER_TYPE_UD);
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
||||||
|
|
||||||
/* This instruction has to be run SIMD16 if we're filling more than a
|
send->sfid = GFX12_SFID_UGM;
|
||||||
* single register.
|
send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
||||||
*/
|
1 /* exec_size */,
|
||||||
unsigned send_width = MIN2(16, num_regs * 8);
|
LSC_ADDR_SURFTYPE_FLAT,
|
||||||
|
LSC_ADDR_SIZE_A32,
|
||||||
fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
|
1 /* num_coordinates */,
|
||||||
dest, srcs, 4);
|
LSC_DATA_SIZE_D32,
|
||||||
send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
num_regs * 8 /* num_channels */,
|
||||||
send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
|
true /* transpose */,
|
||||||
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
|
LSC_CACHE_LOAD_L1STATE_L3MOCS,
|
||||||
BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
|
true /* has_dest */);
|
||||||
send->header_size = 1;
|
send->header_size = 0;
|
||||||
send->mlen = 1;
|
send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
|
||||||
send->size_written = num_regs * REG_SIZE;
|
send->size_written =
|
||||||
|
lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
|
||||||
send->send_is_volatile = true;
|
send->send_is_volatile = true;
|
||||||
|
|
||||||
i += num_regs;
|
i += num_regs;
|
||||||
|
Reference in New Issue
Block a user