intel/fs: drop FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7

We can lower FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD into other more
generic sends and drop this internal opcode.

The idea behind this change is to allow bindless surfaces to be used
for UBO pulls, which is why it's interesting to be able to reuse
setup_surface_descriptors(). That will come in a later change.

No shader-db changes on TGL & DG2.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20416>
This commit is contained in:
Lionel Landwerlin
2022-12-21 20:16:27 +02:00
committed by Marge Bot
parent 5bc91550d1
commit 13cca48920
9 changed files with 154 additions and 191 deletions

View File

@@ -559,7 +559,6 @@ enum opcode {
FS_OPCODE_PIXEL_X,
FS_OPCODE_PIXEL_Y,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
FS_OPCODE_SET_SAMPLE_ID,
@@ -888,6 +887,17 @@ enum tex_logical_srcs {
TEX_LOGICAL_NUM_SRCS,
};
/**
 * Source slots of the logical FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
 * instruction, consumed by fs_visitor::lower_uniform_pull_constant_loads().
 */
enum pull_uniform_constant_srcs {
/** Surface binding table index */
PULL_UNIFORM_CONSTANT_SRC_SURFACE,
/** Byte offset into the surface (must be an immediate) */
PULL_UNIFORM_CONSTANT_SRC_OFFSET,
/** Size of the pull in bytes (must be an immediate) */
PULL_UNIFORM_CONSTANT_SRC_SIZE,
/** Total number of sources */
PULL_UNIFORM_CONSTANT_SRCS,
};
enum surface_logical_srcs {
/** Surface binding table index */
SURFACE_LOGICAL_SRC_SURFACE,

View File

@@ -248,7 +248,6 @@ fs_inst::is_control_source(unsigned arg) const
{
switch (opcode) {
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
return arg == 0;
@@ -306,9 +305,6 @@ fs_inst::is_payload(unsigned arg) const
case SHADER_OPCODE_BARRIER:
return arg == 0;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return arg == 1;
case SHADER_OPCODE_SEND:
return arg == 2 || arg == 3;
@@ -864,12 +860,6 @@ fs_inst::size_read(int arg) const
return 1;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
/* The payload is actually stored in src1 */
if (arg == 1)
return mlen * REG_SIZE;
break;
case FS_OPCODE_LINTERP:
if (arg == 1)
return 16;
@@ -2460,8 +2450,14 @@ fs_visitor::lower_constant_loads()
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
const unsigned base = pull_index * 4;
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
srcs, PULL_UNIFORM_CONSTANT_SRCS);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
@@ -3678,106 +3674,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds()
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
/**
* Turns the generic expression-style uniform pull constant load instruction
* into a hardware-specific series of instructions for loading a pull
* constant.
*
* The expression style allows the CSE pass before this to optimize out
* repeated loads from the same offset, and gives the pre-register-allocation
* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
*
* Note that execution masking for setting up pull constant loads is special:
* the channels that need to be written are unrelated to the current execution
* mask, since a later instruction will use one of the result channels as a
* source operand for all 8 or 16 of its channels.
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
{
foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
continue;
/* src[0] holds the surface binding table index, src[1] the byte offset
 * (required to be an immediate).
 */
const fs_reg& surface = inst->src[0];
const fs_reg& offset_B = inst->src[1];
assert(offset_B.file == IMM);
if (devinfo->has_lsc) {
/* LSC path: rewrite the instruction in place into a SHADER_OPCODE_SEND
 * to the UGM shared function — a single-lane transposed D32 block load
 * of size_written / 4 dwords from an A32 BTI address.
 */
const fs_builder ubld =
fs_builder(this, block, inst).group(8, 0).exec_all();
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(payload, offset_B);
inst->sfid = GFX12_SFID_UGM;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1 /* simd_size */,
LSC_ADDR_SURFTYPE_BTI,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,
inst->size_written / 4,
true /* transpose */,
LSC_CACHE_LOAD_L1STATE_L3MOCS,
true /* has_dest */);
fs_reg ex_desc;
if (surface.file == IMM) {
/* Known surface index: encode it directly into an immediate
 * extended descriptor.
 */
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
} else {
/* We only need the first component for the payload so we can use
 * one of the other components for the extended descriptor
 */
ex_desc = component(payload, 1);
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
}
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
inst->ex_mlen = 0;
inst->header_size = 0;
inst->send_has_side_effects = false;
inst->send_is_volatile = true;
/* The transposed (block) load executes as a single channel. */
inst->exec_size = 1;
/* Finally, the payload */
inst->resize_sources(3);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = ex_desc;
inst->src[2] = payload;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else if (devinfo->ver >= 7) {
/* Gfx7+ (pre-LSC): build a one-register message header — a copy of g0
 * with the offset (in 16B oword units, hence / 16) written to dword 2 —
 * and switch to the hardware-specific opcode; the generator emits the
 * actual send.
 */
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
ubld.group(8, 0).MOV(payload,
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
ubld.group(1, 0).MOV(component(payload, 2),
brw_imm_ud(offset_B.ud / 16));
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
inst->src[1] = payload;
inst->header_size = 1;
inst->mlen = 1;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else {
/* Before register allocation, we didn't tell the scheduler about the
 * MRF we use. We know it's safe to use this MRF because nothing
 * else does except for register spill/unspill, which generates and
 * uses its MRF within a single IR instruction.
 */
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
inst->mlen = 1;
}
}
}
bool
fs_visitor::lower_load_payload()
{

View File

@@ -621,10 +621,6 @@ private:
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
struct brw_reg dst,
struct brw_reg surf_index,
struct brw_reg payload);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);

View File

@@ -1553,67 +1553,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
read_offset, surf_index);
}
/**
 * Generate the pre-LSC Gfx7+ uniform pull constant load: a SEND to the
 * constant cache data port performing an OWord block read into dst.
 *
 * With an immediate surface index the whole message descriptor is encoded
 * statically; otherwise the surface index is masked into a0.0 and the send
 * uses an indirect descriptor.
 */
void
fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index,
struct brw_reg payload)
{
assert(index.type == BRW_REGISTER_TYPE_UD);
assert(payload.file == BRW_GENERAL_REGISTER_FILE);
assert(type_sz(dst.type) == 4);
/* Only reachable on non-LSC platforms. */
assert(!devinfo->has_lsc);
if (index.file == BRW_IMMEDIATE_VALUE) {
const uint32_t surf_index = index.ud;
/* Emit the SEND with execution masking disabled. */
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_pop_insn_state(p);
brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE);
brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
/* One-register message; response length derived from size_written. */
brw_set_desc(p, send,
brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
REG_SIZE), true) |
brw_dp_desc(devinfo, surf_index,
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)));
} else {
/* Capture the current SWSB state so dependencies can be threaded
 * through the two instructions emitted below.
 */
const tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
/* a0.0 = surf_index & 0xff */
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
brw_set_dest(p, insn_and, addr);
brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
/* dst = send(payload, a0.0 | <descriptor>) */
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_send_indirect_message(
p, GFX6_SFID_DATAPORT_CONSTANT_CACHE,
retype(dst, BRW_REGISTER_TYPE_UD),
retype(payload, BRW_REGISTER_TYPE_UD), addr,
brw_message_desc(devinfo, 1,
DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
brw_dp_desc(devinfo, 0 /* surface */,
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)),
false /* EOT */);
brw_pop_insn_state(p);
}
}
void
fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
@@ -2294,12 +2233,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]);
send_count++;
break;
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
generate_varying_pull_constant_load_gfx4(inst, dst, src[0]);
send_count++;

View File

@@ -4776,9 +4776,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
const unsigned count = MIN2(instr->num_components - c,
(block_sz - base % block_sz) / type_size);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
packed_consts, surf_index,
brw_imm_ud(base & ~(block_sz - 1)));
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surf_index;
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
srcs, PULL_UNIFORM_CONSTANT_SRCS);
const fs_reg consts =
retype(byte_offset(packed_consts, base & (block_sz - 1)),

View File

@@ -1001,7 +1001,6 @@ namespace {
abort();
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
@@ -1036,6 +1035,14 @@ namespace {
case SHADER_OPCODE_SEND:
switch (info.sfid) {
case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
if (devinfo->ver >= 7) {
/* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
} else {
abort();
}
case GFX6_SFID_DATAPORT_RENDER_CACHE:
if (devinfo->ver >= 7) {
switch (brw_dp_desc_msg_type(devinfo, info.desc)) {

View File

@@ -2778,3 +2778,118 @@ fs_visitor::lower_logical_sends()
return progress;
}
/**
* Turns the generic expression-style uniform pull constant load instruction
* into a hardware-specific series of instructions for loading a pull
* constant.
*
* The expression style allows the CSE pass before this to optimize out
* repeated loads from the same offset, and gives the pre-register-allocation
* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
*
* Note that execution masking for setting up pull constant loads is special:
* the channels that need to be written are unrelated to the current execution
* mask, since a later instruction will use one of the result channels as a
* source operand for all 8 or 16 of its channels.
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
{
foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
continue;
/* Sources follow enum pull_uniform_constant_srcs; offset and size are
 * required to be immediates.
 */
const fs_reg& surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
const fs_reg& offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
const fs_reg& size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
assert(offset_B.file == IMM);
assert(size_B.file == IMM);
if (devinfo->has_lsc) {
/* LSC path: rewrite the instruction in place into a SHADER_OPCODE_SEND
 * to the UGM shared function — a single-lane transposed D32 block load
 * of size_written / 4 dwords from an A32 BTI address.
 */
const fs_builder ubld =
fs_builder(this, block, inst).group(8, 0).exec_all();
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(payload, offset_B);
inst->sfid = GFX12_SFID_UGM;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1 /* simd_size */,
LSC_ADDR_SURFTYPE_BTI,
LSC_ADDR_SIZE_A32,
1 /* num_coordinates */,
LSC_DATA_SIZE_D32,
inst->size_written / 4,
true /* transpose */,
LSC_CACHE_LOAD_L1STATE_L3MOCS,
true /* has_dest */);
fs_reg ex_desc;
if (surface.file == IMM) {
/* Known surface index: encode it directly into an immediate
 * extended descriptor.
 */
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
} else {
/* We only need the first component for the payload so we can use
 * one of the other components for the extended descriptor
 */
ex_desc = component(payload, 1);
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
}
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
inst->ex_mlen = 0;
inst->header_size = 0;
inst->send_has_side_effects = false;
inst->send_is_volatile = true;
/* The transposed (block) load executes as a single channel. */
inst->exec_size = 1;
/* Finally, the payload */
inst->resize_sources(3);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = ex_desc;
inst->src[2] = payload;
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else if (devinfo->ver >= 7) {
/* Gfx7+ (pre-LSC): lower directly to a SHADER_OPCODE_SEND to the
 * constant cache data port, with a one-register header (a copy of g0
 * carrying the oword offset, hence / 16, in dword 2).
 */
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
/* NOTE(review): the header is allocated through the member builder
 * `bld` while the surrounding instructions use `ubld`; vgrf() only
 * allocates a register, so this looks harmless, but confirm `ubld`
 * wasn't intended here.
 */
fs_reg header = bld.exec_all().group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
ubld.group(8, 0).MOV(header,
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
ubld.group(1, 0).MOV(component(header, 2),
brw_imm_ud(offset_B.ud / 16));
inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1;
inst->mlen = 1;
/* OWord block read descriptor; size is expressed in dwords. */
uint32_t desc =
brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
size_B.ud / 4, false /* write */);
inst->resize_sources(4);
/* Fills out the send's descriptor sources (presumably src[0]/src[1])
 * from the surface; no bindless surface handle is used here yet.
 */
setup_surface_descriptors(ubld, inst, desc,
inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE],
fs_reg() /* surface_handle */);
inst->src[2] = header;
inst->src[3] = fs_reg(); /* unused for reads */
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
} else {
/* Before register allocation, we didn't tell the scheduler about the
 * MRF we use. We know it's safe to use this MRF because nothing
 * else does except for register spill/unspill, which generates and
 * uses its MRF within a single IR instruction.
 */
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
inst->mlen = 1;
}
}
}

View File

@@ -326,7 +326,6 @@ schedule_node::set_latency_gfx7(bool is_haswell)
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
case VS_OPCODE_PULL_CONSTANT_LOAD:
/* testing using varying-index pull constants:
*
@@ -399,6 +398,11 @@ schedule_node::set_latency_gfx7(bool is_haswell)
break;
}
case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
/* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
latency = 200;
break;
case GFX6_SFID_DATAPORT_RENDER_CACHE:
switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:

View File

@@ -430,8 +430,6 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
return "uniform_pull_const";
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
return "uniform_pull_const_gfx7";
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
return "varying_pull_const_gfx4";
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: