intel/fs: Add a SCRATCH_HEADER opcode
This opcode is responsible for setting up the buffer base address and per-thread scratch space fields of a scratch message header. For the most part, it's a copy of g0 but some messages need us to zero out g0.2 and the bottom bits of g0.5.

This may actually fix a bug when nir_load/store_scratch is used. The docs say that the DWORD scattered messages respect the per-thread scratch size specified in gN.3[3:0] in the message header but we've been leaving it zero. This may mean that we've been ignoring any scratch reads/writes from a load/store_scratch intrinsic above the 1KB mark.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7084>
This commit is contained in:

committed by
Marge Bot

parent
24b64c8408
commit
06ebf23283
@@ -475,6 +475,8 @@ enum opcode {
|
||||
SHADER_OPCODE_GEN4_SCRATCH_WRITE,
|
||||
SHADER_OPCODE_GEN7_SCRATCH_READ,
|
||||
|
||||
SHADER_OPCODE_SCRATCH_HEADER,
|
||||
|
||||
/**
|
||||
* Gen8+ SIMD8 URB Read messages.
|
||||
*/
|
||||
|
@@ -5429,42 +5429,15 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
|
||||
if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
|
||||
fs_builder ubld = bld.exec_all().group(8, 0);
|
||||
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
ubld.MOV(header, brw_imm_d(0));
|
||||
if (is_stateless) {
|
||||
/* Copy the per-thread scratch from g0 for bounds checking */
|
||||
ubld.group(1, 0).AND(component(header, 3),
|
||||
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
||||
brw_imm_ud(0xf));
|
||||
|
||||
/* Both the typed and scattered byte/dword A32 messages take a buffer
|
||||
* base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or
|
||||
* MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d
|
||||
* for more details.) This is conveniently where the HW places the
|
||||
* scratch surface base address.
|
||||
*
|
||||
* From the SKL PRM Vol. 7 "Per-Thread Scratch Space":
|
||||
*
|
||||
* "When a thread becomes 'active' it is allocated a portion of
|
||||
* scratch space, sized according to PerThreadScratchSpace. The
|
||||
* starting location of each thread’s scratch space allocation,
|
||||
* ScratchSpaceOffset, is passed in the thread payload in
|
||||
* R0.5[31:10] and is specified as a 1KB-granular offset from the
|
||||
* GeneralStateBaseAddress. The computation of ScratchSpaceOffset
|
||||
* includes the starting address of the stage’s scratch space
|
||||
* allocation, as programmed by ScratchSpaceBasePointer."
|
||||
*
|
||||
* The base address is passed in bits R0.5[31:10] and the bottom 10
|
||||
* bits of R0.5 are used for other things. Therefore, we have to
|
||||
* mask off the bottom 10 bits so that we don't get a garbage base
|
||||
* address.
|
||||
*/
|
||||
ubld.group(1, 0).AND(component(header, 5),
|
||||
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
|
||||
brw_imm_ud(0xfffffc00));
|
||||
}
|
||||
assert(!is_surface_access);
|
||||
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
|
||||
} else {
|
||||
ubld.MOV(header, brw_imm_d(0));
|
||||
if (is_surface_access)
|
||||
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
|
||||
}
|
||||
}
|
||||
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
|
||||
|
||||
fs_reg payload, payload2;
|
||||
|
@@ -513,6 +513,7 @@ private:
|
||||
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
|
||||
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
|
||||
void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
|
||||
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
|
||||
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
|
||||
struct brw_reg index,
|
||||
struct brw_reg offset);
|
||||
|
@@ -1533,6 +1533,76 @@ fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
|
||||
gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
|
||||
}
|
||||
|
||||
/* The A32 messages take a buffer base address in header.5:[31:0] (See
|
||||
* MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
|
||||
* and OWord block messages in the SKL PRM Vol. 2d for more details.)
|
||||
* Unfortunately, there are a number of subtle differences:
|
||||
*
|
||||
* For the block read/write messages:
|
||||
*
|
||||
* - We always stomp header.2 to fill in the actual scratch address (in
|
||||
* units of OWORDs) so we don't care what's in there.
|
||||
*
|
||||
* - They rely on the per-thread scratch space value in header.3[3:0] to do
|
||||
* bounds checking so that needs to be valid. The upper bits of
|
||||
* header.3 are ignored, though, so we can copy all of g0.3.
|
||||
*
|
||||
* - They ignore header.5[9:0] and assume the address is 1KB aligned.
|
||||
*
|
||||
*
|
||||
* For the byte/dword scattered read/write messages:
|
||||
*
|
||||
* - We want header.2 to be zero because that gets added to the per-channel
|
||||
* offset in the non-header portion of the message.
|
||||
*
|
||||
* - Contrary to what the docs claim, they don't do any bounds checking so
|
||||
* the value of header.3[3:0] doesn't matter.
|
||||
*
|
||||
* - They consider all of header.5 for the base address and header.5[9:0]
|
||||
* are not ignored. This means that we can't copy g0.5 verbatim because
|
||||
* g0.5[9:0] contains the FFTID on most platforms. Instead, we have to
|
||||
* use an AND to mask off the bottom 10 bits.
|
||||
*
|
||||
*
|
||||
* For block messages, just copying g0 gives a valid header because all the
|
||||
* garbage gets ignored except for header.2 which we stomp as part of message
|
||||
* setup. For byte/dword scattered messages, we can just zero out the header
|
||||
* and copy over the bits we need from g0.5. This opcode, however, tries to
|
||||
* satisfy the requirements of both by starting with 0 and filling out the
|
||||
* information required by either set of opcodes.
|
||||
*/
|
||||
/* Emit the code for SHADER_OPCODE_SCRATCH_HEADER: write zero to dst, then
 * AND g0.3 with the [3:0] mask into dst.3 and g0.5 with the [31:10] mask
 * into dst.5.
 *
 * inst must be SIMD8 with force_writemask_all set, and dst must live in the
 * general register file; both conditions are asserted.  dst is retyped to
 * UD before any instruction is emitted.
 *
 * NOTE(review): the default exec size (and, on gen12+, the default SWSB)
 * changed below are not restored in this function; presumably the caller
 * resets the default state for each generated instruction -- confirm.
 */
void
|
||||
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
|
||||
{
|
||||
assert(inst->exec_size == 8 && inst->force_writemask_all);
|
||||
assert(dst.file == BRW_GENERAL_REGISTER_FILE);
|
||||
|
||||
dst.type = BRW_REGISTER_TYPE_UD;
|
||||
|
||||
/* Start from an all-zero header; the two ANDs below fill in dst.3 and
 * dst.5 only.
 */
brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
|
||||
/* NOTE(review): on gen < 12 the three writes to dst are tagged with
 * no_dd_clear / no_dd_check (first: clear only, middle: both, last: check
 * only), which looks like destination dependency control chaining the
 * partial writes; on gen12+ the default SWSB is set to null instead --
 * confirm the intent against the PRM.
 */
if (devinfo->gen >= 12)
|
||||
brw_set_default_swsb(p, tgl_swsb_null());
|
||||
else
|
||||
brw_inst_set_no_dd_clear(p->devinfo, insn, true);
|
||||
|
||||
/* Copy the per-thread scratch space size from g0.3[3:0] */
|
||||
/* The default exec size becomes 1 from here on, so both ANDs below are
 * emitted under that default.
 */
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
||||
insn = brw_AND(p, suboffset(dst, 3),
|
||||
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
||||
brw_imm_ud(INTEL_MASK(3, 0)));
|
||||
if (devinfo->gen < 12) {
|
||||
brw_inst_set_no_dd_clear(p->devinfo, insn, true);
|
||||
brw_inst_set_no_dd_check(p->devinfo, insn, true);
|
||||
}
|
||||
|
||||
/* Copy the scratch base address from g0.5[31:10] */
|
||||
insn = brw_AND(p, suboffset(dst, 5),
|
||||
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
|
||||
brw_imm_ud(INTEL_MASK(31, 10)));
|
||||
if (devinfo->gen < 12)
|
||||
brw_inst_set_no_dd_check(p->devinfo, insn, true);
|
||||
}
|
||||
|
||||
void
|
||||
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
|
||||
struct brw_reg dst,
|
||||
@@ -2265,6 +2335,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
|
||||
fill_count++;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||
generate_scratch_header(inst, dst);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_MOV_INDIRECT:
|
||||
generate_mov_indirect(inst, dst, src[0], src[1]);
|
||||
break;
|
||||
|
@@ -327,6 +327,7 @@ namespace {
|
||||
case BRW_OPCODE_LINE:
|
||||
case BRW_OPCODE_NOP:
|
||||
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
||||
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||
case FS_OPCODE_DDX_COARSE:
|
||||
case FS_OPCODE_DDX_FINE:
|
||||
case FS_OPCODE_DDY_COARSE:
|
||||
|
@@ -349,6 +349,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
|
||||
return "gen4_scratch_write";
|
||||
case SHADER_OPCODE_GEN7_SCRATCH_READ:
|
||||
return "gen7_scratch_read";
|
||||
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||
return "scratch_header";
|
||||
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
||||
return "gen8_urb_write_simd8";
|
||||
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
|
||||
|
Reference in New Issue
Block a user