intel/fs: Add a SCRATCH_HEADER opcode

This opcode is responsible for setting up the buffer base address and
per-thread scratch space fields of a scratch message header.  For the
most part, it's a copy of g0 but some messages need us to zero out g0.2
and the bottom bits of g0.5.

This may actually fix a bug when nir_load/store_scratch is used.  The
docs say that the DWORD scattered messages respect the per-thread
scratch size specified in gN.3[3:0] in the message header but we've been
leaving it zero.  This may mean that we've been ignoring any scratch
reads/writes from a load/store_scratch intrinsic above the 1KB mark.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7084>
This commit is contained in:
Jason Ekstrand
2020-10-09 04:13:20 -05:00
committed by Marge Bot
parent 24b64c8408
commit 06ebf23283
6 changed files with 86 additions and 33 deletions

View File

@@ -475,6 +475,8 @@ enum opcode {
SHADER_OPCODE_GEN4_SCRATCH_WRITE,
SHADER_OPCODE_GEN7_SCRATCH_READ,
SHADER_OPCODE_SCRATCH_HEADER,
/**
* Gen8+ SIMD8 URB Read messages.
*/

View File

@@ -5429,42 +5429,15 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
fs_builder ubld = bld.exec_all().group(8, 0);
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(header, brw_imm_d(0));
if (is_stateless) {
/* Copy the per-thread scratch from g0 for bounds checking */
ubld.group(1, 0).AND(component(header, 3),
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
brw_imm_ud(0xf));
/* Both the typed and scattered byte/dword A32 messages take a buffer
* base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or
* MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d
* for more details.) This is conveniently where the HW places the
* scratch surface base address.
*
* From the SKL PRM Vol. 7 "Per-Thread Scratch Space":
*
* "When a thread becomes 'active' it is allocated a portion of
* scratch space, sized according to PerThreadScratchSpace. The
* starting location of each threads scratch space allocation,
* ScratchSpaceOffset, is passed in the thread payload in
* R0.5[31:10] and is specified as a 1KB-granular offset from the
* GeneralStateBaseAddress. The computation of ScratchSpaceOffset
* includes the starting address of the stages scratch space
* allocation, as programmed by ScratchSpaceBasePointer."
*
* The base address is passed in bits R0.5[31:10] and the bottom 10
* bits of R0.5 are used for other things. Therefore, we have to
* mask off the bottom 10 bits so that we don't get a garbage base
* address.
*/
ubld.group(1, 0).AND(component(header, 5),
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
brw_imm_ud(0xfffffc00));
}
assert(!is_surface_access);
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
} else {
ubld.MOV(header, brw_imm_d(0));
if (is_surface_access)
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
}
}
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
fs_reg payload, payload2;

View File

@@ -513,6 +513,7 @@ private:
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);

View File

@@ -1533,6 +1533,76 @@ fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}
/* The A32 messages take a buffer base address in header.5:[31:0] (See
* MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
* and OWord block messages in the SKL PRM Vol. 2d for more details.)
* Unfortunately, there are a number of subtle differences:
*
* For the block read/write messages:
*
* - We always stomp header.2 to fill in the actual scratch address (in
* units of OWORDs) so we don't care what's in there.
*
* - They rely on the per-thread scratch space value in header.3[3:0] to
* do bounds checking so that needs to be valid. The upper bits of
* header.3 are ignored, though, so we can copy all of g0.3.
*
* - They ignore header.5[9:0] and assume the address is 1KB aligned.
*
*
* For the byte/dword scattered read/write messages:
*
* - We want header.2 to be zero because that gets added to the per-channel
* offset in the non-header portion of the message.
*
* - Contrary to what the docs claim, they don't do any bounds checking so
* the value of header.3[3:0] doesn't matter.
*
* - They consider all of header.5 for the base address and header.5[9:0]
* are not ignored. This means that we can't copy g0.5 verbatim because
* g0.5[9:0] contains the FFTID on most platforms. Instead, we have to
* use an AND to mask off the bottom 10 bits.
*
*
* For block messages, just copying g0 gives a valid header because all the
* garbage gets ignored except for header.2 which we stomp as part of message
* setup. For byte/dword scattered messages, we can just zero out the header
* and copy over the bits we need from g0.5. This opcode, however, tries to
* satisfy the requirements of both by starting with 0 and filling out the
* information required by either set of opcodes.
*/
void
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
{
   /* Emits three instructions that build a scratch message header in dst:
    * a full-register zeroing MOV followed by two scalar ANDs that pull the
    * needed fields out of g0.  The caller must have emitted this opcode as
    * SIMD8 with force_writemask_all, targeting a GRF.
    */
   assert(inst->exec_size == 8);
   assert(inst->force_writemask_all);
   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   /* Gen12+ takes the SWSB path below; earlier gens get the no_dd_clear /
    * no_dd_check bits set on the individual instructions instead.
    */
   const bool uses_swsb = devinfo->gen >= 12;

   dst.type = BRW_REGISTER_TYPE_UD;

   /* Zero the whole header first. */
   brw_inst *const zero_mov = brw_MOV(p, dst, brw_imm_ud(0));
   if (uses_swsb) {
      brw_set_default_swsb(p, tgl_swsb_null());
   } else {
      brw_inst_set_no_dd_clear(p->devinfo, zero_mov, true);
   }

   /* Both remaining writes touch a single dword each. */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);

   /* header.3[3:0] <- g0.3[3:0] (per-thread scratch space size) */
   const struct brw_reg g0_3 =
      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
   brw_inst *const size_and =
      brw_AND(p, suboffset(dst, 3), g0_3, brw_imm_ud(INTEL_MASK(3, 0)));
   if (!uses_swsb) {
      /* Middle write of the sequence: carries both bits. */
      brw_inst_set_no_dd_clear(p->devinfo, size_and, true);
      brw_inst_set_no_dd_check(p->devinfo, size_and, true);
   }

   /* header.5[31:10] <- g0.5[31:10] (scratch base address); the low ten
    * bits of g0.5 are masked away.
    */
   const struct brw_reg g0_5 =
      retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD);
   brw_inst *const base_and =
      brw_AND(p, suboffset(dst, 5), g0_5, brw_imm_ud(INTEL_MASK(31, 10)));
   if (!uses_swsb) {
      /* Last write of the sequence: only no_dd_check is set. */
      brw_inst_set_no_dd_check(p->devinfo, base_and, true);
   }
}
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
struct brw_reg dst,
@@ -2265,6 +2335,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
fill_count++;
break;
case SHADER_OPCODE_SCRATCH_HEADER:
generate_scratch_header(inst, dst);
break;
case SHADER_OPCODE_MOV_INDIRECT:
generate_mov_indirect(inst, dst, src[0], src[1]);
break;

View File

@@ -327,6 +327,7 @@ namespace {
case BRW_OPCODE_LINE:
case BRW_OPCODE_NOP:
case SHADER_OPCODE_CLUSTER_BROADCAST:
case SHADER_OPCODE_SCRATCH_HEADER:
case FS_OPCODE_DDX_COARSE:
case FS_OPCODE_DDX_FINE:
case FS_OPCODE_DDY_COARSE:

View File

@@ -349,6 +349,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
return "gen4_scratch_write";
case SHADER_OPCODE_GEN7_SCRATCH_READ:
return "gen7_scratch_read";
case SHADER_OPCODE_SCRATCH_HEADER:
return "scratch_header";
case SHADER_OPCODE_URB_WRITE_SIMD8:
return "gen8_urb_write_simd8";
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: