intel/fs: Add a SCRATCH_HEADER opcode
This opcode is responsible for setting up the buffer base address and per-thread scratch space fields of a scratch message header. For the most part, it's a copy of g0 but some messages need us to zero out g0.2 and the bottom bits of g0.5. This may actually fix a bug when nir_load/store_scratch is used. The docs say that the DWORD scattered messages respect the per-thread scratch size specified in gN.3[3:0] in the message header but we've been leaving it zero. This may mean that we've been ignoring any scratch reads/writes from a load/store_scratch intrinsic above the 1KB mark. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7084>
This commit is contained in:

committed by
Marge Bot

parent
24b64c8408
commit
06ebf23283
@@ -475,6 +475,8 @@ enum opcode {
|
|||||||
SHADER_OPCODE_GEN4_SCRATCH_WRITE,
|
SHADER_OPCODE_GEN4_SCRATCH_WRITE,
|
||||||
SHADER_OPCODE_GEN7_SCRATCH_READ,
|
SHADER_OPCODE_GEN7_SCRATCH_READ,
|
||||||
|
|
||||||
|
SHADER_OPCODE_SCRATCH_HEADER,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gen8+ SIMD8 URB Read messages.
|
* Gen8+ SIMD8 URB Read messages.
|
||||||
*/
|
*/
|
||||||
|
@@ -5429,41 +5429,14 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
|
|||||||
if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
|
if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
|
||||||
fs_builder ubld = bld.exec_all().group(8, 0);
|
fs_builder ubld = bld.exec_all().group(8, 0);
|
||||||
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
ubld.MOV(header, brw_imm_d(0));
|
|
||||||
if (is_stateless) {
|
if (is_stateless) {
|
||||||
/* Copy the per-thread scratch from g0 for bounds checking */
|
assert(!is_surface_access);
|
||||||
ubld.group(1, 0).AND(component(header, 3),
|
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
|
||||||
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
} else {
|
||||||
brw_imm_ud(0xf));
|
ubld.MOV(header, brw_imm_d(0));
|
||||||
|
if (is_surface_access)
|
||||||
/* Both the typed and scattered byte/dword A32 messages take a buffer
|
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
|
||||||
* base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or
|
|
||||||
* MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d
|
|
||||||
* for more details.) This is conveniently where the HW places the
|
|
||||||
* scratch surface base address.
|
|
||||||
*
|
|
||||||
* From the SKL PRM Vol. 7 "Per-Thread Scratch Space":
|
|
||||||
*
|
|
||||||
* "When a thread becomes 'active' it is allocated a portion of
|
|
||||||
* scratch space, sized according to PerThreadScratchSpace. The
|
|
||||||
* starting location of each thread’s scratch space allocation,
|
|
||||||
* ScratchSpaceOffset, is passed in the thread payload in
|
|
||||||
* R0.5[31:10] and is specified as a 1KB-granular offset from the
|
|
||||||
* GeneralStateBaseAddress. The computation of ScratchSpaceOffset
|
|
||||||
* includes the starting address of the stage’s scratch space
|
|
||||||
* allocation, as programmed by ScratchSpaceBasePointer."
|
|
||||||
*
|
|
||||||
* The base address is passed in bits R0.5[31:10] and the bottom 10
|
|
||||||
* bits of R0.5 are used for other things. Therefore, we have to
|
|
||||||
* mask off the bottom 10 bits so that we don't get a garbage base
|
|
||||||
* address.
|
|
||||||
*/
|
|
||||||
ubld.group(1, 0).AND(component(header, 5),
|
|
||||||
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
|
|
||||||
brw_imm_ud(0xfffffc00));
|
|
||||||
}
|
}
|
||||||
if (is_surface_access)
|
|
||||||
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
|
|
||||||
}
|
}
|
||||||
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
|
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
|
||||||
|
|
||||||
|
@@ -513,6 +513,7 @@ private:
|
|||||||
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
|
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
|
||||||
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
|
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
|
||||||
void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
|
void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
|
||||||
|
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
|
||||||
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
|
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
|
||||||
struct brw_reg index,
|
struct brw_reg index,
|
||||||
struct brw_reg offset);
|
struct brw_reg offset);
|
||||||
|
@@ -1533,6 +1533,76 @@ fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
|
|||||||
gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
|
gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The A32 messages take a buffer base address in header.5:[31:0] (See
|
||||||
|
* MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
|
||||||
|
* and OWord block messages in the SKL PRM Vol. 2d for more details.)
|
||||||
|
* Unfortunately, there are a number of subtle differences:
|
||||||
|
*
|
||||||
|
* For the block read/write messages:
|
||||||
|
*
|
||||||
|
* - We always stomp header.2 to fill in the actual scratch address (in
|
||||||
|
* units of OWORDs) so we don't care what's in there.
|
||||||
|
*
|
||||||
|
* - They rely on per-thread scratch space value in header.3[3:0] to do
|
||||||
|
* bounds checking so that needs to be valid. The upper bits of
|
||||||
|
* header.3 are ignored, though, so we can copy all of g0.3.
|
||||||
|
*
|
||||||
|
* - They ignore header.5[9:0] and assume the address is 1KB aligned.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* For the byte/dword scattered read/write messages:
|
||||||
|
*
|
||||||
|
* - We want header.2 to be zero because that gets added to the per-channel
|
||||||
|
* offset in the non-header portion of the message.
|
||||||
|
*
|
||||||
|
* - Contrary to what the docs claim, they don't do any bounds checking so
|
||||||
|
* the value of header.3[3:0] doesn't matter.
|
||||||
|
*
|
||||||
|
* - They consider all of header.5 for the base address and header.5[9:0]
|
||||||
|
* are not ignored. This means that we can't copy g0.5 verbatim because
|
||||||
|
* g0.5[9:0] contains the FFTID on most platforms. Instead, we have to
|
||||||
|
* use an AND to mask off the bottom 10 bits.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* For block messages, just copying g0 gives a valid header because all the
|
||||||
|
* garbage gets ignored except for header.2 which we stomp as part of message
|
||||||
|
* setup. For byte/dword scattered messages, we can just zero out the header
|
||||||
|
* and copy over the bits we need from g0.5. This opcode, however, tries to
|
||||||
|
* satisfy the requirements of both by starting with 0 and filling out the
|
||||||
|
* information required by either set of opcodes.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
|
||||||
|
{
|
||||||
|
/* The opcode is only valid as a SIMD8, force_writemask_all instruction
 * whose destination lives in the general register file.
 */
assert(inst->exec_size == 8 && inst->force_writemask_all);
|
||||||
|
assert(dst.file == BRW_GENERAL_REGISTER_FILE);
|
||||||
|

||||||
|
dst.type = BRW_REGISTER_TYPE_UD;
|
||||||
|

||||||
|
/* Start from a zeroed destination; the two ANDs below then fill in only
 * dwords 3 and 5 with the bits taken from g0.
 *
 * All three instructions write the same destination register. On
 * Gen12+ the default SWSB annotation is reset to null after the MOV; on
 * earlier gens the no_dd_clear/no_dd_check flags are set instead (clear
 * on the first write, both on the middle one, check on the last).
 * NOTE(review): presumably this avoids stalling on the destination
 * dependency between these back-to-back partial writes -- confirm
 * against the PRM description of NoDDClr/NoDDChk and SWSB.
 */
brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
|
||||||
|
if (devinfo->gen >= 12)
|
||||||
|
brw_set_default_swsb(p, tgl_swsb_null());
|
||||||
|
else
|
||||||
|
brw_inst_set_no_dd_clear(p->devinfo, insn, true);
|
||||||
|

||||||
|
/* Copy the per-thread scratch space size from g0.3[3:0] */
|
||||||
|
/* Both remaining instructions use a single-dword source (brw_vec1_grf),
 * so the default execution size is lowered to 1 for them.
 */
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
||||||
|
insn = brw_AND(p, suboffset(dst, 3),
|
||||||
|
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
||||||
|
brw_imm_ud(INTEL_MASK(3, 0)));
|
||||||
|
if (devinfo->gen < 12) {
|
||||||
|
brw_inst_set_no_dd_clear(p->devinfo, insn, true);
|
||||||
|
brw_inst_set_no_dd_check(p->devinfo, insn, true);
|
||||||
|
}
|
||||||
|

||||||
|
/* Copy the scratch base address from g0.5[31:10] */
|
||||||
|
insn = brw_AND(p, suboffset(dst, 5),
|
||||||
|
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
|
||||||
|
brw_imm_ud(INTEL_MASK(31, 10)));
|
||||||
|
if (devinfo->gen < 12)
|
||||||
|
brw_inst_set_no_dd_check(p->devinfo, insn, true);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
|
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
|
||||||
struct brw_reg dst,
|
struct brw_reg dst,
|
||||||
@@ -2265,6 +2335,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
|
|||||||
fill_count++;
|
fill_count++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||||
|
generate_scratch_header(inst, dst);
|
||||||
|
break;
|
||||||
|
|
||||||
case SHADER_OPCODE_MOV_INDIRECT:
|
case SHADER_OPCODE_MOV_INDIRECT:
|
||||||
generate_mov_indirect(inst, dst, src[0], src[1]);
|
generate_mov_indirect(inst, dst, src[0], src[1]);
|
||||||
break;
|
break;
|
||||||
|
@@ -327,6 +327,7 @@ namespace {
|
|||||||
case BRW_OPCODE_LINE:
|
case BRW_OPCODE_LINE:
|
||||||
case BRW_OPCODE_NOP:
|
case BRW_OPCODE_NOP:
|
||||||
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
||||||
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||||
case FS_OPCODE_DDX_COARSE:
|
case FS_OPCODE_DDX_COARSE:
|
||||||
case FS_OPCODE_DDX_FINE:
|
case FS_OPCODE_DDX_FINE:
|
||||||
case FS_OPCODE_DDY_COARSE:
|
case FS_OPCODE_DDY_COARSE:
|
||||||
|
@@ -349,6 +349,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
|
|||||||
return "gen4_scratch_write";
|
return "gen4_scratch_write";
|
||||||
case SHADER_OPCODE_GEN7_SCRATCH_READ:
|
case SHADER_OPCODE_GEN7_SCRATCH_READ:
|
||||||
return "gen7_scratch_read";
|
return "gen7_scratch_read";
|
||||||
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
||||||
|
return "scratch_header";
|
||||||
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
||||||
return "gen8_urb_write_simd8";
|
return "gen8_urb_write_simd8";
|
||||||
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
|
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
|
||||||
|
Reference in New Issue
Block a user