diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 870035d1a0b..b4128d2bf52 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -475,6 +475,8 @@ enum opcode { SHADER_OPCODE_GEN4_SCRATCH_WRITE, SHADER_OPCODE_GEN7_SCRATCH_READ, + SHADER_OPCODE_SCRATCH_HEADER, + /** * Gen8+ SIMD8 URB Read messages. */ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0ec2463c876..b79bdd866f8 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5429,41 +5429,14 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) if ((devinfo->gen < 9 && is_typed_access) || is_stateless) { fs_builder ubld = bld.exec_all().group(8, 0); header = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(header, brw_imm_d(0)); if (is_stateless) { - /* Copy the per-thread scratch from g0 for bounds checking */ - ubld.group(1, 0).AND(component(header, 3), - retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), - brw_imm_ud(0xf)); - - /* Both the typed and scattered byte/dword A32 messages take a buffer - * base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or - * MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d - * for more details.) This is conveniently where the HW places the - * scratch surface base address. - * - * From the SKL PRM Vol. 7 "Per-Thread Scratch Space": - * - * "When a thread becomes 'active' it is allocated a portion of - * scratch space, sized according to PerThreadScratchSpace. The - * starting location of each thread’s scratch space allocation, - * ScratchSpaceOffset, is passed in the thread payload in - * R0.5[31:10] and is specified as a 1KB-granular offset from the - * GeneralStateBaseAddress. The computation of ScratchSpaceOffset - * includes the starting address of the stage’s scratch space - * allocation, as programmed by ScratchSpaceBasePointer." 
- * - * The base address is passed in bits R0.5[31:10] and the bottom 10 - * bits of R0.5 are used for other things. Therefore, we have to - * mask off the bottom 10 bits so that we don't get a garbage base - * address. - */ - ubld.group(1, 0).AND(component(header, 5), - retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), - brw_imm_ud(0xfffffc00)); + assert(!is_surface_access); + ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header); + } else { + ubld.MOV(header, brw_imm_d(0)); + if (is_surface_access) + ubld.group(1, 0).MOV(component(header, 7), sample_mask); } - if (is_surface_access) - ubld.group(1, 0).MOV(component(header, 7), sample_mask); } const unsigned header_sz = header.file != BAD_FILE ? 1 : 0; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 98db40a20cd..774f414a6fd 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -513,6 +513,7 @@ private: void generate_scratch_write(fs_inst *inst, struct brw_reg src); void generate_scratch_read(fs_inst *inst, struct brw_reg dst); void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst); + void generate_scratch_header(fs_inst *inst, struct brw_reg dst); void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst, struct brw_reg index, struct brw_reg offset); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 5228f35e0ef..9e96ca946a9 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1533,6 +1533,76 @@ fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst) gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); } +/* The A32 messages take a buffer base address in header.5:[31:0] (See + * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered + * and OWord block messages in the SKL PRM Vol. 2d for more details.) 
+ * Unfortunately, there are a number of subtle differences: + * + * For the block read/write messages: + * + * - We always stomp header.2 to fill in the actual scratch address (in + * units of OWORDs) so we don't care what's in there. + * + * - They rely on the per-thread scratch space value in header.3[3:0] to do + * bounds checking so that needs to be valid. The upper bits of + * header.3 are ignored, though, so we can copy all of g0.3. + * + * - They ignore header.5[9:0] and assume the address is 1KB aligned. + * + * + * For the byte/dword scattered read/write messages: + * + * - We want header.2 to be zero because that gets added to the per-channel + * offset in the non-header portion of the message. + * + * - Contrary to what the docs claim, they don't do any bounds checking so + * the value of header.3[3:0] doesn't matter. + * + * - They consider all of header.5 for the base address and header.5[9:0] + * are not ignored. This means that we can't copy g0.5 verbatim because + * g0.5[9:0] contains the FFTID on most platforms. Instead, we have to + * use an AND to mask off the bottom 10 bits. + * + * + * For block messages, just copying g0 gives a valid header because all the + * garbage gets ignored except for header.2 which we stomp as part of message + * setup. For byte/dword scattered messages, we can just zero out the header + * and copy over the bits we need from g0.5. This opcode, however, tries to + * satisfy the requirements of both by starting with 0 and filling out the + * information required by either set of opcodes. 
+ */ +void +fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->exec_size == 8 && inst->force_writemask_all); + assert(dst.file == BRW_GENERAL_REGISTER_FILE); + + dst.type = BRW_REGISTER_TYPE_UD; /* header is written as UD */ + + brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0)); /* zero the header */ + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_null()); + else + brw_inst_set_no_dd_clear(p->devinfo, insn, true); + + /* Copy the per-thread scratch space size from g0.3[3:0] */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); /* one dword per AND */ + insn = brw_AND(p, suboffset(dst, 3), + retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(3, 0))); + if (devinfo->gen < 12) { /* NOTE(review): no_dd_clear/no_dd_check look like dependency-control hints for the back-to-back writes to dst -- confirm */ + brw_inst_set_no_dd_clear(p->devinfo, insn, true); + brw_inst_set_no_dd_check(p->devinfo, insn, true); + } + + /* Copy the scratch base address from g0.5[31:10] */ + insn = brw_AND(p, suboffset(dst, 5), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + if (devinfo->gen < 12) + brw_inst_set_no_dd_check(p->devinfo, insn, true); +} + void fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst, @@ -2265,6 +2335,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, fill_count++; break; + case SHADER_OPCODE_SCRATCH_HEADER: + generate_scratch_header(inst, dst); + break; + case SHADER_OPCODE_MOV_INDIRECT: generate_mov_indirect(inst, dst, src[0], src[1]); break; diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index eb36ae1eb9e..ce74f0a6723 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -327,6 +327,7 @@ namespace { case BRW_OPCODE_LINE: case BRW_OPCODE_NOP: case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_SCRATCH_HEADER: case FS_OPCODE_DDX_COARSE: case FS_OPCODE_DDX_FINE: case FS_OPCODE_DDY_COARSE: diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 
f493e02c767..2ea680d1575 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -349,6 +349,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "gen4_scratch_write"; case SHADER_OPCODE_GEN7_SCRATCH_READ: return "gen7_scratch_read"; + case SHADER_OPCODE_SCRATCH_HEADER: + return "scratch_header"; case SHADER_OPCODE_URB_WRITE_SIMD8: return "gen8_urb_write_simd8"; case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: