intel/fs: Implement the new load/store_scratch intrinsics
This commit fills in a number of different pieces:

 1. We add support to brw_nir_lower_mem_access_bit_sizes to handle the
    new intrinsics.  This involves simple plumbing work as well as a
    tiny bit of extra logic to always scalarize scratch intrinsics.

 2. Add code to brw_fs_nir.cpp to turn nir_load/store_scratch intrinsics
    into byte/dword scattered read/write messages which use the A32
    stateless model.

 3. Add code to lower_surface_logical_send to handle dword scattered
    messages and the A32 stateless model.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
commit 53bfcdeecf
parent e2297699de
committed by Jason Ekstrand
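
As a rough illustration of point 2 above, the sketch below is a standalone plain C model, not driver code; the enum and function names are invented for this note. It shows how a scalar scratch access is mapped onto one of the two scattered message types once the NIR-level lowering has guaranteed single-component, at-most-32-bit accesses.

/* Illustrative only: mirrors the alignment/bit-size check added to
 * fs_visitor::nir_emit_intrinsic in the diff below.
 */
enum scratch_msg {
   MSG_DWORD_SCATTERED,   /* offset is expressed in dwords */
   MSG_BYTE_SCATTERED     /* offset is expressed in bytes  */
};

static enum scratch_msg
pick_scratch_message(unsigned bit_size, unsigned align)
{
   /* Dword-aligned 32-bit data can use the dword scattered read/write;
    * 8/16-bit or under-aligned data falls back to byte scattered messages.
    */
   if (align >= 4 && bit_size == 32)
      return MSG_DWORD_SCATTERED;
   return MSG_BYTE_SCATTERED;
}
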
@@ -5368,6 +5368,15 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
       inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
       inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
 
+   const bool is_surface_access = is_typed_access ||
+      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
+      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
+      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
+
+   const bool is_stateless =
+      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
+                              surface.ud == GEN8_BTI_STATELESS_NON_COHERENT);
+
    const bool has_side_effects = inst->has_side_effects();
    fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() :
                         fs_reg(brw_imm_d(0xffff));
@@ -5381,25 +5390,63 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
     * we don't attempt to implement sample masks via predication for such
     * messages prior to Gen9, since we have to provide a header anyway.  On
     * Gen11+ the header has been removed so we can only use predication.
+    *
+    * For all stateless A32 messages, we also need a header
     */
    fs_reg header;
-   if (devinfo->gen < 9 && is_typed_access) {
+   if ((devinfo->gen < 9 && is_typed_access) || is_stateless) {
       fs_builder ubld = bld.exec_all().group(8, 0);
       header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
       ubld.MOV(header, brw_imm_d(0));
-      ubld.group(1, 0).MOV(component(header, 7), sample_mask);
+      if (is_stateless) {
+         /* Both the typed and scattered byte/dword A32 messages take a buffer
+          * base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or
+          * MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d
+          * for more details.)  This is conveniently where the HW places the
+          * scratch surface base address.
+          *
+          * From the SKL PRM Vol. 7 "Per-Thread Scratch Space":
+          *
+          *    "When a thread becomes 'active' it is allocated a portion of
+          *    scratch space, sized according to PerThreadScratchSpace.  The
+          *    starting location of each thread’s scratch space allocation,
+          *    ScratchSpaceOffset, is passed in the thread payload in
+          *    R0.5[31:10] and is specified as a 1KB-granular offset from the
+          *    GeneralStateBaseAddress.  The computation of ScratchSpaceOffset
+          *    includes the starting address of the stage’s scratch space
+          *    allocation, as programmed by ScratchSpaceBasePointer."
+          *
+          * The base address is passed in bits R0.5[31:10] and the bottom 10
+          * bits of R0.5 are used for other things.  Therefore, we have to
+          * mask off the bottom 10 bits so that we don't get a garbage base
+          * address.
+          */
+         ubld.group(1, 0).AND(component(header, 5),
+                              retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+                              brw_imm_ud(0xfffffc00));
+      }
+      if (is_surface_access)
+         ubld.group(1, 0).MOV(component(header, 7), sample_mask);
    }
    const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
 
    fs_reg payload, payload2;
    unsigned mlen, ex_mlen = 0;
-   if (devinfo->gen >= 9) {
+   if (devinfo->gen >= 9 &&
+       (src.file == BAD_FILE || header.file == BAD_FILE)) {
       /* We have split sends on gen9 and above */
-      assert(header.file == BAD_FILE);
-      payload = bld.move_to_vgrf(addr, addr_sz);
-      payload2 = bld.move_to_vgrf(src, src_sz);
-      mlen = addr_sz * (inst->exec_size / 8);
-      ex_mlen = src_sz * (inst->exec_size / 8);
+      if (header.file == BAD_FILE) {
+         payload = bld.move_to_vgrf(addr, addr_sz);
+         payload2 = bld.move_to_vgrf(src, src_sz);
+         mlen = addr_sz * (inst->exec_size / 8);
+         ex_mlen = src_sz * (inst->exec_size / 8);
+      } else {
+         assert(src.file == BAD_FILE);
+         payload = header;
+         payload2 = bld.move_to_vgrf(addr, addr_sz);
+         mlen = header_sz;
+         ex_mlen = addr_sz * (inst->exec_size / 8);
+      }
    } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
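
A minimal scalar model of the header fix-up added above, assuming only what the quoted PRM text states; the function name is invented for this note:

#include <stdint.h>

/* ScratchSpaceOffset lives in R0.5[31:10]; the low ten bits of R0.5 carry
 * unrelated per-thread state, so they must be cleared before the dword can
 * serve as the A32 buffer base address in the message header.
 */
static uint32_t
a32_scratch_base_from_r0_5(uint32_t r0_dw5)
{
   return r0_dw5 & 0xfffffc00u;   /* keep bits [31:10], drop the rest */
}
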
@@ -5426,8 +5473,8 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
    /* Predicate the instruction on the sample mask if no header is
     * provided.
     */
-   if (header.file == BAD_FILE && sample_mask.file != BAD_FILE &&
-       sample_mask.file != IMM) {
+   if ((header.file == BAD_FILE || !is_surface_access) &&
+       sample_mask.file != BAD_FILE && sample_mask.file != IMM) {
       const fs_builder ubld = bld.group(1, 0).exec_all();
       if (inst->predicate) {
          assert(inst->predicate == BRW_PREDICATE_NORMAL);
@@ -228,6 +228,9 @@ public:
                                        nir_intrinsic_instr *instr);
    fs_reg get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
                                        nir_intrinsic_instr *instr);
+   fs_reg swizzle_nir_scratch_addr(const brw::fs_builder &bld,
+                                   const fs_reg &addr,
+                                   bool in_dwords);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
@@ -341,6 +344,7 @@ public:
    int *push_constant_loc;
 
    fs_reg subgroup_id;
+   fs_reg scratch_base;
    fs_reg frag_depth;
    fs_reg frag_stencil;
    fs_reg sample_mask;
@@ -2062,7 +2062,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
    case SHADER_OPCODE_SEND:
       generate_send(inst, dst, src[0], src[1], src[2],
                     inst->ex_mlen > 0 ? src[3] : brw_null_reg());
-      send_count++;
+      if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
+          (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) {
+         if (inst->size_written)
+            fill_count++;
+         else
+            spill_count++;
+      } else {
+         send_count++;
+      }
       break;
 
    case SHADER_OPCODE_GET_BUFFER_SIZE:
@@ -42,6 +42,7 @@ fs_visitor::emit_nir_code()
    nir_setup_outputs();
    nir_setup_uniforms();
    nir_emit_system_values();
+   last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
 
    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
 }
@@ -4023,6 +4024,61 @@ image_intrinsic_coord_components(nir_intrinsic_instr *instr)
    }
 }
 
+/**
+ * The offsets we get from NIR act as if each SIMD channel has it's own blob
+ * of contiguous space.  However, if we actually place each SIMD channel in
+ * it's own space, we end up with terrible cache performance because each SIMD
+ * channel accesses a different cache line even when they're all accessing the
+ * same byte offset.  To deal with this problem, we swizzle the address using
+ * a simple algorithm which ensures that any time a SIMD message reads or
+ * writes the same address, it's all in the same cache line.  We have to keep
+ * the bottom two bits fixed so that we can read/write up to a dword at a time
+ * and the individual element is contiguous.  We do this by splitting the
+ * address as follows:
+ *
+ *    31                             4-6           2          0
+ *    +-------------------------------+------------+----------+
+ *    |        Hi address bits        | chan index | addr low |
+ *    +-------------------------------+------------+----------+
+ *
+ * In other words, the bottom two address bits stay, and the top 30 get
+ * shifted up so that we can stick the SIMD channel index in the middle.  This
+ * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
+ * at the same logical offset, the scratch read/write instruction acts on
+ * continuous elements and we get good cache locality.
+ */
+fs_reg
+fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
+                                     const fs_reg &nir_addr,
+                                     bool in_dwords)
+{
+   const fs_reg &chan_index =
+      nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+   const unsigned chan_index_bits = ffs(dispatch_width) - 1;
+
+   fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   if (in_dwords) {
+      /* In this case, we know the address is aligned to a DWORD and we want
+       * the final address in DWORDs.
+       */
+      bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));
+      bld.OR(addr, addr, chan_index);
+   } else {
+      /* This case substantially more annoying because we have to pay
+       * attention to those pesky two bottom bits.
+       */
+      fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));
+      bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));
+      fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.SHL(chan_addr, chan_index, brw_imm_ud(2));
+      bld.AND(addr, nir_addr, brw_imm_ud(0x3u));
+      bld.OR(addr, addr, addr_hi);
+      bld.OR(addr, addr, chan_addr);
+   }
+   return addr;
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
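
To make the swizzle above easier to follow, here is a plain C model of the byte-address path (in_dwords == false); the scalar function is invented for this note and assumes a power-of-two dispatch width:

#include <stdint.h>
#include <strings.h>   /* ffs() */

static uint32_t
swizzle_scratch_byte_addr(uint32_t nir_addr, uint32_t chan, unsigned dispatch_width)
{
   const unsigned chan_index_bits = ffs(dispatch_width) - 1;    /* 8 -> 3, 16 -> 4, 32 -> 5 */

   const uint32_t addr_hi   = (nir_addr & ~0x3u) << chan_index_bits;  /* top 30 bits, shifted up */
   const uint32_t chan_addr = chan << 2;                              /* channel index in the middle */
   const uint32_t addr_lo   = nir_addr & 0x3u;                        /* bottom two bits stay put */
   return addr_hi | chan_addr | addr_lo;
}

With this layout, the eight channels of a SIMD8 message that all read byte offset 0 land at bytes 0, 4, 8, ..., 28 of the scratch surface, one small contiguous span instead of eight widely separated per-channel blobs; the in_dwords path in the function above produces the same address divided by four.
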
@@ -4682,6 +4738,99 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_load_scratch: {
+      assert(devinfo->gen >= 7);
+
+      assert(nir_dest_num_components(instr->dest) == 1);
+      const unsigned bit_size = nir_dest_bit_size(instr->dest);
+      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+      if (devinfo->gen >= 8) {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+            brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
+      }
+
+      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
+      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
+      const fs_reg nir_addr = get_nir_src(instr->src[0]);
+
+      /* Make dest unsigned because that's what the temporary will be */
+      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+
+      /* Read the vector */
+      if (nir_intrinsic_align(instr) >= 4) {
+         assert(nir_dest_bit_size(instr->dest) == 32);
+
+         /* The offset for a DWORD scattered message is in dwords. */
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, true);
+
+         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
+                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
+      } else {
+         assert(nir_dest_bit_size(instr->dest) <= 32);
+
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, false);
+
+         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
+                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
+         bld.MOV(dest, read_result);
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_scratch: {
+      assert(devinfo->gen >= 7);
+
+      assert(nir_src_num_components(instr->src[0]) == 1);
+      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
+      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+      if (devinfo->gen >= 8) {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
+            brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
+      }
+
+      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
+      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
+      const fs_reg nir_addr = get_nir_src(instr->src[1]);
+
+      fs_reg data = get_nir_src(instr->src[0]);
+      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+
+      assert(nir_intrinsic_write_mask(instr) ==
+             (1u << instr->num_components) - 1);
+      if (nir_intrinsic_align(instr) >= 4) {
+         assert(nir_src_bit_size(instr->src[0]) == 32);
+         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
+
+         /* The offset for a DWORD scattered message is in dwords. */
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, true);
+
+         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
+                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
+      } else {
+         assert(nir_src_bit_size(instr->src[0]) <= 32);
+
+         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
+
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            swizzle_nir_scratch_addr(bld, nir_addr, false);
+
+         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
+                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
+      }
+      break;
+   }
+
    case nir_intrinsic_load_subgroup_size:
       /* This should only happen for fragment shaders because every other case
        * is lowered in NIR so we can optimize on it.
@@ -77,8 +77,12 @@ static bool
 lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                         const struct gen_device_info *devinfo)
 {
+   const bool needs_scalar =
+      intrin->intrinsic == nir_intrinsic_load_scratch;
+
    assert(intrin->dest.is_ssa);
-   if (intrin->dest.ssa.bit_size == 32)
+   if (intrin->dest.ssa.bit_size == 32 &&
+       (!needs_scalar || intrin->num_components == 1))
       return false;
 
    const unsigned bit_size = intrin->dest.ssa.bit_size;
@@ -119,7 +123,8 @@ lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
    } else {
       assert(load_offset % 4 == 0);
       load_bit_size = 32;
-      load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
+      load_comps = needs_scalar ? 1 :
+                   DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
    }
 
    loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset,
@@ -144,6 +149,9 @@ static bool
 lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                          const struct gen_device_info *devinfo)
 {
+   const bool needs_scalar =
+      intrin->intrinsic == nir_intrinsic_store_scratch;
+
    assert(intrin->src[0].is_ssa);
    nir_ssa_def *value = intrin->src[0].ssa;
 
@@ -159,7 +167,9 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
    assert(writemask < (1 << num_components));
 
    if ((value->bit_size <= 32 && num_components == 1) ||
-       (value->bit_size == 32 && writemask == (1 << num_components) - 1))
+       (value->bit_size == 32 &&
+        writemask == (1 << num_components) - 1 &&
+        !needs_scalar))
       return false;
 
    nir_src *offset_src = nir_get_io_offset_src(intrin);
@@ -180,7 +190,6 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
 
    while (BITSET_FFS(mask) != 0) {
       const int start = BITSET_FFS(mask) - 1;
-      assert(start % byte_size == 0);
 
       int end;
       for (end = start + 1; end < bytes_written; end++) {
@@ -198,7 +207,7 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
       if (chunk_bytes >= 4 && is_dword_aligned) {
          store_align = MAX2(align, 4);
          store_bit_size = 32;
-         store_comps = MIN2(chunk_bytes, 16) / 4;
+         store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
       } else {
          store_align = align;
          store_comps = 1;
@@ -208,7 +217,6 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
          store_bit_size = 16;
       }
       const unsigned store_bytes = store_comps * (store_bit_size / 8);
-      assert(store_bytes % byte_size == 0);
 
       nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8,
                                              store_comps, store_bit_size);
@@ -245,6 +253,7 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
          case nir_intrinsic_load_global:
          case nir_intrinsic_load_ssbo:
          case nir_intrinsic_load_shared:
+         case nir_intrinsic_load_scratch:
            if (lower_mem_load_bit_size(&b, intrin, devinfo))
               progress = true;
            break;
@@ -252,6 +261,7 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
          case nir_intrinsic_store_global:
          case nir_intrinsic_store_ssbo:
          case nir_intrinsic_store_shared:
+         case nir_intrinsic_store_scratch:
            if (lower_mem_store_bit_size(&b, intrin, devinfo))
               progress = true;
            break;
@@ -285,6 +295,12 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
  * all nir load/store intrinsics into a series of either 8 or 32-bit
  * load/store intrinsics with a number of components that we can directly
  * handle in hardware and with a trivial write-mask.
+ *
+ * For scratch access, additional consideration has to be made due to the way
+ * that we swizzle the memory addresses to achieve decent cache locality.  In
+ * particular, even though untyped surface read/write messages exist and work,
+ * we can't use them to load multiple components in a single SEND.  For more
+ * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr.
  */
 bool
 brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
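
The paragraph added above is the reason the pass scalarizes scratch access. As a plain C illustration (read_scratch_dword_fn is a hypothetical stand-in for one dword scattered read, not a driver API), a vec4 32-bit scratch load has to become four single-component loads, because after the swizzle one channel's consecutive logical dwords are no longer contiguous in the surface:

#include <stdint.h>

typedef uint32_t (*read_scratch_dword_fn)(uint32_t byte_offset);

static void
load_scratch_vec4(read_scratch_dword_fn read_scratch_dword,
                  uint32_t base_offset, uint32_t out[4])
{
   /* One message per component; each logical offset is swizzled and sent
    * as its own scattered read by the backend.
    */
   for (unsigned i = 0; i < 4; i++)
      out[i] = read_scratch_dword(base_offset + 4 * i);
}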