From 77486db867bd39aa9b76e549c946b0a165fcb21a Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Thu, 23 Jul 2020 15:15:34 +0300 Subject: [PATCH] intel/fs: Disable sample mask predication for scratch stores Scratch stores are being lowered to the instructions with side-effects, however they should be enabled in fs helper invocations, since they are produced from operations which don't imply side-effects. To fix this - we move the decision of whether the sample mask predication is enable to the point where logical brw instructions are created. GLSL example of the issue: int tmp[1024]; ... do { // changes to tmp } while (some_condition(tmp)) If `tmp` is lowered to scrach memory, `some_condition` would be undefined if scratch write is predicated on sample mask, making possible for the while loop to become infinite and hang the GPU. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3256 Fixes: 53bfcdeecf4c9632e09ee641d2ca02dd9ec25e34 Signed-off-by: Danylo Piliaiev Reviewed-by: Matt Turner Acked-by: Jason Ekstrand Part-of: --- src/intel/compiler/brw_eu_defines.h | 5 +++++ src/intel/compiler/brw_fs.cpp | 8 ++++++-- src/intel/compiler/brw_fs_nir.cpp | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 25576a21a87..29ae95b8a38 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -904,6 +904,11 @@ enum surface_logical_srcs { SURFACE_LOGICAL_SRC_IMM_DIMS, /** Per-opcode immediate argument. For atomics, this is the atomic opcode */ SURFACE_LOGICAL_SRC_IMM_ARG, + /** + * Some instructions with side-effects should not be predicated on + * sample mask, e.g. lowered stores to scratch. + */ + SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK, SURFACE_LOGICAL_NUM_SRCS }; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index ea10e522b00..ccfc6871534 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5380,7 +5380,10 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS]; const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; + const fs_reg &allow_sample_mask = + inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK]; assert(arg.file == IMM); + assert(allow_sample_mask.file == IMM); /* We must have exactly one of surface and surface_handle */ assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); @@ -5404,8 +5407,9 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); const bool has_side_effects = inst->has_side_effects(); - fs_reg sample_mask = has_side_effects ? sample_mask_reg(bld) : - fs_reg(brw_imm_d(0xffff)); + + fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) : + fs_reg(brw_imm_d(0xffff)); /* From the BDW PRM Volume 7, page 147: * diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 5abeefe66b4..e7ce2230c03 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3774,6 +3774,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -3808,6 +3809,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); /* Make dest unsigned because that's what the temporary will be */ dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -3844,6 +3846,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4125,6 +4128,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr if (instr->intrinsic == nir_intrinsic_image_load || instr->intrinsic == nir_intrinsic_bindless_image_load) { srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4133,6 +4137,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr instr->intrinsic == nir_intrinsic_bindless_image_store) { srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { @@ -4155,6 +4160,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr data = tmp; } srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4214,6 +4220,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, @@ -4230,6 +4237,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4651,6 +4659,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr get_nir_ssbo_intrinsic_index(bld, instr); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); /* Make dest unsigned because that's what the temporary will be */ dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4687,6 +4696,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr get_nir_ssbo_intrinsic_index(bld, instr); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4825,6 +4835,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); const fs_reg nir_addr = get_nir_src(instr->src[0]); /* Make dest unsigned because that's what the temporary will be */ @@ -4870,6 +4881,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + /** + * While this instruction has side-effects, it should not be predicated + * on sample mask, because otherwise fs helper invocations would + * load undefined values from scratch memory. And scratch memory + * load-stores are produced from operations without side-effects, thus + * they should not have different behaviour in the helper invocations. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); const fs_reg nir_addr = get_nir_src(instr->src[1]); fs_reg data = get_nir_src(instr->src[0]); @@ -5318,6 +5337,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data; if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) @@ -5350,6 +5370,7 @@ fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[2]); if (op == BRW_AOP_FCMPWR) { @@ -5378,6 +5399,7 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data; if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) @@ -5419,6 +5441,7 @@ fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[1]); if (op == BRW_AOP_FCMPWR) {