diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 336363466dc..2297ec477fb 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8594,6 +8594,20 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) set_wqm(ctx); break; } + case nir_intrinsic_dpp16_shift_amd: { + Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->def); + int delta = nir_intrinsic_base(instr); + assert(delta >= -15 && delta <= 15 && delta != 0); + assert(instr->def.bit_size != 1 && instr->def.bit_size < 64); + assert(ctx->options->gfx_level >= GFX8); + + uint16_t dpp_ctrl = delta < 0 ? dpp_row_sr(-delta) : dpp_row_sl(delta); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), src, dpp_ctrl); + + set_wqm(ctx); + break; + } case nir_intrinsic_quad_broadcast: case nir_intrinsic_quad_swap_horizontal: case nir_intrinsic_quad_swap_vertical: diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 8f8225f6e40..316bc0cd09e 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -484,6 +484,7 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_write_invocation_amd: case nir_intrinsic_mbcnt_amd: case nir_intrinsic_lane_permute_16_amd: + case nir_intrinsic_dpp16_shift_amd: case nir_intrinsic_load_instance_id: case nir_intrinsic_ssbo_atomic: case nir_intrinsic_ssbo_atomic_swap: diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index db2d4a404ed..7d248188c3a 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -705,6 +705,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_write_invocation_amd: case nir_intrinsic_mbcnt_amd: case nir_intrinsic_lane_permute_16_amd: + case nir_intrinsic_dpp16_shift_amd: case nir_intrinsic_elect: case nir_intrinsic_elect_any_ir3: case nir_intrinsic_load_tlb_color_brcm: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 745bc1c3b44..17b872a83ce 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -545,6 +545,11 @@ intrinsic("write_invocation_amd", src_comp=[0, 0, 1], dest_comp=0, bit_sizes=src intrinsic("mbcnt_amd", src_comp=[1, 1], dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE]) # Compiled to v_permlane16_b32. src = [ value, lanesel_lo, lanesel_hi ] intrinsic("lane_permute_16_amd", src_comp=[1, 1, 1], dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE]) +# subgroup shuffle up/down with cluster size 16. +# base in [-15, -1]: DPP_ROW_SR +# base in [ 1, 15]: DPP_ROW_SL, otherwise invalid. +# Returns zero for invocations that try to read out of bounds +intrinsic("dpp16_shift_amd", src_comp=[0], dest_comp=0, bit_sizes=src0, indices=[BASE], flags=[CAN_ELIMINATE]) # Basic Geometry Shader intrinsics. #