From cb019ff9266b5985dd8d467f724d3310acd03ab6 Mon Sep 17 00:00:00 2001 From: Jesse Natalie Date: Wed, 15 Nov 2023 08:34:25 -0800 Subject: [PATCH] d3d12: GL4.6 This enables pipeline stats and SO overflow queries Part-of: --- docs/features.txt | 8 ++--- docs/relnotes/new_features.txt | 1 + .../drivers/d3d12/ci/d3d12-quick_gl.txt | 12 +++++++ src/gallium/drivers/d3d12/d3d12_compiler.h | 1 + .../d3d12/d3d12_compute_transforms.cpp | 31 ++++++++++++++----- .../drivers/d3d12/d3d12_compute_transforms.h | 11 ++++--- src/gallium/drivers/d3d12/d3d12_context.h | 3 +- src/gallium/drivers/d3d12/d3d12_draw.cpp | 3 +- src/gallium/drivers/d3d12/d3d12_query.cpp | 30 +++++++++++++++--- src/gallium/drivers/d3d12/d3d12_query.h | 2 +- src/gallium/drivers/d3d12/d3d12_screen.cpp | 2 ++ src/microsoft/compiler/nir_to_dxil.c | 3 +- 12 files changed, 82 insertions(+), 25 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index f5bfdb7fe35..7077285d981 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -225,17 +225,17 @@ GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v GL_KHR_robustness DONE (freedreno) GL_EXT_shader_integer_mix DONE (all drivers that support GLSL) -GL 4.6, GLSL 4.60 -- all DONE: radeonsi, virgl, zink, iris, crocus/gen7+ +GL 4.6, GLSL 4.60 -- all DONE: radeonsi, virgl, zink, iris, crocus/gen7+, d3d12 GL_ARB_gl_spirv DONE (freedreno, llvmpipe) - GL_ARB_indirect_parameters DONE (freedreno/a6xx+, nvc0, llvmpipe, virgl, d3d12) + GL_ARB_indirect_parameters DONE (freedreno/a6xx+, nvc0, llvmpipe, virgl) GL_ARB_pipeline_statistics_query DONE (freedreno/a6xx+, nvc0, r600, llvmpipe, softpipe, crocus/gen6+) GL_ARB_polygon_offset_clamp DONE (freedreno, nv50, nvc0, r600, llvmpipe, v3d, panfrost, crocus) GL_ARB_shader_atomic_counter_ops DONE (freedreno/a5xx+, nvc0, r600, llvmpipe, softpipe, v3d) - GL_ARB_shader_draw_parameters DONE (freedreno/a6xx+, llvmpipe, nvc0, d3d12, crocus/gen6+) + GL_ARB_shader_draw_parameters DONE (freedreno/a6xx+, 
llvmpipe, nvc0, crocus/gen6+) GL_ARB_shader_group_vote DONE (freedreno/a6xx, nvc0, llvmpipe, crocus) GL_ARB_spirv_extensions DONE (freedreno, llvmpipe) - GL_ARB_texture_filter_anisotropic DONE (etnaviv/HALTI0, freedreno, nv50, nvc0, r600, softpipe, llvmpipe, d3d12, v3d, panfrost/g72+, asahi, crocus) + GL_ARB_texture_filter_anisotropic DONE (etnaviv/HALTI0, freedreno, nv50, nvc0, r600, softpipe, llvmpipe, v3d, panfrost/g72+, asahi, crocus) GL_ARB_transform_feedback_overflow_query DONE (freedreno/a6xx+, nvc0, llvmpipe, softpipe, crocus/gen6+) GL_KHR_no_error DONE (all drivers) diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index 1aa34e28860..1a08c98080b 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -9,3 +9,4 @@ GL_ARB_clip_control on Asahi GL_ARB_timer_query on Asahi GL_EXT_disjoint_timer_query on Asahi GL_ARB_base_instance on Asahi +OpenGL 4.6 (up from 4.2) on d3d12 diff --git a/src/gallium/drivers/d3d12/ci/d3d12-quick_gl.txt b/src/gallium/drivers/d3d12/ci/d3d12-quick_gl.txt index 53d62ee2eae..6e4247185b5 100644 --- a/src/gallium/drivers/d3d12/ci/d3d12-quick_gl.txt +++ b/src/gallium/drivers/d3d12/ci/d3d12-quick_gl.txt @@ -411,3 +411,15 @@ spec@arb_vertex_program@arb_vertex_program-property-bindings,Fail # https://gitlab.freedesktop.org/mesa/piglit/-/merge_requests/850, # and CI changes to glue them together spec@arb_gpu_shader5@arb_gpu_shader5-xfb-streams-without-invocations spirv,Fail + +# WARP bug: submitting an indirect draw with a count buffer zeroes the arg buffer entries +# whose index is >= the dynamic count and < the static max count +spec@arb_query_buffer_object@coherency,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_FRAGMENT_SHADER_INVOCATIONS,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_PRIMITIVES_GENERATED,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_PRIMITIVES_SUBMITTED,Fail
+spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_SAMPLES_PASSED,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TIMESTAMP,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_TIME_ELAPSED,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_VERTEX_SHADER_INVOCATIONS,Fail +spec@arb_query_buffer_object@coherency@indirect-draw-count-GL_VERTICES_SUBMITTED,Fail diff --git a/src/gallium/drivers/d3d12/d3d12_compiler.h b/src/gallium/drivers/d3d12/d3d12_compiler.h index d2e63b38487..ce229a25728 100644 --- a/src/gallium/drivers/d3d12/d3d12_compiler.h +++ b/src/gallium/drivers/d3d12/d3d12_compiler.h @@ -54,6 +54,7 @@ enum d3d12_state_var { D3D12_STATE_VAR_NUM_WORKGROUPS = 0, D3D12_STATE_VAR_TRANSFORM_GENERIC0, + D3D12_STATE_VAR_TRANSFORM_GENERIC1, D3D12_MAX_COMPUTE_STATE_VARS, D3D12_MAX_STATE_VARS = MAX2(D3D12_MAX_GRAPHICS_STATE_VARS, D3D12_MAX_COMPUTE_STATE_VARS) diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp index 9f94c038b81..a3358a2ba49 100644 --- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp +++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp @@ -223,10 +223,11 @@ get_query_resolve(const nir_shader_compiler_options *options, const d3d12_comput assert(!key->query_resolve.is_resolve_in_place || (key->query_resolve.is_64bit && key->query_resolve.num_subqueries == 1)); assert(key->query_resolve.num_subqueries == 1 || - key->query_resolve.pipe_query_type == PIPE_QUERY_PRIMITIVES_GENERATED); - assert(key->query_resolve.num_subqueries <= 3); /* Fourth state var is an output offset */ + key->query_resolve.pipe_query_type == PIPE_QUERY_PRIMITIVES_GENERATED || + key->query_resolve.pipe_query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE); + assert(key->query_resolve.num_subqueries <= 4); - nir_variable *inputs[3]; + nir_variable *inputs[4]; for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) { /* Inputs are 
always 64-bit */ inputs[i] = nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(glsl_uint64_t_type(), 0, 8), "input"); @@ -239,8 +240,9 @@ get_query_resolve(const nir_shader_compiler_options *options, const d3d12_comput } /* How many entries in each sub-query is passed via root constants */ - nir_variable *state_var = nullptr; + nir_variable *state_var = nullptr, *state_var1 = nullptr; nir_def *state_var_data = d3d12_get_state_var(&b, D3D12_STATE_VAR_TRANSFORM_GENERIC0, "state_var", glsl_uvec4_type(), &state_var); + nir_def *state_var_data1 = d3d12_get_state_var(&b, D3D12_STATE_VAR_TRANSFORM_GENERIC1, "state_var1", glsl_uvec4_type(), &state_var1); /* For in-place resolves, we resolve each field of the query. Otherwise, resolve one field into the dest */ nir_variable *results[sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64)]; @@ -280,6 +282,8 @@ get_query_resolve(const nir_shader_compiler_options *options, const d3d12_comput break; case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64); break; case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -324,11 +328,19 @@ get_query_resolve(const nir_shader_compiler_options *options, const d3d12_comput assert(j == 0 && i == 0); nir_def *start = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, array_index, 8)); nir_def *end = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, 1), 8)); - new_value = nir_isub(&b, end, start); + new_value = nir_iadd(&b, nir_load_var(&b, results[j]), nir_isub(&b, end, start)); + } else if (key->query_resolve.pipe_query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || + key->query_resolve.pipe_query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) { + /* These predicates are true if the primitives emitted != primitives stored */ + assert(j == 0); + nir_def *val_a = 
nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, array_index, 8)); + nir_def *val_b = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, 1), 8)); + new_value = nir_ior(&b, nir_load_var(&b, results[j]), nir_u2uN(&b, nir_ine(&b, val_a, val_b), var_bit_size)); } else { new_value = nir_u2uN(&b, nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, j), 8)), var_bit_size); + new_value = nir_iadd(&b, nir_load_var(&b, results[j]), new_value); } - nir_store_var(&b, results[j], nir_iadd(&b, nir_load_var(&b, results[j]), new_value), 1); + nir_store_var(&b, results[j], new_value, 1); } nir_store_var(&b, loop_counter, nir_iadd_imm(&b, loop_counter_value, 1), 1); @@ -336,7 +348,7 @@ get_query_resolve(const nir_shader_compiler_options *options, const d3d12_comput } /* Results are accumulated, now store the final values */ - nir_def *output_base_index = nir_channel(&b, state_var_data, 3); + nir_def *output_base_index = nir_channel(&b, state_var_data1, 0); for (uint32_t i = 0; i < num_result_values; ++i) { /* When resolving in-place, resolve each field, otherwise just write the one result */ uint32_t field_offset = key->query_resolve.is_resolve_in_place ? 
i : 0; @@ -483,11 +495,16 @@ d3d12_save_compute_transform_state(struct d3d12_context *ctx, d3d12_compute_tran pipe_resource_reference(&save->ssbos[i].buffer, ctx->ssbo_views[PIPE_SHADER_COMPUTE][i].buffer); save->ssbos[i] = ctx->ssbo_views[PIPE_SHADER_COMPUTE][i]; } + + save->queries_disabled = ctx->queries_disabled; + ctx->base.set_active_query_state(&ctx->base, false); } void d3d12_restore_compute_transform_state(struct d3d12_context *ctx, d3d12_compute_transform_save_restore *save) { + ctx->base.set_active_query_state(&ctx->base, !save->queries_disabled); + ctx->base.bind_compute_state(&ctx->base, save->cs); ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 1, true, &save->cbuf0); diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h index 207ac5bf9e9..0e9cc37edad 100644 --- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h +++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h @@ -73,14 +73,14 @@ struct d3d12_compute_transform_key struct { /* true means the accumulation should be done as uint64, else uint32. */ uint8_t is_64bit : 1; + /* Indicates how many subqueries to accumulate together into a final result. When + * set to 1, single_subquery_index determines where the data comes from. */ + uint8_t num_subqueries : 3; + uint8_t pipe_query_type : 4; /* true means output is written where input[0] was, else output is a separate buffer. * true also means all fields are accumulated, else single_result_field_offset determines * which field is resolved. Implies num_subqueries == 1. */ uint8_t is_resolve_in_place : 1; - /* Indicates how many subqueries to accumulate together into a final result. When - * set to 1, single_subquery_index determines where the data comes from. 
*/ - uint8_t num_subqueries : 2; - uint8_t pipe_query_type : 4; uint8_t single_subquery_index : 2; uint8_t single_result_field_offset : 4; uint8_t is_signed : 1; @@ -102,7 +102,8 @@ struct d3d12_compute_transform_save_restore { struct d3d12_shader_selector *cs; struct pipe_constant_buffer cbuf0; - struct pipe_shader_buffer ssbos[4]; + struct pipe_shader_buffer ssbos[5]; + bool queries_disabled; }; void diff --git a/src/gallium/drivers/d3d12/d3d12_context.h b/src/gallium/drivers/d3d12/d3d12_context.h index 3429c0bfccd..ac13fe53243 100644 --- a/src/gallium/drivers/d3d12/d3d12_context.h +++ b/src/gallium/drivers/d3d12/d3d12_context.h @@ -277,8 +277,9 @@ struct d3d12_context { struct d3d12_resource *current_predication; bool predication_condition; + bool queries_suspended; - uint32_t transform_state_vars[4]; + uint32_t transform_state_vars[8]; #ifdef __cplusplus ResourceStateManager *resource_state_manager; diff --git a/src/gallium/drivers/d3d12/d3d12_draw.cpp b/src/gallium/drivers/d3d12/d3d12_draw.cpp index 5f4e4f75507..fb50833a026 100644 --- a/src/gallium/drivers/d3d12/d3d12_draw.cpp +++ b/src/gallium/drivers/d3d12/d3d12_draw.cpp @@ -443,7 +443,8 @@ fill_compute_state_vars(struct d3d12_context *ctx, cmd_sig_key->params_root_const_offset = size; size += 4; break; - case D3D12_STATE_VAR_TRANSFORM_GENERIC0: { + case D3D12_STATE_VAR_TRANSFORM_GENERIC0: + case D3D12_STATE_VAR_TRANSFORM_GENERIC1: { unsigned idx = shader->state_vars[j].var - D3D12_STATE_VAR_TRANSFORM_GENERIC0; ptr[0] = ctx->transform_state_vars[idx * 4]; ptr[1] = ctx->transform_state_vars[idx * 4 + 1]; diff --git a/src/gallium/drivers/d3d12/d3d12_query.cpp b/src/gallium/drivers/d3d12/d3d12_query.cpp index 4ea51a436c9..338d6182fa7 100644 --- a/src/gallium/drivers/d3d12/d3d12_query.cpp +++ b/src/gallium/drivers/d3d12/d3d12_query.cpp @@ -42,6 +42,8 @@ num_sub_queries(unsigned query_type, unsigned index) switch (query_type) { case PIPE_QUERY_PRIMITIVES_GENERATED: return index == 0 ? 
3 : 1; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + return 4; default: return 1; } @@ -63,6 +65,8 @@ d3d12_query_heap_type(unsigned query_type, unsigned sub_query) D3D12_QUERY_HEAP_TYPE_PIPELINE_STATISTICS; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: return D3D12_QUERY_HEAP_TYPE_SO_STATISTICS; case PIPE_QUERY_TIMESTAMP: case PIPE_QUERY_TIME_ELAPSED: @@ -92,7 +96,10 @@ d3d12_query_type(unsigned query_type, unsigned sub_query, unsigned index) FALLTHROUGH; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: return (D3D12_QUERY_TYPE)(D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0 + index); + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + return (D3D12_QUERY_TYPE)(D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0 + sub_query); case PIPE_QUERY_TIMESTAMP: case PIPE_QUERY_TIME_ELAPSED: return D3D12_QUERY_TYPE_TIMESTAMP; @@ -245,8 +252,13 @@ accumulate_subresult_cpu(struct d3d12_context *ctx, struct d3d12_query *q_parent case D3D12_QUERY_TYPE_SO_STATISTICS_STREAM1: case D3D12_QUERY_TYPE_SO_STATISTICS_STREAM2: case D3D12_QUERY_TYPE_SO_STATISTICS_STREAM3: - result->so_statistics.num_primitives_written += results_so[i].NumPrimitivesWritten; - result->so_statistics.primitives_storage_needed += results_so[i].PrimitivesStorageNeeded; + if (q_parent->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || + q_parent->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) { + result->b = results_so[i].NumPrimitivesWritten != results_so[i].PrimitivesStorageNeeded; + } else { + result->so_statistics.num_primitives_written += results_so[i].NumPrimitivesWritten; + result->so_statistics.primitives_storage_needed += results_so[i].PrimitivesStorageNeeded; + } break; default: @@ -291,6 +303,14 @@ accumulate_result_cpu(struct d3d12_context *ctx, struct d3d12_query *q, return false; result->u64 = local_result.so_statistics.num_primitives_written; return true; + case 
PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + result->b = false; + for (uint32_t i = 0; i < num_sub_queries(q->type, q->index); ++i) { + if (!accumulate_subresult_cpu(ctx, q, i, &local_result)) + return false; + result->b |= local_result.b; + } + return true; default: assert(num_sub_queries(q->type, q->index) == 1); return accumulate_subresult_cpu(ctx, q, 0, result); @@ -361,6 +381,7 @@ accumulate_subresult_gpu(struct d3d12_context *ctx, struct d3d12_query *q_parent ctx->transform_state_vars[1] = 0; ctx->transform_state_vars[2] = 0; ctx->transform_state_vars[3] = 0; + ctx->transform_state_vars[4] = 0; pipe_shader_buffer new_cs_ssbos[1]; new_cs_ssbos[0].buffer = q_parent->subqueries[sub_query].buffer; @@ -396,7 +417,7 @@ accumulate_result_gpu(struct d3d12_context *ctx, struct d3d12_query *q, key.query_resolve.timestamp_multiplier = d3d12_screen(ctx->base.screen)->timestamp_multiplier; ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, &key)); - pipe_shader_buffer new_cs_ssbos[4]; + pipe_shader_buffer new_cs_ssbos[5]; uint32_t num_ssbos = 0; for (uint32_t i = 0; i < key.query_resolve.num_subqueries; ++i) { ctx->transform_state_vars[i] = q->subqueries[i].curr_query; @@ -407,7 +428,7 @@ accumulate_result_gpu(struct d3d12_context *ctx, struct d3d12_query *q, } assert(dst_offset % (key.query_resolve.is_64bit ? 8 : 4) == 0); - ctx->transform_state_vars[3] = dst_offset / (key.query_resolve.is_64bit ? 8 : 4); + ctx->transform_state_vars[4] = dst_offset / (key.query_resolve.is_64bit ? 
8 : 4); new_cs_ssbos[num_ssbos].buffer = dst; new_cs_ssbos[num_ssbos].buffer_offset = 0; @@ -670,7 +691,6 @@ d3d12_render_condition(struct pipe_context *pctx, return; } - assert(num_sub_queries(query->type, query->index) == 1); if (!query->predicate) query->predicate = d3d12_resource(pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_DEFAULT, sizeof(uint64_t))); diff --git a/src/gallium/drivers/d3d12/d3d12_query.h b/src/gallium/drivers/d3d12/d3d12_query.h index f40dd8b0e44..d0e431a0162 100644 --- a/src/gallium/drivers/d3d12/d3d12_query.h +++ b/src/gallium/drivers/d3d12/d3d12_query.h @@ -41,7 +41,7 @@ d3d12_validate_queries(struct d3d12_context *ctx); void d3d12_enable_predication(struct d3d12_context *ctx); -constexpr unsigned MAX_SUBQUERIES = 3; +constexpr unsigned MAX_SUBQUERIES = 4; struct d3d12_query_impl { ID3D12QueryHeap* query_heap; diff --git a/src/gallium/drivers/d3d12/d3d12_screen.cpp b/src/gallium/drivers/d3d12/d3d12_screen.cpp index 99a084508be..5ae88e1f3dd 100644 --- a/src/gallium/drivers/d3d12/d3d12_screen.cpp +++ b/src/gallium/drivers/d3d12/d3d12_screen.cpp @@ -343,6 +343,8 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_GL_SPIRV: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_SHADER_GROUP_VOTE: + case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_QUERY_SO_OVERFLOW: return 1; case PIPE_CAP_QUERY_BUFFER_OBJECT: diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index 579ff51e20f..cd574eb9301 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -6228,7 +6228,8 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts) NIR_PASS(progress, s, dxil_nir_lower_16bit_conv); NIR_PASS(progress, s, nir_opt_remove_phis); NIR_PASS(progress, s, nir_opt_dce); - NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false); + NIR_PASS(progress, s, nir_opt_if, + 
nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);