diff --git a/src/intel/compiler/brw_nir_rt.c b/src/intel/compiler/brw_nir_rt.c index ca0c43ebf6d..9c7f7bee01b 100644 --- a/src/intel/compiler/brw_nir_rt.c +++ b/src/intel/compiler/brw_nir_rt.c @@ -368,3 +368,126 @@ brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection, NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo); lower_rt_io_and_scratch(intersection); } + +static nir_ssa_def * +build_load_uniform(nir_builder *b, unsigned offset, + unsigned num_components, unsigned bit_size) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); + load->num_components = num_components; + load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); + nir_intrinsic_set_base(load, offset); + nir_intrinsic_set_range(load, num_components * bit_size / 8); + nir_ssa_dest_init(&load->instr, &load->dest, + num_components, bit_size, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +#define load_trampoline_param(b, name, num_components, bit_size) \ + build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \ + (num_components), (bit_size)) + +nir_shader * +brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler, + void *mem_ctx) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + const nir_shader_compiler_options *nir_options = + compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions; + + STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32); + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + nir_options, + "RT Ray-Gen Trampoline"); + ralloc_steal(mem_ctx, b.shader); + + b.shader->info.cs.local_size_variable = true; + + /* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are + * passed in as push constants in the first register. We deal with the + * raygen BSR address here; the global data we'll deal with later. 
+ */ + b.shader->num_uniforms = 32; /* = asserted size of the trampoline params */ + nir_ssa_def *raygen_bsr_addr = + load_trampoline_param(&b, raygen_bsr_addr, 1, 64); + nir_ssa_def *local_shift = + nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8)); + + nir_ssa_def *global_id = nir_load_work_group_id(&b, 32); + nir_ssa_def *simd_channel = nir_load_subgroup_invocation(&b); + nir_ssa_def *local_x = + nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0), + nir_channel(&b, local_shift, 0)); + nir_ssa_def *local_y = + nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0), + nir_channel(&b, local_shift, 1)); + nir_ssa_def *local_z = + nir_ubfe(&b, simd_channel, + nir_iadd(&b, nir_channel(&b, local_shift, 0), + nir_channel(&b, local_shift, 1)), + nir_channel(&b, local_shift, 2)); + nir_ssa_def *launch_id = + nir_iadd(&b, nir_ishl(&b, global_id, local_shift), + nir_vec3(&b, local_x, local_y, local_z)); + + nir_ssa_def *launch_size = nir_load_ray_launch_size(&b); + nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size))); + { + nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16, + nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */ + nir_channel(&b, launch_id, 0), + nir_channel(&b, launch_id, 1), + nir_channel(&b, launch_id, 2)), + 0xf /* write mask */); + + brw_nir_btd_spawn(&b, raygen_bsr_addr); + } + nir_push_else(&b, NULL); + { + /* Even though these invocations aren't being used for anything, the + * hardware allocated stack IDs for them. They must retire those IDs. + */ + brw_nir_btd_retire(&b); + } + nir_pop_if(&b, NULL); + + nir_shader *nir = b.shader; + nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline"); + nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline"); + brw_preprocess_nir(compiler, nir, NULL); + + NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo); + + /* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr + * intrinsic which doesn't exist in compute shaders. 
We also created one + * above when we generated the BTD spawn intrinsic. Now we go through and + * replace them with a uniform load. + */ + nir_foreach_block(block, b.impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel) + continue; + + b.cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *global_arg_addr = + load_trampoline_param(&b, rt_disp_globals_addr, 1, 64); + assert(intrin->dest.is_ssa); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(global_arg_addr)); + nir_instr_remove(instr); + } + } + + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics); + + brw_nir_optimize(nir, compiler, true, false); + + return nir; +} diff --git a/src/intel/compiler/brw_nir_rt.h b/src/intel/compiler/brw_nir_rt.h index a16f6be47fa..baff0e60748 100644 --- a/src/intel/compiler/brw_nir_rt.h +++ b/src/intel/compiler/brw_nir_rt.h @@ -64,6 +64,9 @@ void brw_nir_lower_intersection_shader(nir_shader *intersection, const struct gen_device_info *devinfo); nir_shader * +brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler, + void *mem_ctx); +nir_shader * brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler, void *mem_ctx); diff --git a/src/intel/compiler/brw_rt.h b/src/intel/compiler/brw_rt.h index 330abf4dd16..eebb29b1f1b 100644 --- a/src/intel/compiler/brw_rt.h +++ b/src/intel/compiler/brw_rt.h @@ -96,6 +96,39 @@ struct brw_rt_scratch_layout { uint64_t total_size; }; +/** Parameters passed to the raygen trampoline shader + * + * This struct is carefully constructed to be 32B and must be passed to the + * raygen trampoline shader as inline constant data. 
+ */ +struct brw_rt_raygen_trampoline_params { + /** The GPU address of the RT_DISPATCH_GLOBALS */ + uint64_t rt_disp_globals_addr; + + /** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */ + uint64_t raygen_bsr_addr; + + /** 1 if this is an indirect dispatch, 0 otherwise */ + uint8_t is_indirect; + + /** The integer log2 of the local group size + * + * Ray-tracing shaders don't have a concept of local vs. global workgroup + * size. They only have a single 3D launch size. The raygen trampoline + * shader is always dispatched with a local workgroup size equal to the + * SIMD width but the shape of the local workgroup is determined at + * dispatch time based on the shape of the launch and passed to the + * trampoline via this field. (There's no sense having a Z dimension on + * the local workgroup if the launch is 2D.) + * + * We use the integer log2 of the size because there's no point in + * non-power-of-two sizes and shifts are cheaper than division. + */ + uint8_t local_group_size_log2[3]; + + uint32_t pad[3]; /* explicit padding up to the 32B struct size */ +}; + /** Size of the "hot zone" in bytes * * The hot zone is a SW-defined data structure which is a single uvec4