anv: enable ray queries

Only on platforms that support it.

v3: Split out code setting up ray query shadow buffer (Caio)
    Don't forget to set up ray query globals even when no shadow buffer
    is used (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13719>
Author: Lionel Landwerlin
Date: 2021-06-08 16:24:54 +03:00
Committed-by: Marge Bot
Parent: c78be5da30
Commit: 5d3e419378
8 changed files with 208 additions and 10 deletions
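
The feature bit wired up in this series is what applications see through the standard Vulkan query. A minimal probe sketch (hypothetical application-side code, not part of this commit):

    #include <stdbool.h>
    #include <vulkan/vulkan.h>

    /* Ask the driver whether VK_KHR_ray_query is usable on this device. */
    static bool supports_ray_query(VkPhysicalDevice pdev)
    {
       VkPhysicalDeviceRayQueryFeaturesKHR rq = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR,
       };
       VkPhysicalDeviceFeatures2 features = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
          .pNext = &rq,
       };
       vkGetPhysicalDeviceFeatures2(pdev, &features);
       return rq.rayQuery == VK_TRUE;
    }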

src/intel/dev/intel_device_info.h

@@ -136,6 +136,7 @@ struct intel_device_info
bool has_aux_map;
bool has_tiling_uapi;
bool has_ray_tracing;
bool has_ray_query;
bool has_local_mem;
bool has_lsc;
bool has_mesh_shading;

src/intel/vulkan/anv_cmd_buffer.c

@@ -483,6 +483,78 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage);
}
static inline uint32_t
ilog2_round_up(uint32_t value)
{
assert(value != 0);
return 32 - __builtin_clz(value - 1);
}
static void
anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipeline_state,
struct anv_pipeline *pipeline,
VkShaderStageFlags stages)
{
struct anv_device *device = cmd_buffer->device;
uint64_t ray_shadow_size =
align_u64(brw_rt_ray_queries_shadow_stacks_size(&device->info,
pipeline->ray_queries),
4096);
if (ray_shadow_size > 0 &&
(!cmd_buffer->state.ray_query_shadow_bo ||
cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
unsigned shadow_size_log2 = MAX2(ilog2_round_up(ray_shadow_size), 16);
unsigned bucket = shadow_size_log2 - 16;
assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos));
struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
ray_shadow_size,
ANV_BO_ALLOC_LOCAL_MEM, /* alloc_flags */
0, /* explicit_address */
&new_bo);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo);
if (bo != NULL) {
anv_device_release_bo(device, bo);
} else {
bo = new_bo;
}
}
cmd_buffer->state.ray_query_shadow_bo = bo;
/* Add the ray query buffers to the batch list. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->batch.alloc,
cmd_buffer->state.ray_query_shadow_bo);
}
/* Add the HW buffer to the list of BO used. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->batch.alloc,
device->ray_query_bo);
/* Fill the push constants & mark them dirty. */
struct anv_state ray_query_global_state =
anv_genX(&device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
struct anv_address ray_query_globals_addr = (struct anv_address) {
.bo = device->dynamic_state_pool.block_pool.bo,
.offset = ray_query_global_state.offset,
};
pipeline_state->push_constants.ray_query_globals =
anv_address_physical(ray_query_globals_addr);
cmd_buffer->state.push_constants_dirty |= stages;
}
void anv_CmdBindPipeline(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
@@ -490,6 +562,8 @@ void anv_CmdBindPipeline(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
struct anv_cmd_pipeline_state *state;
VkShaderStageFlags stages = 0;
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE: {
@@ -502,6 +576,9 @@ void anv_CmdBindPipeline(
cmd_buffer->state.compute.pipeline_dirty = true;
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
&compute_pipeline->cs->bind_map);
state = &cmd_buffer->state.compute.base;
stages = VK_SHADER_STAGE_COMPUTE_BIT;
break;
}
@@ -525,6 +602,9 @@ void anv_CmdBindPipeline(
anv_dynamic_state_copy(&cmd_buffer->state.gfx.dynamic,
&gfx_pipeline->dynamic_state,
gfx_pipeline->dynamic_state_mask);
state = &cmd_buffer->state.gfx.base;
stages = gfx_pipeline->active_stages;
break;
}
@@ -541,6 +621,8 @@ void anv_CmdBindPipeline(
anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
rt_pipeline->stack_size);
}
state = &cmd_buffer->state.rt.base;
break;
}
@@ -548,6 +630,9 @@ void anv_CmdBindPipeline(
assert(!"invalid bind point");
break;
}
if (pipeline->ray_queries > 0)
anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
}
void anv_CmdSetRasterizerDiscardEnableEXT(
@@ -1675,13 +1760,6 @@ void anv_CmdSetFragmentShadingRateKHR(
}
}
static inline uint32_t
ilog2_round_up(uint32_t value)
{
assert(value != 0);
return 32 - __builtin_clz(value - 1);
}
void anv_CmdSetRayTracingPipelineStackSizeKHR(
VkCommandBuffer commandBuffer,
uint32_t pipelineStackSize)

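anv_cmd_buffer_set_ray_query_buffer above caches one shadow BO per power-of-two size class, with bucket 0 covering everything up to 2^16 bytes. A standalone sketch of the index math, using the same helper as the diff (the 100 KiB size is a hypothetical example):

    #include <assert.h>
    #include <stdint.h>

    /* Same helper as in the diff: log2 of value, rounded up. */
    static inline uint32_t ilog2_round_up(uint32_t value)
    {
       assert(value != 0);
       return 32 - __builtin_clz(value - 1);
    }

    int main(void)
    {
       uint32_t ray_shadow_size = 100 * 1024;                    /* 100 KiB */
       uint32_t log2_size = ilog2_round_up(ray_shadow_size);     /* 17 */
       uint32_t bucket = (log2_size < 16 ? 16 : log2_size) - 16; /* 1 */
       /* Bucket 1 serves sizes in (64 KiB, 128 KiB]; anything smaller
        * lands in bucket 0 because of the MAX2(..., 16) clamp. */
       return (int)bucket;
    }
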
src/intel/vulkan/anv_device.c

@@ -221,6 +221,7 @@ get_device_extensions(const struct anv_physical_device *device,
device->use_call_secondary,
.KHR_pipeline_executable_properties = true,
.KHR_push_descriptor = true,
.KHR_ray_query = device->info.has_ray_tracing,
.KHR_relaxed_block_layout = true,
.KHR_sampler_mirror_clamp_to_edge = true,
.KHR_sampler_ycbcr_conversion = true,
@@ -1640,6 +1641,12 @@ void anv_GetPhysicalDeviceFeatures2(
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR: {
VkPhysicalDeviceRayQueryFeaturesKHR *features = (void *)ext;
features->rayQuery = pdevice->info.has_ray_tracing;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
features->robustBufferAccess2 = true;
@@ -3331,9 +3338,22 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
result = anv_device_init_trivial_batch(device);
if (device->vk.enabled_extensions.KHR_ray_query) {
uint32_t ray_queries_size =
align_u32(brw_rt_ray_queries_hw_stacks_size(&device->info), 4096);
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_LOCAL_MEM,
0 /* explicit_address */,
&device->ray_query_bo);
if (result != VK_SUCCESS)
goto fail_workaround_bo;
}
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
if (device->info.ver >= 12 &&
device->vk.enabled_extensions.KHR_fragment_shading_rate) {
@@ -3403,6 +3423,9 @@ VkResult anv_CreateDevice(
anv_scratch_pool_finish(device, &device->scratch_pool);
fail_trivial_batch:
anv_device_release_bo(device, device->trivial_batch_bo);
fail_ray_query_bo:
if (device->ray_query_bo)
anv_device_release_bo(device, device->ray_query_bo);
fail_workaround_bo:
anv_device_release_bo(device, device->workaround_bo);
fail_surface_aux_map_pool:
@@ -3487,6 +3510,13 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) {
if (device->ray_query_shadow_bos[i] != NULL)
anv_device_release_bo(device, device->ray_query_shadow_bos[i]);
}
anv_device_release_bo(device, device->ray_query_bo);
}
anv_device_release_bo(device, device->workaround_bo);
anv_device_release_bo(device, device->trivial_batch_bo);

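The ray_query_bo allocation above is keyed off the extension actually being enabled at device-creation time. A minimal application-side sketch of enabling it (queue setup and required companion extensions such as VK_KHR_acceleration_structure are omitted for brevity; names are hypothetical):

    #include <vulkan/vulkan.h>

    static VkResult create_device_with_ray_query(VkPhysicalDevice pdev,
                                                 VkDevice *out)
    {
       const char *exts[] = { VK_KHR_RAY_QUERY_EXTENSION_NAME };
       VkPhysicalDeviceRayQueryFeaturesKHR rq = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR,
          .rayQuery = VK_TRUE,
       };
       /* Chain the feature struct so the driver turns the feature on. */
       VkDeviceCreateInfo info = {
          .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
          .pNext = &rq,
          .enabledExtensionCount = 1,
          .ppEnabledExtensionNames = exts,
       };
       return vkCreateDevice(pdev, &info, NULL, out);
    }
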
src/intel/vulkan/anv_genX.h

@@ -119,6 +119,8 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct intel_l3_config *l3_config,

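This header is compiled once per hardware generation, so genX(cmd_buffer_ray_query_globals) resolves to a per-gen symbol, and the anv_genX(...) call site in anv_cmd_buffer.c picks the variant matching the device at runtime. A toy model of that dispatch pattern (hypothetical names, not mesa's actual macros):

    #include <stdio.h>

    struct dev_info { int verx10; };

    /* In mesa, the same source is built once per gen with a different
     * genX() prefix; here two variants stand in for that. */
    static const char *gfx120_ray_query_globals(void) { return "gfx12.0 path"; }
    static const char *gfx125_ray_query_globals(void) { return "gfx12.5 path"; }

    /* Runtime selection on the device's generation (simplified). */
    #define toy_genX(devinfo, func) \
       ((devinfo)->verx10 >= 125 ? gfx125_##func : gfx120_##func)

    int main(void)
    {
       struct dev_info info = { .verx10 = 125 };
       printf("%s\n", toy_genX(&info, ray_query_globals)());
       return 0;
    }
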
src/intel/vulkan/anv_nir_apply_pipeline_layout.c

@@ -34,6 +34,8 @@
#define MAX_SAMPLER_TABLE_SIZE 128
#define BINDLESS_OFFSET 255
#define sizeof_field(type, field) sizeof(((type *)0)->field)
struct apply_pipeline_layout_state {
const struct anv_physical_device *pdevice;
@@ -1322,6 +1324,21 @@ lower_tex(nir_builder *b, nir_tex_instr *tex,
return true;
}
static bool
lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin,
struct apply_pipeline_layout_state *state)
{
b->cursor = nir_instr_remove(&intrin->instr);
nir_ssa_def *rq_globals =
nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0),
.base = offsetof(struct anv_push_constants, ray_query_globals),
.range = sizeof_field(struct anv_push_constants, ray_query_globals));
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, rq_globals);
return true;
}
static bool
apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
{
@@ -1360,6 +1377,8 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
return lower_image_intrinsic(b, intrin, state);
case nir_intrinsic_load_constant:
return lower_load_constant(b, intrin, state);
case nir_intrinsic_load_ray_query_global_intel:
return lower_ray_query_globals(b, intrin, state);
default:
return false;
}

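lower_ray_query_globals above rewrites the intel-specific intrinsic into an ordinary push-constant load whose base and range come from offsetof and the new sizeof_field macro. A self-contained illustration of those two (the struct is a hypothetical stand-in for anv_push_constants):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Size of a single struct member, without needing an instance. */
    #define sizeof_field(type, field) sizeof(((type *)0)->field)

    struct toy_push_constants {
       uint64_t push_reg_mask[4];
       uint64_t ray_query_globals;   /* mirrors the new anv field */
    };

    int main(void)
    {
       printf("base=%zu range=%zu\n",
              offsetof(struct toy_push_constants, ray_query_globals),
              sizeof_field(struct toy_push_constants, ray_query_globals));
       return 0;
    }
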
src/intel/vulkan/anv_pipeline.c

@@ -142,6 +142,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
.post_depth_coverage = pdevice->info.ver >= 9,
.runtime_descriptor_array = true,
.float_controls = pdevice->info.ver >= 8,
.ray_query = pdevice->info.has_ray_tracing,
.ray_tracing = pdevice->info.has_ray_tracing,
.shader_clock = true,
.shader_viewport_index_layer = true,
@@ -871,6 +872,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
nir_address_format_32bit_offset);
NIR_PASS_V(nir, brw_nir_lower_ray_queries, &pdevice->info);
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
anv_nir_apply_pipeline_layout(pdevice,
pipeline->device->robust_buffer_access,
@@ -1485,6 +1488,8 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
} else {
anv_pipeline_add_executable(pipeline, stage, bin->stats, 0);
}
pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries);
}
static uint32_t

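pipeline->ray_queries above ends up as the maximum ray-query count over all compiled stages, which is the value that later sizes the shadow buffer in anv_cmd_buffer_set_ray_query_buffer. A trivial sketch of that accumulation (the per-stage counts are hypothetical):

    #define MAX2(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
       unsigned stage_ray_queries[] = { 0, 2, 1 };   /* e.g. vs, fs, cs */
       unsigned pipeline_ray_queries = 0;
       for (unsigned i = 0; i < 3; i++)
          pipeline_ray_queries = MAX2(pipeline_ray_queries, stage_ray_queries[i]);
       return (int)pipeline_ray_queries;              /* 2 */
    }
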
src/intel/vulkan/anv_private.h

@@ -1210,6 +1210,21 @@ struct anv_device {
struct anv_scratch_pool scratch_pool;
struct anv_bo *rt_scratch_bos[16];
/** Shadow ray query BO
*
* The ray_query_bo only holds the current ray being traced. When using
* more than 1 ray query per thread, we cannot fit all the queries in
* there, so we need another buffer to hold query data that is not
* currently being used by the HW for tracing, similar to a scratch space.
*
* The size of the shadow buffer depends on the number of queries per
* shader.
*/
struct anv_bo *ray_query_shadow_bos[16];
/** Ray query buffer used to communicate with the HW unit.
*/
struct anv_bo *ray_query_bo;
struct anv_shader_bin *rt_trampoline;
struct anv_shader_bin *rt_trivial_return;
@@ -2618,8 +2633,8 @@ struct anv_push_constants {
/* Robust access pushed registers. */
uint64_t push_reg_mask[MESA_SHADER_STAGES];
/** Pad out to a multiple of 32 bytes */
uint32_t pad[2];
/** Ray query globals (RT_DISPATCH_GLOBALS) */
uint64_t ray_query_globals;
/* Base addresses for descriptor sets */
uint64_t desc_sets[MAX_SETS];
@@ -3105,6 +3120,11 @@ struct anv_cmd_state {
struct anv_state null_surface_state;
struct anv_dynamic_render_pass dynamic_render_pass;
/**
* A buffer used for spill/fill of ray queries.
*/
struct anv_bo * ray_query_shadow_bo;
};
struct anv_cmd_pool {
@@ -3463,6 +3483,8 @@ struct anv_pipeline {
enum anv_pipeline_type type;
VkPipelineCreateFlags flags;
uint32_t ray_queries;
struct util_dynarray executables;
const struct intel_l3_config * l3_config;

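The anv_push_constants change swaps the two uint32_t padding words for one uint64_t of identical size, so the overall layout (including the multiple-of-32-bytes total the old comment guarded) is unchanged. A compile-time check of that size equivalence on a reduced model (hypothetical array lengths, not anv's real layout):

    #include <stdint.h>

    /* Reduced models of the struct before and after this commit. */
    struct before { uint64_t push_reg_mask[4]; uint32_t pad[2];
                    uint64_t desc_sets[8]; };
    struct after  { uint64_t push_reg_mask[4]; uint64_t ray_query_globals;
                    uint64_t desc_sets[8]; };

    _Static_assert(sizeof(struct before) == sizeof(struct after),
                   "uint64_t replaces uint32_t pad[2] without moving anything");

    int main(void) { return 0; }
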
src/intel/vulkan/genX_cmd_buffer.c

@@ -5461,6 +5461,47 @@ void genX(CmdDispatchIndirect)(
trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0);
}
struct anv_state
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
struct anv_device *device = cmd_buffer->device;
struct anv_state state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
BRW_RT_DISPATCH_GLOBALS_SIZE,
64);
struct brw_rt_scratch_layout layout;
uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
* some cases?
*/
brw_rt_compute_scratch_layout(&layout, &device->info,
stack_ids_per_dss, 1 << 10);
struct GFX_RT_DISPATCH_GLOBALS rtdg = {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the buffer, so
* set the address to the end of the buffer.
*/
.bo = device->ray_query_bo,
.offset = device->ray_query_bo->size
},
.AsyncRTStackSize = layout.ray_stack_stride / 64,
.NumDSSRTStacks = layout.stack_ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
.ResumeShaderTable = (struct anv_address) {
.bo = cmd_buffer->state.ray_query_shadow_bo,
},
};
GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg);
return state;
#else
unreachable("Not supported");
#endif
}
#if GFX_VERx10 >= 125
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
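
The MemBaseAddress comment in genX(cmd_buffer_ray_query_globals) explains why the driver programs the address one past the end of ray_query_bo: the ray query hardware applies its offsets downward from the top of the buffer. A toy illustration of that convention (the stride and slot numbering are hypothetical, not the HW's actual layout):

    #include <stdint.h>
    #include <stdio.h>

    /* Slots addressed downward from 'top', per the comment above. */
    static uint64_t slot_addr(uint64_t bo_addr, uint64_t bo_size,
                              uint64_t stride, unsigned slot)
    {
       uint64_t top = bo_addr + bo_size;  /* programmed as MemBaseAddress */
       return top - (uint64_t)(slot + 1) * stride;
    }

    int main(void)
    {
       /* Example: 64 KiB buffer, 256-byte per-slot stride. */
       printf("slot 0 at 0x%llx\n",
              (unsigned long long)slot_addr(0x100000, 64 * 1024, 256, 0));
       return 0;
    }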