From 5fd7df4aa2c36726d190ba06c6ce9f1c01da5d4a Mon Sep 17 00:00:00 2001
From: George Ouzounoudis
Date: Wed, 16 Nov 2022 22:29:57 +0200
Subject: [PATCH] nvk: Support for vertex shader transform feedback

For starters, we support only vertex shader output transform feedback.
Optional support for queries, streams and indirect draw is left out for
now.

Pipeline and shader state is based on nvc0 code. Most of the shader
state is going to change with the new compiler.

Required support for pause/resume is implemented with an MME function
that loads the offset indirectly from the counter buffer on resume. For
pause, we store the offset indirectly with a SET_REPORT_SEMAPHORE.

Part-of:
---
 src/nouveau/vulkan/nvk_cmd_buffer.c        | 166 +++++++++++++++++++++
 src/nouveau/vulkan/nvk_graphics_pipeline.c |  29 ++++
 src/nouveau/vulkan/nvk_mme.c               |   1 +
 src/nouveau/vulkan/nvk_mme.h               |   2 +
 src/nouveau/vulkan/nvk_physical_device.c   |  19 +++
 src/nouveau/vulkan/nvk_pipeline.c          |   1 +
 src/nouveau/vulkan/nvk_shader.c            |  68 +++++++++++
 src/nouveau/vulkan/nvk_shader.h            |   9 ++
 8 files changed, 295 insertions(+)

diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index df663cde516..9e8dc5f57d1 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@@ -8,11 +8,14 @@
 #include "nvk_device_memory.h"
 #include "nvk_pipeline.h"
 #include "nvk_physical_device.h"
+#include "nvk_buffer.h"
+#include "nvk_mme.h"
 
 #include "nouveau_context.h"
 
 #include "nouveau/nouveau.h"
 
+#include "nvk_clc597.h"
 #include "nvk_cl90b5.h"
 #include "nvk_cla0c0.h"
 
@@ -516,3 +519,166 @@ nvk_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
    nvk_push_descriptor_set_update_template(push_set, set_layout, template,
                                            pData);
 }
+
+/* Implements vkCmdBindTransformFeedbackBuffersEXT.
+ *
+ * For each of the bindingCount buffers, enables stream-out buffer slot
+ * firstBinding + i and programs its address and 32-bit size.  When pSizes is
+ * NULL, VK_WHOLE_SIZE is passed to nvk_buffer_addr_range() for every binding.
+ */
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
+                                       uint32_t firstBinding,
+                                       uint32_t bindingCount,
+                                       const VkBuffer* pBuffers,
+                                       const VkDeviceSize* pOffsets,
+                                       const VkDeviceSize* pSizes)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
+      uint32_t idx = firstBinding + i;
+      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
+      struct nvk_addr_range addr_range =
+         nvk_buffer_addr_range(buffer, pOffsets[i], size);
+      assert(addr_range.range <= UINT32_MAX);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
+
+      P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
+      P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
+      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
+      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
+      P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
+   }
+
+   // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
+}
+
+/* MME macro that loads a stream-out write pointer from a counter buffer.
+ *
+ * Macro parameters, in order: the 64-bit counter address and the stream-out
+ * buffer index.  It requests a FIFO'd read of one element at that address,
+ * loads the result and emits it to SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER
+ * for that buffer.  Nothing is built for 3D classes older than TURING_A.
+ */
+void
+nvk_mme_xfb_counter_load(struct mme_builder *b)
+{
+   if (b->devinfo->cls_eng3d < TURING_A)
+      return;
+
+   struct mme_value64 counter_addr = mme_load_addr64(b);
+   struct mme_value buffer = mme_load(b);
+
+   mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
+   mme_free_reg(b, counter_addr.lo);
+   mme_free_reg(b, counter_addr.hi);
+
+   struct mme_value counter = mme_load(b);
+   mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
+   mme_emit(b, counter);
+
+   mme_free_reg(b, counter);
+   mme_free_reg(b, buffer);
+}
+
+/* Implements vkCmdBeginTransformFeedbackEXT.
+ *
+ * Enables stream output and resets the write pointer of all four stream-out
+ * buffers to 0.  Then, for every non-null counter buffer, calls the
+ * NVK_MME_XFB_COUNTER_LOAD macro so buffer firstCounterBuffer + i gets its
+ * write pointer from the counter buffer instead.  pCounterBuffers may be
+ * NULL, in which case every buffer starts at offset 0.
+ */
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                                 uint32_t firstCounterBuffer,
+                                 uint32_t counterBufferCount,
+                                 const VkBuffer* pCounterBuffers,
+                                 const VkDeviceSize* pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   const uint32_t max_buffers = 4;
+
+   /* TODO: pre-Turing transform feedback */
+   assert(nvk_cmd_buffer_device(cmd)->ctx->eng3d.cls >= TURING_A);
+
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2+2*max_buffers);
+
+   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
+   for (uint32_t i = 0; i < max_buffers; ++i) {
+      P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
+   }
+
+   for (uint32_t i = 0; i < counterBufferCount; ++i) {
+      if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
+         continue;
+
+      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
+      // index of counter buffer corresponds to index of transform buffer
+      uint32_t cb_idx = firstCounterBuffer + i;
+      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
+      uint64_t cb_addr = nvk_buffer_address(buffer, offset);
+
+      p = nvk_cmd_buffer_push(cmd, 6);
+      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
+      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
+      P_INLINE_DATA(p, cb_addr >> 32);
+      P_INLINE_DATA(p, cb_addr);
+      P_INLINE_DATA(p, cb_idx);
+   }
+}
+
+/* Implements vkCmdEndTransformFeedbackEXT.
+ *
+ * For every non-null counter buffer, emits a one-word report semaphore that
+ * writes the streaming byte count of buffer firstCounterBuffer + i to the
+ * counter buffer, then disables stream output.  pCounterBuffers may be NULL,
+ * in which case no counter is written.
+ */
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                               uint32_t firstCounterBuffer,
+                               uint32_t counterBufferCount,
+                               const VkBuffer* pCounterBuffers,
+                               const VkDeviceSize* pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+
+   for (uint32_t i = 0; i < counterBufferCount; ++i) {
+      if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
+         continue;
+
+      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
+      uint32_t cb_idx = firstCounterBuffer + i;
+      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
+      uint64_t cb_addr = nvk_buffer_address(buffer, offset);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
+      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
+      P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
+      P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
+      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
+      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
+         .operation = OPERATION_REPORT_ONLY,
+         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
+         .report = REPORT_STREAMING_BYTE_COUNT,
+         .sub_report = cb_idx, /* report the matching buffer, not buffer 0 */
+         .structure_size = STRUCTURE_SIZE_ONE_WORD,
+      });
+   }
+
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, counterBufferCount ? 4 : 2);
+   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
+
+   // TODO: this probably needs to move to CmdPipelineBarrier
+   if (counterBufferCount > 0) {
+      P_MTHD(p, NVA0C0, INVALIDATE_SHADER_CACHES_NO_WFI);
+      P_NVA0C0_INVALIDATE_SHADER_CACHES_NO_WFI(p, {
+         .constant = CONSTANT_TRUE
+      });
+   }
+}
+
diff --git a/src/nouveau/vulkan/nvk_graphics_pipeline.c b/src/nouveau/vulkan/nvk_graphics_pipeline.c
index 8f36017c3f8..04f0ce2c91b 100644
--- a/src/nouveau/vulkan/nvk_graphics_pipeline.c
+++ b/src/nouveau/vulkan/nvk_graphics_pipeline.c
@@ -175,6 +175,31 @@ emit_pipeline_cb_state(struct nv_push *p,
    }
 }
 
+/* Emits the per-buffer stream-out state of a pipeline: the stream, component
+ * count and stride of each of the four buffers, followed by the packed
+ * one-byte varying indices, uploaded as whole 32-bit words.
+ */
+static void
+emit_pipeline_xfb_state(struct nv_push *p,
+                        const struct nvk_transform_feedback_state *xfb)
+{
+   const uint8_t max_buffers = 4;
+   for (uint8_t b = 0; b < max_buffers; ++b) {
+      const uint32_t var_count = xfb->varying_count[b];
+      P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
+      P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
+      P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, var_count);
+      P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
+
+      /* upload packed varying indices in multiples of 4 bytes */
+      const uint32_t n = (var_count + 3) / 4;
+      if (n > 0) {
+         P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
+         P_INLINE_ARRAY(p, (const uint32_t*)xfb->varying_index[b], n);
+      }
+   }
+}
+
 static const uint32_t mesa_to_nv9097_shader_type[] = {
    [MESA_SHADER_VERTEX]    = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
    [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
@@ -359,6 +384,10 @@ nvk_graphics_pipeline_create(struct nvk_device *device,
                  CONTROL_V_SELECTS_LAYER,
    });
 
+   if (last_geom->xfb) {
+      emit_pipeline_xfb_state(&push, last_geom->xfb);
+   }
+
    if (state.ts) emit_pipeline_ts_state(&push, state.ts);
    if (state.vp) emit_pipeline_vp_state(&push, state.vp);
    if (state.rs) emit_pipeline_rs_state(&push, state.rs);
diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c
index 
dd24e0e04c7..f9a2fb1b759 100644
--- a/src/nouveau/vulkan/nvk_mme.c
+++ b/src/nouveau/vulkan/nvk_mme.c
@@ -15,6 +15,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
    [NVK_MME_DISPATCH_INDIRECT]    = nvk_mme_dispatch_indirect,
    [NVK_MME_WRITE_CS_INVOCATIONS] = nvk_mme_write_cs_invocations,
    [NVK_MME_COPY_QUERIES]         = nvk_mme_copy_queries,
+   [NVK_MME_XFB_COUNTER_LOAD]     = nvk_mme_xfb_counter_load,
 };
 
 uint32_t *
diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h
index ef14b3c0acd..b5e0a1cffbe 100644
--- a/src/nouveau/vulkan/nvk_mme.h
+++ b/src/nouveau/vulkan/nvk_mme.h
@@ -18,6 +18,7 @@ enum nvk_mme {
    NVK_MME_DISPATCH_INDIRECT,
    NVK_MME_WRITE_CS_INVOCATIONS,
    NVK_MME_COPY_QUERIES,
+   NVK_MME_XFB_COUNTER_LOAD,
    NVK_MME_COUNT,
 };
 
@@ -48,5 +49,6 @@ void nvk_mme_add_cs_invocations(struct mme_builder *b);
 void nvk_mme_dispatch_indirect(struct mme_builder *b);
 void nvk_mme_write_cs_invocations(struct mme_builder *b);
 void nvk_mme_copy_queries(struct mme_builder *b);
+void nvk_mme_xfb_counter_load(struct mme_builder *b);
 
 #endif /* NVK_MME_H */
diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c
index 0e49064c3d7..dcaae204374 100644
--- a/src/nouveau/vulkan/nvk_physical_device.c
+++ b/src/nouveau/vulkan/nvk_physical_device.c
@@ -228,6 +228,20 @@ nvk_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          p->robustUniformBufferAccessSizeAlignment = NVK_MIN_UBO_ALIGNMENT;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
+         VkPhysicalDeviceTransformFeedbackPropertiesEXT *p = (void *)ext;
+         p->maxTransformFeedbackStreams = 1; /* must be 1: no geometryStreams */
+         p->maxTransformFeedbackBuffers = 4;
+         p->maxTransformFeedbackBufferSize = UINT32_MAX;
+         p->maxTransformFeedbackStreamDataSize = 2048;
+         p->maxTransformFeedbackBufferDataSize = 512;
+         p->maxTransformFeedbackBufferDataStride = 2048;
+         p->transformFeedbackQueries = false;
+         p->transformFeedbackStreamsLinesTriangles = false;
+         p->transformFeedbackRasterizationStreamSelect = false;
+         p->transformFeedbackDraw = false;
+         break;
+      }
       /* More property structs */
       default:
          break;
@@ -299,6 +313,7 @@ nvk_get_device_extensions(const struct nv_device_info *dev,
       .EXT_robustness2 = true,
       .EXT_sample_locations = dev->cls_eng3d >= MAXWELL_B,
       .EXT_separate_stencil_usage = true,
+      .EXT_transform_feedback = dev->cls_eng3d >= TURING_A,
       .EXT_vertex_attribute_divisor = true,
       .EXT_vertex_input_dynamic_state = true,
    };
@@ -466,6 +481,10 @@ nvk_get_device_features(const struct nv_device_info *dev,
       .robustImageAccess2 = true,
       .nullDescriptor = true,
 
+      /* VK_EXT_transform_feedback */
+      .transformFeedback = dev->cls_eng3d >= TURING_A,
+      .geometryStreams = false,
+
       /* VK_EXT_vertex_attribute_divisor */
       .vertexAttributeInstanceRateDivisor = true,
       .vertexAttributeInstanceRateZeroDivisor = true,
diff --git a/src/nouveau/vulkan/nvk_pipeline.c b/src/nouveau/vulkan/nvk_pipeline.c
index 33561428996..08bba2d2287 100644
--- a/src/nouveau/vulkan/nvk_pipeline.c
+++ b/src/nouveau/vulkan/nvk_pipeline.c
@@ -34,6 +34,7 @@ nvk_pipeline_free(struct nvk_device *device,
                         pipeline->shaders[s].upload_addr,
                         pipeline->shaders[s].upload_size);
       }
+      free(pipeline->shaders[s].xfb);
    }
 
    vk_object_free(&device->vk, pAllocator, pipeline);
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index 687c69017ef..1c682bda1f0 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -11,6 +11,7 @@
 #include "nir.h"
 #include "nir_builder.h"
 #include "compiler/spirv/nir_spirv.h"
+#include "compiler/nir/nir_xfb_info.h"
 
 #include "nv50_ir_driver.h"
 
@@ -67,6 +68,7 @@ nvk_physical_device_spirv_options(const struct nvk_physical_device *pdevice,
          .draw_parameters = true,
          .image_write_without_format = true,
          .physical_storage_buffer_address = true,
+         .transform_feedback = true,
       },
       .ssbo_addr_format = nvk_buffer_addr_format(rs->storage_buffers),
       .phys_ssbo_addr_format = nir_address_format_64bit_global,
@@ -754,6 
+756,65 @@ nvk_fs_gen_header(struct nvk_shader *fs, struct nv50_ir_prog_info_out *info)
    return 0;
 }
 
+/* Maps an xfb output's location to the driver_location of its output slot. */
+static uint8_t find_register_index_for_xfb_output(const struct nir_shader *nir,
+                                                  nir_xfb_output_info output)
+{
+   nir_foreach_shader_out_variable(var, nir) {
+      uint32_t slots = glsl_count_vec4_slots(var->type, false, false);
+      for (uint32_t i = 0; i < slots; ++i) {
+         if (output.location == var->data.location + i)
+            return var->data.driver_location + i;
+      }
+   }
+   /* should not be reached */
+   return 0;
+}
+
+/* Build the stream-out state from nir->xfb_info; NULL on allocation failure */
+static struct nvk_transform_feedback_state *
+nvk_fill_transform_feedback_state(struct nir_shader *nir,
+                                  const struct nv50_ir_prog_info_out *info)
+{
+   const uint8_t max_buffers = 4;
+   const uint8_t dw_bytes = 4;
+   const struct nir_xfb_info *nx = nir->xfb_info;
+
+   struct nvk_transform_feedback_state *xfb =
+      malloc(sizeof(*xfb));
+   if (!xfb)
+      return NULL;
+
+   for (uint8_t b = 0; b < max_buffers; ++b) {
+      /* buffers_written is a bitmask of buffers, not a buffer count */
+      const bool written = nx->buffers_written & (1u << b);
+      xfb->stride[b] = written ? nx->buffers[b].stride : 0;
+      xfb->varying_count[b] = 0;
+      xfb->stream[b] = nx->buffer_to_stream[b];
+   }
+   memset(xfb->varying_index, 0xff, sizeof(xfb->varying_index)); /* = skip */
+
+   for (uint32_t i = 0; i < nx->output_count; ++i) {
+      const nir_xfb_output_info output = nx->outputs[i];
+      const uint8_t b = output.buffer;
+      const uint8_t r = find_register_index_for_xfb_output(nir, output);
+      uint32_t p = output.offset / dw_bytes;
+
+      assert(b < max_buffers && r < info->numOutputs);
+      u_foreach_bit(c, output.component_mask) {
+         assert(p < ARRAY_SIZE(xfb->varying_index[b])); /* check every write */
+         xfb->varying_index[b][p++] = info->out[r].slot[c];
+      }
+      xfb->varying_count[b] = MAX2(xfb->varying_count[b], p);
+   }
+
+   /* zero unused indices up to the next multiple of 4 */
+   for (uint8_t b = 0; b < max_buffers; ++b)
+      for (uint32_t c = xfb->varying_count[b]; c & 3; ++c)
+         xfb->varying_index[b][c] = 0;
+
+   return xfb;
+}
+
 VkResult
 nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
                 const struct nvk_fs_key *fs_key,
@@ -840,6 +901,13 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
    if (info_out.io.fp64)
       shader->hdr[0] |= 1 << 27;
 
+   if (nir->xfb_info) {
+      shader->xfb = nvk_fill_transform_feedback_state(nir, &info_out);
+      if (shader->xfb == NULL) {
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+   }
+
    return VK_SUCCESS;
 }
 
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index df35039a23a..8ef98a7d231 100644
--- a/src/nouveau/vulkan/nvk_shader.h
+++ b/src/nouveau/vulkan/nvk_shader.h
@@ -21,6 +21,13 @@ struct nvk_fs_key {
    bool force_per_sample;
 };
 
+struct nvk_transform_feedback_state {
+   uint32_t stride[4];            /* per-buffer stride, 0 if not written */
+   uint8_t stream[4];             /* stream assigned to each buffer */
+   uint8_t varying_count[4];      /* entries used in varying_index[b] */
+   uint8_t varying_index[4][128]; /* output slot per dword, 0xff = skip */
+};
+
 struct nvk_shader {
    gl_shader_stage stage;
 
@@ -66,6 +73,8 @@ struct nvk_shader {
       uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
       uint32_t block_size[3];
    } cp;
+
+   struct nvk_transform_feedback_state *xfb;
 };
 
 static inline uint64_t