diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index df663cde516..9e8dc5f57d1 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@@ -8,11 +8,14 @@
 #include "nvk_device_memory.h"
 #include "nvk_pipeline.h"
 #include "nvk_physical_device.h"
+#include "nvk_buffer.h"
+#include "nvk_mme.h"
 
 #include "nouveau_context.h"
 
 #include "nouveau/nouveau.h"
 
+#include "nvk_clc597.h"
 #include "nvk_cl90b5.h"
 #include "nvk_cla0c0.h"
 
@@ -516,3 +519,136 @@ nvk_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
    nvk_push_descriptor_set_update_template(push_set, set_layout, template,
                                            pData);
 }
+
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
+                                       uint32_t firstBinding,
+                                       uint32_t bindingCount,
+                                       const VkBuffer* pBuffers,
+                                       const VkDeviceSize* pOffsets,
+                                       const VkDeviceSize* pSizes)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
+      uint32_t idx = firstBinding + i;
+      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
+      struct nvk_addr_range addr_range =
+         nvk_buffer_addr_range(buffer, pOffsets[i], size);
+      assert(addr_range.range <= UINT32_MAX);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
+
+      P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
+      P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
+      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
+      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
+      P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
+   }
+
+   // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
+}
+
+void
+nvk_mme_xfb_counter_load(struct mme_builder *b)
+{
+   if (b->devinfo->cls_eng3d < TURING_A)
+      return;
+
+   struct mme_value64 counter_addr = mme_load_addr64(b);
+   struct mme_value buffer = mme_load(b);
+
+   mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
+   mme_free_reg(b, counter_addr.lo);
+   mme_free_reg(b, counter_addr.hi);
+
+   struct mme_value counter = mme_load(b);
+   mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
+   mme_emit(b, counter);
+
+   mme_free_reg(b, counter);
+   mme_free_reg(b, buffer);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                                 uint32_t firstCounterBuffer,
+                                 uint32_t counterBufferCount,
+                                 const VkBuffer* pCounterBuffers,
+                                 const VkDeviceSize* pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   const uint32_t max_buffers = 4;
+
+   /* TODO: pre-Turing transform feedback */
+   assert(nvk_cmd_buffer_device(cmd)->ctx->eng3d.cls >= TURING_A);
+
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2+2*max_buffers);
+
+   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
+   for (uint32_t i = 0; i < max_buffers; ++i) {
+      P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
+   }
+
+   for (uint32_t i = 0; i < counterBufferCount; ++i) {
+      if (pCounterBuffers[i] == VK_NULL_HANDLE)
+         continue;
+
+      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
+      // index of counter buffer corresponds to index of transform buffer
+      uint32_t cb_idx = firstCounterBuffer + i;
+      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
+      uint64_t cb_addr = nvk_buffer_address(buffer, offset);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
+      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
+      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
+      P_INLINE_DATA(p, cb_addr >> 32);
+      P_INLINE_DATA(p, cb_addr);
+      P_INLINE_DATA(p, cb_idx);
+   }
+}
+
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                               uint32_t firstCounterBuffer,
+                               uint32_t counterBufferCount,
+                               const VkBuffer* pCounterBuffers,
+                               const VkDeviceSize* pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+
+   for (uint32_t i = 0; i < counterBufferCount; ++i) {
+      if (pCounterBuffers[i] == VK_NULL_HANDLE)
+         continue;
+
+      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
+      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
+      uint64_t cb_addr = nvk_buffer_address(buffer, offset);
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
+      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
+      P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
+      P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
+      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
+      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
+         .operation = OPERATION_REPORT_ONLY,
+         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
+         .report = REPORT_STREAMING_BYTE_COUNT,
+         .structure_size = STRUCTURE_SIZE_ONE_WORD,
+      });
+   }
+
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, counterBufferCount ? 4 : 2);
+   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
+
+   // TODO: this probably needs to move to CmdPipelineBarrier
+   if (counterBufferCount > 0) {
+      P_MTHD(p, NVA0C0, INVALIDATE_SHADER_CACHES_NO_WFI);
+      P_NVA0C0_INVALIDATE_SHADER_CACHES_NO_WFI(p, {
+         .constant = CONSTANT_TRUE
+      });
+   }
+}
diff --git a/src/nouveau/vulkan/nvk_graphics_pipeline.c b/src/nouveau/vulkan/nvk_graphics_pipeline.c
index 8f36017c3f8..04f0ce2c91b 100644
--- a/src/nouveau/vulkan/nvk_graphics_pipeline.c
+++ b/src/nouveau/vulkan/nvk_graphics_pipeline.c
@@ -175,6 +175,27 @@ emit_pipeline_cb_state(struct nv_push *p,
    }
 }
 
+static void
+emit_pipeline_xfb_state(struct nv_push *p,
+                        const struct nvk_transform_feedback_state *xfb)
+{
+   const uint8_t max_buffers = 4;
+   for (uint8_t b = 0; b < max_buffers; ++b) {
+      const uint32_t var_count = xfb->varying_count[b];
+      P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
+      P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
+      P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, var_count);
+      P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
+
+      /* upload packed varying indices in multiples of 4 bytes */
+      const uint32_t n = (var_count + 3) / 4;
+      if (n > 0) {
+         P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
+         P_INLINE_ARRAY(p, (const uint32_t*)xfb->varying_index[b], n);
+      }
+   }
+}
+
 static const uint32_t mesa_to_nv9097_shader_type[] = {
    [MESA_SHADER_VERTEX] = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
    [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
@@ -359,6 +380,10 @@ nvk_graphics_pipeline_create(struct nvk_device *device,
       CONTROL_V_SELECTS_LAYER,
    });
 
+   if (last_geom->xfb) {
+      emit_pipeline_xfb_state(&push, last_geom->xfb);
+   }
+
    if (state.ts) emit_pipeline_ts_state(&push, state.ts);
    if (state.vp) emit_pipeline_vp_state(&push, state.vp);
    if (state.rs) emit_pipeline_rs_state(&push, state.rs);
diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c
index dd24e0e04c7..f9a2fb1b759 100644
--- a/src/nouveau/vulkan/nvk_mme.c
+++ b/src/nouveau/vulkan/nvk_mme.c
@@ -15,6 +15,7 @@ static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
    [NVK_MME_DISPATCH_INDIRECT] = nvk_mme_dispatch_indirect,
    [NVK_MME_WRITE_CS_INVOCATIONS] = nvk_mme_write_cs_invocations,
    [NVK_MME_COPY_QUERIES] = nvk_mme_copy_queries,
+   [NVK_MME_XFB_COUNTER_LOAD] = nvk_mme_xfb_counter_load,
 };
 
 uint32_t *
diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h
index ef14b3c0acd..b5e0a1cffbe 100644
--- a/src/nouveau/vulkan/nvk_mme.h
+++ b/src/nouveau/vulkan/nvk_mme.h
@@ -18,6 +18,7 @@ enum nvk_mme {
    NVK_MME_DISPATCH_INDIRECT,
    NVK_MME_WRITE_CS_INVOCATIONS,
    NVK_MME_COPY_QUERIES,
+   NVK_MME_XFB_COUNTER_LOAD,
 
    NVK_MME_COUNT,
 };
@@ -48,5 +49,6 @@ void nvk_mme_add_cs_invocations(struct mme_builder *b);
 void nvk_mme_dispatch_indirect(struct mme_builder *b);
 void nvk_mme_write_cs_invocations(struct mme_builder *b);
 void nvk_mme_copy_queries(struct mme_builder *b);
+void nvk_mme_xfb_counter_load(struct mme_builder *b);
 
 #endif /* NVK_MME_H */
diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c
index 0e49064c3d7..dcaae204374 100644
--- a/src/nouveau/vulkan/nvk_physical_device.c
+++ b/src/nouveau/vulkan/nvk_physical_device.c
@@ -228,6 +228,20 @@ nvk_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       p->robustUniformBufferAccessSizeAlignment = NVK_MIN_UBO_ALIGNMENT;
       break;
    }
+   case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
+      VkPhysicalDeviceTransformFeedbackPropertiesEXT *p = (void *)ext;
+      p->maxTransformFeedbackStreams = 1; /* spec minimum; geometryStreams off */
+      p->maxTransformFeedbackBuffers = 4;
+      p->maxTransformFeedbackBufferSize = UINT32_MAX;
+      p->maxTransformFeedbackStreamDataSize = 2048;
+      p->maxTransformFeedbackBufferDataSize = 512;
+      p->maxTransformFeedbackBufferDataStride = 2048;
+      p->transformFeedbackQueries = false;
+      p->transformFeedbackStreamsLinesTriangles = false;
+      p->transformFeedbackRasterizationStreamSelect = false;
+      p->transformFeedbackDraw = false;
+      break;
+   }
    /* More property structs */
    default:
       break;
@@ -299,6 +313,7 @@ nvk_get_device_extensions(const struct nv_device_info *dev,
       .EXT_robustness2 = true,
       .EXT_sample_locations = dev->cls_eng3d >= MAXWELL_B,
       .EXT_separate_stencil_usage = true,
+      .EXT_transform_feedback = dev->cls_eng3d >= TURING_A,
       .EXT_vertex_attribute_divisor = true,
       .EXT_vertex_input_dynamic_state = true,
    };
@@ -466,6 +481,10 @@ nvk_get_device_features(const struct nv_device_info *dev,
       .robustImageAccess2 = true,
       .nullDescriptor = true,
 
+      /* VK_EXT_transform_feedback */
+      .transformFeedback = dev->cls_eng3d >= TURING_A,
+      .geometryStreams = false,
+
       /* VK_EXT_vertex_attribute_divisor */
       .vertexAttributeInstanceRateDivisor = true,
       .vertexAttributeInstanceRateZeroDivisor = true,
diff --git a/src/nouveau/vulkan/nvk_pipeline.c b/src/nouveau/vulkan/nvk_pipeline.c
index 33561428996..08bba2d2287 100644
--- a/src/nouveau/vulkan/nvk_pipeline.c
+++ b/src/nouveau/vulkan/nvk_pipeline.c
@@ -34,6 +34,7 @@ nvk_pipeline_free(struct nvk_device *device,
                            pipeline->shaders[s].upload_addr,
                            pipeline->shaders[s].upload_size);
       }
+      free(pipeline->shaders[s].xfb);
    }
 
    vk_object_free(&device->vk, pAllocator, pipeline);
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index 687c69017ef..1c682bda1f0 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -11,6 +11,7 @@
 #include "nir.h"
 #include "nir_builder.h"
 #include "compiler/spirv/nir_spirv.h"
+#include "compiler/nir/nir_xfb_info.h"
 
 #include "nv50_ir_driver.h"
 
@@ -67,6 +68,7 @@ nvk_physical_device_spirv_options(const struct nvk_physical_device *pdevice,
       .draw_parameters = true,
       .image_write_without_format = true,
       .physical_storage_buffer_address = true,
+      .transform_feedback = true,
    },
    .ssbo_addr_format = nvk_buffer_addr_format(rs->storage_buffers),
    .phys_ssbo_addr_format = nir_address_format_64bit_global,
@@ -754,6 +756,65 @@ nvk_fs_gen_header(struct nvk_shader *fs, struct nv50_ir_prog_info_out *info)
    return 0;
 }
 
+static uint8_t find_register_index_for_xfb_output(const struct nir_shader *nir,
+                                                  nir_xfb_output_info output)
+{
+   nir_foreach_shader_out_variable(var, nir) {
+      uint32_t slots = glsl_count_vec4_slots(var->type, false, false);
+      for (uint32_t i = 0; i < slots; ++i) {
+         if (output.location == (var->data.location+i)) {
+            return var->data.driver_location+i;
+         }
+      }
+   }
+   // should not be reached
+   return 0;
+}
+
+static struct nvk_transform_feedback_state *
+nvk_fill_transform_feedback_state(struct nir_shader *nir,
+                                  const struct nv50_ir_prog_info_out *info)
+{
+   const uint8_t max_buffers = 4;
+   const uint8_t dw_bytes = 4;
+   const struct nir_xfb_info *nx = nir->xfb_info;
+   //nir_print_xfb_info(nx, stdout);
+
+   struct nvk_transform_feedback_state *xfb =
+      malloc(sizeof(struct nvk_transform_feedback_state));
+
+   if (!xfb)
+      return NULL;
+
+   for (uint8_t b = 0; b < max_buffers; ++b) {
+      /* buffers_written is a bitmask, not a count */
+      xfb->stride[b] = (nx->buffers_written & (1u << b)) ? nx->buffers[b].stride : 0;
+      xfb->varying_count[b] = 0;
+      xfb->stream[b] = nx->buffer_to_stream[b];
+   }
+   memset(xfb->varying_index, 0xff, sizeof(xfb->varying_index)); /* = skip */
+
+   for (uint32_t i = 0; i < nx->output_count; ++i) {
+      const nir_xfb_output_info output = nx->outputs[i];
+      const uint8_t b = output.buffer;
+      const uint8_t r = find_register_index_for_xfb_output(nir, output);
+      uint32_t p = output.offset / dw_bytes;
+
+      assert(r < info->numOutputs && p < ARRAY_SIZE(xfb->varying_index[b]));
+
+      u_foreach_bit(c, nx->outputs[i].component_mask)
+         xfb->varying_index[b][p++] = info->out[r].slot[c];
+
+      xfb->varying_count[b] = MAX2(xfb->varying_count[b], p);
+   }
+
+   /* zero unused indices */
+   for (uint8_t b = 0; b < 4; ++b)
+      for (uint32_t c = xfb->varying_count[b]; c & 3; ++c)
+         xfb->varying_index[b][c] = 0;
+
+   return xfb;
+}
+
 VkResult
 nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
                 const struct nvk_fs_key *fs_key,
@@ -840,6 +901,13 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
    if (info_out.io.fp64)
       shader->hdr[0] |= 1 << 27;
 
+   if (nir->xfb_info) {
+      shader->xfb = nvk_fill_transform_feedback_state(nir, &info_out);
+      if (shader->xfb == NULL) {
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+   }
+
    return VK_SUCCESS;
 }
 
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index df35039a23a..8ef98a7d231 100644
--- a/src/nouveau/vulkan/nvk_shader.h
+++ b/src/nouveau/vulkan/nvk_shader.h
@@ -21,6 +21,13 @@ struct nvk_fs_key {
    bool force_per_sample;
 };
 
+struct nvk_transform_feedback_state {
+   uint32_t stride[4];
+   uint8_t stream[4];
+   uint8_t varying_count[4];
+   uint8_t varying_index[4][128];
+};
+
 struct nvk_shader {
    gl_shader_stage stage;
 
@@ -66,6 +73,8 @@ struct nvk_shader {
       uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
       uint32_t block_size[3];
    } cp;
+
+   struct nvk_transform_feedback_state *xfb;
 };
 
 static inline uint64_t