diff --git a/.pick_status.json b/.pick_status.json index c860057e943..1b156d68d06 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -224,7 +224,7 @@ "description": "panvk/csf: Fix instance attribute offseting", "nominated": true, "nomination_type": 2, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "1570f0172e02aa660bb0b2619d67c1cff52b3914", "notes": null diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 0e70c89536d..a114f6b0303 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -60,6 +60,12 @@ emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf, pan_pack(desc, ATTRIBUTE, cfg) { cfg.offset = attrib_info->offset; + + if (per_instance) { + cfg.offset += + cmdbuf->state.gfx.sysvals.vs.base_instance * buf_info->stride; + } + cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw; cfg.table = 0; cfg.buffer_index = buf_idx; @@ -115,8 +121,19 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf) cmdbuf->vk.dynamic_graphics_state.vi; uint32_t vb_count = 0; - u_foreach_bit(i, vi->attributes_valid) + cmdbuf->state.gfx.vi.attribs_changing_on_base_instance = 0; + u_foreach_bit(i, vi->attributes_valid) { + const struct vk_vertex_binding_state *binding = + &vi->bindings[vi->attributes[i].binding]; + + if (binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE && + binding->stride != 0) { + cmdbuf->state.gfx.vi.attribs_changing_on_base_instance |= + BITFIELD_BIT(i); + } + vb_count = MAX2(vi->attributes[i].binding + 1, vb_count); + } uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1; uint32_t desc_count = vb_offset + vb_count; @@ -1900,6 +1917,14 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) if (result != VK_SUCCESS) return result; + /* Changes to base_instance will modify the offset of per-instance + * attributes, so we manually invalidate the VI state to trigger a + * new attribute table generation in that case. */ + if ((draw->instance.base != cmdbuf->state.gfx.sysvals.vs.base_instance || + draw->indirect.buffer_dev_addr) && + cmdbuf->state.gfx.vi.attribs_changing_on_base_instance != 0) + BITSET_SET(cmdbuf->vk.dynamic_graphics_state.dirty, MESA_VK_DYNAMIC_VI); + panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw); result = prepare_push_uniforms(cmdbuf); @@ -2164,7 +2189,13 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, if (result != VK_SUCCESS) return; + uint32_t patch_attribs = + cmdbuf->state.gfx.vi.attribs_changing_on_base_instance; struct cs_index draw_params_addr = cs_scratch_reg64(b, 0); + struct cs_index vs_drv_set = cs_scratch_reg64(b, 2); + struct cs_index attrib_offset = cs_scratch_reg32(b, 4); + struct cs_index multiplicand = cs_scratch_reg32(b, 5); + cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr); cs_update_vt_ctx(b) { @@ -2200,6 +2231,60 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, cs_wait_slot(b, SB_ID(LS), false); } + if (patch_attribs != 0) { + struct panvk_shader_desc_state *vs_desc_state = + &cmdbuf->state.gfx.vs.desc; + const struct vk_vertex_input_state *vi = + cmdbuf->vk.dynamic_graphics_state.vi; + + cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr); + + /* If firstInstance=0, skip the offset adjustment. */ + cs_if(b, MALI_CS_CONDITION_NEQUAL, + cs_sr_reg32(b, IDVS, INSTANCE_OFFSET)) { + u_foreach_bit(i, patch_attribs) { + const struct vk_vertex_attribute_state *attrib_info = + &vi->attributes[i]; + const struct vk_vertex_binding_state *binding = + &vi->bindings[attrib_info->binding]; + + cs_load32_to(b, attrib_offset, vs_drv_set, + pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t))); + cs_wait_slot(b, SB_ID(LS), false); + + /* Emulated immediate multiply: we walk the bits in + * base_instance, and accumulate (stride << bit_pos) if the bit + * is present. This is sub-optimal, but it's simple :-). */ + cs_add32(b, multiplicand, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), 0); + for (uint32_t i = 31; i > 0; i--) { + uint32_t add = binding->stride << i; + + /* bit31 is the sign bit, so we don't need to subtract to + * check the presence of the bit. */ + if (i < 31) + cs_add32(b, multiplicand, multiplicand, -(1 << i)); + + if (add) { + cs_if(b, MALI_CS_CONDITION_LESS, multiplicand) + cs_add32(b, multiplicand, multiplicand, 1 << i); + cs_else(b) + cs_add32(b, attrib_offset, attrib_offset, add); + } else { + cs_if(b, MALI_CS_CONDITION_LESS, multiplicand) + cs_add32(b, multiplicand, multiplicand, 1 << i); + } + } + + cs_if(b, MALI_CS_CONDITION_NEQUAL, multiplicand) + cs_add32(b, attrib_offset, attrib_offset, binding->stride); + + cs_store32(b, attrib_offset, vs_drv_set, + pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t))); + cs_wait_slot(b, SB_ID(LS), false); + } + } + } + /* NIR expects zero-based instance ID, but even if it did have an intrinsic to * load the absolute instance ID, we'd want to keep it zero-based to work around * Mali's limitation on non-zero firstInstance when a instance divisor is used. diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index aae044e48f0..f7fc384753b 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -146,6 +146,12 @@ struct panvk_cmd_graphics_state { unsigned count; } vb; +#if PAN_ARCH >= 10 + struct { + uint32_t attribs_changing_on_base_instance; + } vi; +#endif + /* Index buffer */ struct { struct panvk_buffer *buffer; diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index bf1d9e755a6..bf9a3214ab5 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -203,9 +203,7 @@ panvk_lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, PAN_ARCH <= 7 ? nir_load_raw_vertex_id_pan(b) : nir_load_vertex_id(b), - PAN_ARCH >= 9 ? - nir_iadd(b, nir_load_instance_id(b), nir_load_base_instance(b)) : - nir_load_instance_id(b), + nir_load_instance_id(b), nir_get_io_offset_src(intrin)->ssa, .base = nir_intrinsic_base(intrin), .component = nir_intrinsic_component(intrin),