diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 567aab61da4..e287afd1b4f 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1534,6 +1534,13 @@ agx_nir_prefix_sum_gs(nir_builder *b, const void *data) nir_channel(b, nir_load_workgroup_id(b), 0)); } +void +agx_nir_prefix_sum_tess(nir_builder *b, const void *data) +{ + b->shader->info.workgroup_size[0] = 1024; + libagx_prefix_sum_tess(b, nir_load_preamble(b, 1, 64, .base = 0)); +} + void agx_nir_gs_setup_indirect(nir_builder *b, const void *data) { @@ -1564,3 +1571,37 @@ agx_nir_unroll_restart(nir_builder *b, const void *data) else unreachable("invalid index size"); } + +void +agx_nir_tessellate(nir_builder *b, const void *data) +{ + const struct agx_tessellator_key *key = data; + b->shader->info.workgroup_size[0] = 64; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *patch = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *mode = nir_imm_int(b, key->mode); + nir_def *partitioning = nir_imm_int(b, key->partitioning); + nir_def *output_prim = nir_imm_int(b, key->output_primitive); + + if (key->prim == TESS_PRIMITIVE_ISOLINES) + libagx_tess_isoline(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_TRIANGLES) + libagx_tess_tri(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_QUADS) + libagx_tess_quad(b, params, mode, partitioning, output_prim, patch); + else + unreachable("invalid tess primitive"); +} + +void +agx_nir_tess_setup_indirect(nir_builder *b, const void *data) +{ + const struct agx_tess_setup_indirect_key *key = data; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *with_counts = nir_imm_bool(b, key->with_counts); + nir_def *point_mode = nir_imm_bool(b, key->point_mode); + + libagx_tess_setup_indirect(b, params, with_counts, point_mode); +} diff --git a/src/asahi/lib/agx_nir_lower_gs.h 
b/src/asahi/lib/agx_nir_lower_gs.h index f73084552f0..74fca3849ed 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -7,6 +7,7 @@ #include #include +#include "shaders/tessellator.h" #include "nir.h" #include "shader_enums.h" @@ -39,6 +40,8 @@ bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx, void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data); +void agx_nir_prefix_sum_tess(struct nir_builder *b, const void *data); + struct agx_gs_setup_indirect_key { enum mesa_prim prim; }; @@ -52,6 +55,23 @@ struct agx_unroll_restart_key { void agx_nir_unroll_restart(struct nir_builder *b, const void *key); +struct agx_tessellator_key { + enum tess_primitive_mode prim : 8; + enum libagx_tess_output_primitive output_primitive : 8; + enum libagx_tess_partitioning partitioning : 8; + enum libagx_tess_mode mode : 8; +}; +static_assert(sizeof(struct agx_tessellator_key) == 4, "padded"); + +struct agx_tess_setup_indirect_key { + bool point_mode; + bool with_counts; + bool padding[2]; +}; +static_assert(sizeof(struct agx_tess_setup_indirect_key) == 4, "padded"); + +void agx_nir_tessellate(struct nir_builder *b, const void *key); + bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *libagx); bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); @@ -59,3 +79,5 @@ bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir); unsigned agx_tcs_output_stride(const struct nir_shader *nir); + +void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index 0d78932adc4..ced44d9f679 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -42,6 +42,7 @@ libagx_shader_files = files( 'shaders/query.h', 'shaders/tessellation.cl', 'shaders/tessellator.cl', + 'shaders/tessellator.h', 
'shaders/texture.cl', 'shaders/helper.cl', ) diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index ec4e0f970e0..88468ef9cb2 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "shaders/tessellator.h" #include "geometry.h" /* Compatible with util/u_math.h */ @@ -282,6 +283,17 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch) return (first_group * 32) + off; } +/* + * Allocate memory from the heap (thread-safe). Returns the offset into the + * heap. The allocation will be word-aligned. + */ +static inline uint +libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + /* * When unrolling the index buffer for a draw, we translate the old indirect * draws to new indirect draws. This routine allocates the new index buffer and @@ -300,12 +312,11 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p, uint max_verts = max_prims * mesa_vertices_per_prim(mode); uint alloc_size = max_verts * index_size_B; - /* Allocate memory from the heap for the unrolled index buffer. Use an atomic - * since multiple threads may be running to handle multidraw in parallel. + /* Allocate unrolled index buffer. Atomic since multiple threads may be + * running to handle multidraw in parallel. 
*/ global struct agx_geometry_state *heap = p->heap; - uint old_heap_bottom_B = atomic_fetch_add( - (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4)); + uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size); /* Regardless of the input stride, we use tightly packed output draws */ global uint *out = &p->out_draws[5 * draw]; @@ -612,6 +623,40 @@ libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word) } } +kernel void +libagx_prefix_sum_tess(global struct libagx_tess_args *p) +{ + libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */); + + /* After prefix summing, we know the total # of indices, so allocate the + * index buffer now. Elect a thread for the allocation. + */ + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) != 0) + return; + + /* The last element of an inclusive prefix sum is the total sum */ + uint total = p->counts[p->nr_patches - 1]; + + /* Allocate 4-byte indices */ + uint32_t elsize_B = sizeof(uint32_t); + uint32_t size_B = total * elsize_B; + uint alloc_B = p->heap->heap_bottom; + p->heap->heap_bottom += size_B; + p->heap->heap_bottom = align(p->heap->heap_bottom, 8); + + p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B); + + /* ...and now we can generate the API indexed draw */ + global uint32_t *desc = p->out_draws; + + desc[0] = total; /* count */ + desc[1] = 1; /* instance_count */ + desc[2] = alloc_B / elsize_B; /* start */ + desc[3] = 0; /* index_bias */ + desc[4] = 0; /* start_instance */ +} + uintptr_t libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, gl_varying_slot location) diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index 745d17d6b26..b4a2b44f7d4 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -203,47 +203,6 @@ struct agx_geometry_params { } PACKED; AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 78 * 4); -struct agx_tess_params { - /* 
Persistent (cross-draw) geometry state */ - GLOBAL(struct agx_geometry_state) state; - - /* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */ - GLOBAL(uint) patch_coord_offs; - - /* Patch coordinate buffer, indexed as: - * - * patch_coord_offs[patch_ID] + vertex_in_patch - * - * Currently float2s, but we might be able to compact later? - */ - GLOBAL(float2) patch_coord_buffer; - - /* Tessellation control shader output buffer, indexed by patch ID. */ - GLOBAL(uchar) tcs_buffer; - - /* Bitfield of TCS per-vertex outputs */ - uint64_t tcs_per_vertex_outputs; - - /* Default tess levels used in OpenGL when there is no TCS in the pipeline. - * Unused in Vulkan and OpenGL ES. - */ - float tess_level_outer_default[4]; - float tess_level_inner_default[4]; - - /* Number of vertices in the input patch */ - uint input_patch_size; - - /* Number of vertices in the TCS output patch */ - uint output_patch_size; - - /* Number of patch constants written by TCS */ - uint tcs_patch_constants; - - /* Number of input patches per instance of the VS/TCS */ - uint patches_per_instance; -} PACKED; -AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4); - /* TCS shared memory layout: * * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS]; diff --git a/src/asahi/lib/shaders/tessellation.cl b/src/asahi/lib/shaders/tessellation.cl index c4d549b9ff7..5dfea8345a1 100644 --- a/src/asahi/lib/shaders/tessellation.cl +++ b/src/asahi/lib/shaders/tessellation.cl @@ -4,15 +4,17 @@ */ #include "geometry.h" +#include "tessellator.h" +#include uint -libagx_tcs_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p) { return p->input_patch_size; } uint -libagx_tes_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p) { return p->output_patch_size; } @@ -25,7 +27,7 @@ libagx_tcs_in_offset(uint vtx, gl_varying_slot location, } uintptr_t 
-libagx_tcs_out_address(constant struct agx_tess_params *p, uint patch_id, +libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id, uint vtx_id, gl_varying_slot location, uint nr_patch_out, uint out_patch_size, uint64_t vtx_out_mask) { @@ -45,7 +47,7 @@ libagx_tes_unrolled_patch_id(uint raw_id) } uint -libagx_tes_patch_id(constant struct agx_tess_params *p, uint raw_id) +libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id) { return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance; } @@ -57,16 +59,20 @@ tes_vertex_id_in_patch(uint raw_id) } float2 -libagx_load_tess_coord(constant struct agx_tess_params *p, uint raw_id) +libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id) { uint patch = libagx_tes_unrolled_patch_id(raw_id); uint vtx = tes_vertex_id_in_patch(raw_id); - return p->patch_coord_buffer[p->patch_coord_offs[patch] + vtx]; + global struct libagx_tess_point *t = + &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; + + /* Written weirdly because NIR struggles with loads of structs */ + return *((global float2 *)t); } uintptr_t -libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, +libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id, uint vtx_id, gl_varying_slot location) { uint patch = libagx_tes_unrolled_patch_id(raw_id); @@ -77,7 +83,7 @@ libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, } float4 -libagx_tess_level_outer_default(constant struct agx_tess_params *p) +libagx_tess_level_outer_default(constant struct libagx_tess_args *p) { return ( float4)(p->tess_level_outer_default[0], p->tess_level_outer_default[1], @@ -85,8 +91,98 @@ libagx_tess_level_outer_default(constant struct agx_tess_params *p) } float2 -libagx_tess_level_inner_default(constant struct agx_tess_params *p) +libagx_tess_level_inner_default(constant struct libagx_tess_args *p) { return (float2)(p->tess_level_inner_default[0], p->tess_level_inner_default[1]); } + 
+void +libagx_tess_setup_indirect(global struct libagx_tess_args *p, bool with_counts, + bool point_mode) +{ + uint count = p->indirect[0], instance_count = p->indirect[1]; + unsigned in_patches = count / p->input_patch_size; + + /* TCS invocation counter increments once per-patch */ + if (p->tcs_statistic) { + *(p->tcs_statistic) += in_patches; + } + + size_t draw_stride = + ((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t); + + unsigned unrolled_patches = in_patches * instance_count; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * p->tcs_stride_el * 4; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint vb_offs = alloc; + uint vb_size = libagx_tcs_in_size(count * instance_count, p->vertex_outputs); + alloc += vb_size; + + /* Allocate all patch calculations in one go */ + global uchar *blob = p->heap->heap + p->heap->heap_bottom; + p->heap->heap_bottom += alloc; + + p->tcs_buffer = (global float *)(blob + tcs_out_offs); + p->patches_per_instance = in_patches; + p->coord_allocs = (global uint *)(blob + patch_coord_offs); + p->nr_patches = unrolled_patches; + + *(p->vertex_output_buffer_ptr) = (uintptr_t)(blob + vb_offs); + + if (with_counts) { + p->counts = (global uint32_t *)(blob + count_offs); + } else { +#if 0 + /* Arrange so we return after all generated draws. agx_pack would be nicer + * here but designated initializers lead to scratch access... 
+ */ + global uint32_t *ret = + (global uint32_t *)(blob + draw_offs + + (draw_stride * unrolled_patches)); + + *ret = (AGX_VDM_BLOCK_TYPE_BARRIER << 29) | /* with return */ (1u << 27); +#endif + /* TODO */ + } + + /* VS grid size */ + p->grids[0] = count; + p->grids[1] = instance_count; + p->grids[2] = 1; + + /* VS workgroup size */ + p->grids[3] = 64; + p->grids[4] = 1; + p->grids[5] = 1; + + /* TCS grid size */ + p->grids[6] = in_patches * p->output_patch_size; + p->grids[7] = instance_count; + p->grids[8] = 1; + + /* TCS workgroup size */ + p->grids[9] = p->output_patch_size; + p->grids[10] = 1; + p->grids[11] = 1; + + /* Tess grid size */ + p->grids[12] = unrolled_patches; + p->grids[13] = 1; + p->grids[14] = 1; + + /* Tess workgroup size */ + p->grids[15] = 64; + p->grids[16] = 1; + p->grids[17] = 1; +} diff --git a/src/asahi/lib/shaders/tessellator.cl b/src/asahi/lib/shaders/tessellator.cl index 0a1fe63e66a..82079242f47 100644 --- a/src/asahi/lib/shaders/tessellator.cl +++ b/src/asahi/lib/shaders/tessellator.cl @@ -1,8 +1,1748 @@ /* - * Copyright 2023 Alyssa Rosenzweig - * Copyright (c) Microsoft Corporation - * SPDX-License-Identifier: MIT - */ + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ #include "geometry.h" +#include "tessellator.h" +#include +#if 0 +#include +#include +#include +#include +#include "util/macros.h" +#define min(x, y) (x < y ? x : y) +#define max(x, y) (x > y ? x : y) +#define clz(x) (x ? __builtin_clz(x) : (8 * sizeof(x))) +#define clamp(x, y, z) (x < y ? y : x > z ? z : x) +#define align(x, y) ALIGN_POT(x, y) +#else +#define assert(x) +#endif + +#define LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f +#define LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f + +typedef unsigned int FXP; // fixed point number + +enum { + U = 0, // points on a tri patch + V = 1, +}; + +enum { + Ueq0 = 0, // edges on a tri patch + Veq0 = 1, + Weq0 = 2, +}; + +enum { + Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + Veq1 = 3, +}; + +#define QUAD_AXES 2 +#define QUAD_EDGES 4 +#define TRI_EDGES 3 + +// The interior can just use a simpler stitch. 
+typedef enum DIAGONALS { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED +} DIAGONALS; + +typedef struct TESS_FACTOR_CONTEXT { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; +} TESS_FACTOR_CONTEXT; + +struct INDEX_PATCH_CONTEXT { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; +}; + +struct INDEX_PATCH_CONTEXT2 { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; +}; + +struct CHWTessellator { + enum libagx_tess_output_primitive outputPrimitive; + enum libagx_tess_mode mode; + uint index_bias; + + // array where we will store u/v's for the points we generate + global struct libagx_tess_point *Point; + + // array where we will store index topology + global void *Index; + + // A second index patch we have to do handles the leftover strip of quads in + // the middle of an odd quad patch after finishing all the concentric rings. 
+ // This also handles the leftover strip of points in the middle of an even + // quad patch, when stitching the row of triangles up the left side (V major + // quad) or bottom (U major quad) of the inner ring + bool bUsingPatchedIndices; + bool bUsingPatchedIndices2; + struct INDEX_PATCH_CONTEXT IndexPatchCtx; + struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; +}; + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_ONE (1 << FXP_FRACTION_BITS) +#define FXP_ONE_THIRD 0x00005555 +#define FXP_TWO_THIRDS 0x0000aaaa +#define FXP_ONE_HALF 0x00008000 + +static global float * +tess_factors(constant struct libagx_tess_args *p, uint patch) +{ + return p->tcs_buffer + (patch * p->tcs_stride_el); +} + +static inline uint +libagx_heap_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + // TODO: drop align to 4 I think + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + +/* + * Generate an indexed draw for a patch with the computed number of indices. + * This allocates heap memory for the index buffer, returning the allocated + * memory. + */ +static global void * +libagx_draw(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, + bool lines, uint patch, uint count) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = count; + } + + if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t elsize_B = sizeof(uint16_t); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint64_t ib = ((uintptr_t)p->heap->heap) + alloc_B; + + global uint32_t *desc = p->out_draws + (patch * 6); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = (ib >> 32); + cfg.primitive = lines ? 
AGX_PRIMITIVE_LINES : AGX_PRIMITIVE_TRIANGLES; + cfg.restart_enable = false; + cfg.index_size = AGX_INDEX_SIZE_U16; + cfg.index_buffer_size_present = true; + cfg.index_buffer_present = true; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & 0xffffffff; + } + + agx_pack(&desc[2], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[3], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[4], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + + agx_pack(&desc[5], INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = align(count * 2, 4); + } + + return (global void *)ib; + } + + if (mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + /* The index buffer is already allocated, get a pointer inside it. + * p->counts has had an inclusive prefix sum hence the subtraction. + */ + uint offset_el = p->counts[sub_sat(patch, 1u)]; + if (patch == 0) + offset_el = 0; + + return &p->index_buffer[offset_el]; + } + + return NULL; +} + +static void +libagx_draw_points(private struct CHWTessellator *ctx, + constant struct libagx_tess_args *p, uint patch, uint count) +{ + if (ctx->mode == LIBAGX_TESS_MODE_VDM) { + /* Generate a non-indexed draw for points mode tessellation. 
*/ + global uint32_t *desc = p->out_draws + (patch * 4); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = 0; + cfg.primitive = AGX_PRIMITIVE_POINTS; + cfg.restart_enable = false; + cfg.index_size = 0; + cfg.index_buffer_size_present = false; + cfg.index_buffer_present = false; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[2], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[3], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + } else { + /* For points mode with a single draw, we need to generate a trivial index + * buffer to stuff in the patch ID in the right place. + */ + global uint32_t *indices = libagx_draw(p, ctx->mode, false, patch, count); + + if (ctx->mode == LIBAGX_TESS_MODE_COUNT) + return; + + for (int i = 0; i < count; ++i) { + indices[i] = ctx->index_bias + i; + } + } +} + +static void +libagx_draw_empty(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_output_primitive output_primitive, + uint patch) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = 0; + } else if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t words = (output_primitive == LIBAGX_TESS_OUTPUT_POINT) ? 4 : 6; + global uint32_t *desc = p->out_draws + (patch * words); + uint32_t nop_token = AGX_VDM_BLOCK_TYPE_BARRIER << 29; + + for (uint32_t i = 0; i < words; ++i) { + desc[i] = nop_token; + } + } +} + +/* + * Allocate heap memory for domain points for a patch. The allocation + * is recorded in the coord_allocs[] array, which is in elements. 
+ */ +static global struct libagx_tess_point * +libagx_heap_alloc_points(constant struct libagx_tess_args *p, uint patch, + uint count) +{ + /* If we're recording statistics, increment now. The statistic is for + * tessellation evaluation shader invocations, which is equal to the number + * of domain points generated. + */ + if (p->statistic) { + atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); + } + + uint32_t elsize_B = sizeof(struct libagx_tess_point); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint32_t alloc_el = alloc_B / elsize_B; + + p->coord_allocs[patch] = alloc_el; + return (global struct libagx_tess_point *)(((uintptr_t)p->heap->heap) + + alloc_B); +} + +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com + +#define LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR 1 +#define LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR 63 +#define LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 +#define LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 + +// 2^(-16), min positive fixed point fraction +#define EPSILON 0.0000152587890625f +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ + (LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) + +static float clamp_factor(float factor, + enum libagx_tess_partitioning partitioning, + float maxf) +{ + float lower = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) + ? LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR + : LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR; + + float upper = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD) + ? LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR + : LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR; + + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. 
/* n.16 unsigned fixed-point helpers (D3D11 reference tessellator).
 * FXP and its constants mirror the file-wide definitions so this unit
 * stands alone; the redefinitions are identical and therefore legal.
 */
typedef unsigned int FXP;

#define FXP_FRACTION_BITS 16
#define FXP_FRACTION_MASK 0x0000ffff
#define FXP_INTEGER_MASK 0x7fff0000
#define FXP_ONE (1 << FXP_FRACTION_BITS)

/* Convert a float to n.16 fixed point, rounding to nearest. */
static FXP
floatToFixed(const float input)
{
   return (FXP)(input * FXP_ONE + 0.5f);
}

/* Convert n.16 fixed point back to float. Exact for our bounded range. */
static float
fixedToFloat(const FXP input)
{
   return ((float)input) / FXP_ONE;
}

/* True when the integer part of the float is odd. */
static bool
isOdd(const float input)
{
   return ((int)input) & 1;
}

/* Round an n.16 value up to the next integer. */
static FXP
fxpCeil(const FXP input)
{
   if (input & FXP_FRACTION_MASK)
      return (input & FXP_INTEGER_MASK) + FXP_ONE;

   return input;
}

/* Truncate an n.16 value down to its integer part. */
static FXP
fxpFloor(const FXP input)
{
   return input & FXP_INTEGER_MASK;
}
+static void +DefinePoint(global struct libagx_tess_point *out, FXP fxpU, FXP fxpV) +{ + out->u = fixedToFloat(fxpU); + out->v = fixedToFloat(fxpV); +} + +static void +DefineIndex(private struct CHWTessellator *ctx, int index, + int indexStorageOffset) +{ + int patched = PatchIndexValue(ctx, index); + + if (ctx->mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + global uint32_t *indices = (global uint32_t *)ctx->Index; + indices[indexStorageOffset] = ctx->index_bias + patched; + } else { + global uint16_t *indices = (global uint16_t *)ctx->Index; + indices[indexStorageOffset] = patched; + } +} + +static void +DefineClockwiseTriangle(private struct CHWTessellator *ctx, int index0, + int index1, int index2, int indexStorageBaseOffset) +{ + // inputs a clockwise triangle, stores a CW or CCW triangle per state state + bool cw = ctx->outputPrimitive == LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + DefineIndex(ctx, index0, indexStorageBaseOffset); + DefineIndex(ctx, cw ? index1 : index2, indexStorageBaseOffset + 1); + DefineIndex(ctx, cw ? index2 : index1, indexStorageBaseOffset + 2); +} + +static uint32_t +RemoveMSB(uint32_t val) +{ + uint32_t bit = val ? (1 << (31 - clz(val))) : 0; + return val & ~bit; +} + +static int +NumPointsForTessFactor(bool odd, FXP fxpTessFactor) +{ + // Add epsilon for rounding and add 1 for odd + FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1); + int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1); + return odd ? r : r + 1; +} + +static void +ComputeTessFactorCtx(bool odd, FXP fxpTessFactor, + private TESS_FACTOR_CONTEXT *TessFactorCtx) +{ + // fxpHalfTessFactor == 1/2 if TessFactor is 1, + // but we're pretending we are even. 
/* Mirrors the file-wide fixed-point definitions so this unit stands alone. */
typedef unsigned int FXP;

#define FXP_FRACTION_BITS 16
#define FXP_ONE (1 << FXP_FRACTION_BITS)
#define FXP_ONE_HALF 0x00008000

typedef struct TESS_FACTOR_CONTEXT {
   FXP fxpInvNumSegmentsOnFloorTessFactor;
   FXP fxpInvNumSegmentsOnCeilTessFactor;
   FXP fxpHalfTessFactorFraction;
   int numHalfTessFactorPoints;
   int splitPointOnFloorHalfTessFactor;
} TESS_FACTOR_CONTEXT;

/*
 * Map a point index along one half-edge to its n.16 parametric location,
 * lerping between the floor and ceil half-tess-factor placements. The
 * second half of the edge mirrors the first (bFlip).
 */
static FXP
PlacePointIn1D(const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, int point)
{
   bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints;

   if (bFlip) {
      point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd;
   }

   /* Special-case the middle: 16-bit fixed math below can't hit 0.5 exactly */
   if (point == TessFactorCtx->numHalfTessFactorPoints)
      return FXP_ONE_HALF;

   unsigned int indexOnCeilHalfTessFactor = point;
   unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor;
   if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) {
      indexOnFloorHalfTessFactor -= 1;
   }

   /* Each product fits in 16 bits: locations on the half tess factor are at
    * most half the segment count, so index * (1/segments) <= 0.5 in n.16.
    */
   FXP fxpLocationOnFloorHalfTessFactor =
      indexOnFloorHalfTessFactor *
      TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor;
   FXP fxpLocationOnCeilHalfTessFactor =
      indexOnCeilHalfTessFactor *
      TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor;

   /* Lerp between the floor and ceil placements. Both operands are <= 0.5
    * in n.16, so the 32-bit intermediate cannot overflow before the shift.
    */
   FXP fxpLocation =
      fxpLocationOnFloorHalfTessFactor *
         (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) +
      fxpLocationOnCeilHalfTessFactor *
         (TessFactorCtx->fxpHalfTessFactorFraction);

   /* Round and shift back down to n.16 */
   fxpLocation = (fxpLocation + FXP_ONE_HALF) >> FXP_FRACTION_BITS;

   return bFlip ? (FXP_ONE - fxpLocation) : fxpLocation;
}
DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + p += 2; + + // Second half + for (; p < numInsideEdgePoints; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of + // inside edge + for (p = 0; p < numInsideEdgePoints / 2; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of + // outside edge + for (; p < numInsideEdgePoints - 1; p++) { + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + } + if (bTrapezoid) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + } +} + +// loop_start and loop_end give optimal loop bounds for +// the stitching algorithm further below, for any given halfTssFactor. There +// is probably a better way to encode this... 
+// +// Return the FIRST entry in finalPointPositionTable awhich is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_start(int N) +{ + if (N < 2) + return 1; + else if (N == 2) + return 17; + else if (N < 5) + return 9; + else if (N < 9) + return 5; + else if (N < 17) + return 3; + else + return 2; +} + +// Return the LAST entry in finalPointPositionTable[] which is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_end(int N) +{ + if (N < 2) + return 0; + else if (N < 4) + return 17; + else if (N < 8) + return 25; + else if (N < 16) + return 29; + else if (N < 32) + return 31; + else + return 32; +} + +// Tables to assist in the stitching of 2 rows of points having arbitrary +// TessFactors. The stitching order is governed by Ruler Function vertex +// split ordering (see external documentation). +// +// The contents of the finalPointPositionTable are where vertex i [0..33] +// ends up on the half-edge at the max tessellation amount given +// ruler-function split order. Recall the other half of an edge is mirrored, +// so we only need to deal with one half. This table is used to decide when +// to advance a point on the interior or exterior. It supports odd TessFactor +// up to 65 and even TessFactor up to 64. + +/* TODO: Is this actually faster than a LUT? */ +static uint32_t +finalPointPositionTable(uint32_t x) +{ + if (x == 0) + return 0; + if (x == 1) + return 0x20; + + uint32_t shift; + if ((x & 1) == 0) { + shift = 1; + } else if ((x & 3) == 3) { + shift = 2; + } else if ((x & 7) == 5) { + shift = 3; + } else if (x != 17) { + shift = 4; + } else { + shift = 5; + } + + // SWAR vectorized right-shift of (0x20, x) + // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`. 
+ uint32_t items_to_shift = x | (0x20 << 16); + uint32_t shifted = items_to_shift >> shift; + + uint32_t bias = min(0xfu, shifted >> 16); + return bias + (shifted & 0xffff); +} + +static void +StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset, + int insideEdgePointBaseOffset, + int insideNumHalfTessFactorPoints, + bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset, + int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd) +{ + if (insideEdgeTessFactorOdd) { + insideNumHalfTessFactorPoints -= 1; + } + if (outsideTessFactorOdd) { + outsideNumHalfTessFactorPoints -= 1; + } + // Walk first half + int outsidePoint = outsideEdgePointBaseOffset; + int insidePoint = insideEdgePointBaseOffset; + + // iStart,iEnd are a small optimization so the loop below doesn't have to go + // from 0 up to 31 + int iStart = min(loop_start(insideNumHalfTessFactorPoints), + loop_start(outsideNumHalfTessFactorPoints)); + int iEnd = loop_end( + max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints)); + + // since we don't start the loop at 0 below, we need a special case. 
   if (0 < outsideNumHalfTessFactorPoints) {
      // Advance outside
      DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint,
                              baseIndexOffset);
      baseIndexOffset += 3;
      outsidePoint++;
   }

   // First half: advance inside and/or outside whenever the table entry is
   // within the respective half-factor point count.
   for (int i = iStart; i <= iEnd; i++) {
      int bound = finalPointPositionTable(i);

      if (bound < insideNumHalfTessFactorPoints) {
         // Advance inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
      if (bound < outsideNumHalfTessFactorPoints) {
         // Advance outside
         DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1,
                                 insidePoint, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      }
   }

   // Middle of the edge: shape depends on the odd/even parity of the two
   // TessFactors (quad when both odd, single triangle when they differ).
   if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) ||
       insideEdgeTessFactorOdd) {
      if (insideEdgeTessFactorOdd == outsideTessFactorOdd) {
         // Quad in the middle
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         DefineClockwiseTriangle(ctx, insidePoint + 1, outsidePoint,
                                 outsidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
         outsidePoint++;
      } else if (!insideEdgeTessFactorOdd) {
         // Triangle pointing inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 outsidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      } else {
         // Triangle pointing outside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
   }

   // Walk second half.
   for (int i = iEnd; i >= iStart; i--) {
      int bound = finalPointPositionTable(i);

      if (bound < outsideNumHalfTessFactorPoints) {
         // Advance outside
         DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1,
                                 insidePoint, baseIndexOffset);
         baseIndexOffset += 3;
         outsidePoint++;
      }
      if (bound < insideNumHalfTessFactorPoints) {
         // Advance inside
         DefineClockwiseTriangle(ctx, insidePoint, outsidePoint,
                                 insidePoint + 1, baseIndexOffset);
         baseIndexOffset += 3;
         insidePoint++;
      }
   }
   // Below case is not needed if we didn't optimize loop above and made it run
   // from 31 down to 0.
   if (0 < outsideNumHalfTessFactorPoints) {
      DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint,
                              baseIndexOffset);
      baseIndexOffset += 3;
      outsidePoint++;
   }
}

// Tessellate one isoline patch: factors[0] is the line density (number of
// lines along V), factors[1] the line detail (points per line along U).
// In COUNT mode only the index count is recorded; otherwise points and
// (unless output is points) line-list indices are emitted.
void
libagx_tess_isoline(constant struct libagx_tess_args *p,
                    enum libagx_tess_mode mode,
                    enum libagx_tess_partitioning partitioning,
                    enum libagx_tess_output_primitive output_primitive,
                    uint patch)
{
   bool lineDensityOdd;
   bool lineDetailOdd;
   TESS_FACTOR_CONTEXT lineDensityTessFactorCtx;
   TESS_FACTOR_CONTEXT lineDetailTessFactorCtx;

   global float *factors = tess_factors(p, patch);
   float TessFactor_V_LineDensity = factors[0];
   float TessFactor_U_LineDetail = factors[1];

   // Is the patch culled? NaN will pass.
   if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) {
      libagx_draw_empty(p, mode, output_primitive, patch);
      return;
   }

   // Clamp edge TessFactors
   TessFactor_V_LineDensity =
      clamp(TessFactor_V_LineDensity,
            LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR,
            LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR);
   TessFactor_U_LineDetail =
      clamp_factor(TessFactor_U_LineDetail, partitioning, 0);

   // Process tessFactors
   if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) {
      lineDetailOdd = isOdd(TessFactor_U_LineDetail);
   } else {
      lineDetailOdd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD);
   }

   FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail);

   ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail,
                        &lineDetailTessFactorCtx);
   int numPointsPerLine =
      NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail);

   // Line density is always integer-partitioned, hence the ceil().
   TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity);
   lineDensityOdd = isOdd(TessFactor_V_LineDensity);
   FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity);
   ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity,
                        &lineDensityTessFactorCtx);

   // don't draw last line at V == 1.
   int numLines =
      NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1;

   /* Points */
   uint num_points = numPointsPerLine * numLines;
   if (mode != LIBAGX_TESS_MODE_COUNT) {
      global struct libagx_tess_point *points =
         libagx_heap_alloc_points(p, patch, num_points);

      for (int line = 0, pointOffset = 0; line < numLines; line++) {
         FXP fxpV =
            PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line);

         for (int point = 0; point < numPointsPerLine; point++) {
            FXP fxpU =
               PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point);

            DefinePoint(&points[pointOffset++], fxpU, fxpV);
         }
      }
   }

   struct CHWTessellator ctx;
   ctx.mode = mode;
   ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE;

   /* Connectivity: 2 indices per segment, (numPointsPerLine - 1) segments
    * per line, emitted as a line list. */
   if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) {
      uint num_indices = numLines * (numPointsPerLine - 1) * 2;
      ctx.Index = libagx_draw(p, mode, true, patch, num_indices);

      if (mode == LIBAGX_TESS_MODE_COUNT)
         return;

      for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines;
           line++) {
         pointOffset++;

         for (int point = 1; point < numPointsPerLine; point++) {
            DefineIndex(&ctx, pointOffset - 1, indexOffset++);
            DefineIndex(&ctx, pointOffset, indexOffset++);
            pointOffset++;
         }
      }
   } else {
      libagx_draw_points(&ctx, p, patch, num_points);
   }
}

// Tessellate one triangle patch: factors[0..2] are the outside edge
// TessFactors (Ueq0/Veq0/Weq0), factors[4] the inside factor.  In COUNT
// mode only sizes are recorded; otherwise barycentric points and triangle
// (or point) connectivity are emitted.
void
libagx_tess_tri(constant struct libagx_tess_args *p, enum libagx_tess_mode mode,
                enum libagx_tess_partitioning partitioning,
                enum libagx_tess_output_primitive output_primitive, uint patch)
{
   global float *factors = tess_factors(p, patch);
   float tessFactor_Ueq0 = factors[0];
   float tessFactor_Veq0 = factors[1];
   float tessFactor_Weq0 = factors[2];
   float insideTessFactor_f = factors[4];

   struct CHWTessellator ctx;
   ctx.outputPrimitive = output_primitive;
   ctx.Point = NULL;
   ctx.Index = NULL;
   ctx.mode = mode;
   ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE;
   ctx.bUsingPatchedIndices = false;
   ctx.bUsingPatchedIndices2 = false;

   // Is the patch culled? NaN will pass.
   if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) ||
       !(tessFactor_Weq0 > 0)) {

      libagx_draw_empty(p, mode, output_primitive, patch);

      return;
   }

   FXP outsideTessFactor[TRI_EDGES];
   FXP insideTessFactor;
   bool outsideTessFactorOdd[TRI_EDGES];
   bool insideTessFactorOdd;
   TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES];
   TESS_FACTOR_CONTEXT insideTessFactorCtx;
   // Stuff below is just specific to the traversal order
   // this code happens to use to generate points/lines
   int numPointsForOutsideEdge[TRI_EDGES];
   int numPointsForInsideTessFactor;
   int insideEdgePointBaseOffset;

   // Clamp TessFactors
   tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0);
   tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0);
   tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0);

   float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0);
   insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf);
   // Note the above clamps map NaN to the lower bound

   // Process tessFactors
   float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0,
                                           tessFactor_Weq0};
   if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) {
      for (int edge = 0; edge < TRI_EDGES; edge++) {
         outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]);
      }
      insideTessFactorOdd =
         isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f);
   } else {
      bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD);

      for (int edge = 0; edge < TRI_EDGES; edge++) {
         outsideTessFactorOdd[edge] = odd;
      }
      insideTessFactorOdd = odd;
   }

   // Save fixed point TessFactors
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]);
   }
   insideTessFactor = floatToFixed(insideTessFactor_f);

   if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) {
      // Special case if all TessFactors are 1
      if ((FXP_ONE == insideTessFactor) &&
          (FXP_ONE == outsideTessFactor[Ueq0]) &&
          (FXP_ONE == outsideTessFactor[Veq0]) &&
          (FXP_ONE == outsideTessFactor[Weq0])) {

         /* Just do minimum tess factor */
         if (mode == LIBAGX_TESS_MODE_COUNT) {
            p->counts[patch] = 3;
            return;
         }

         global struct libagx_tess_point *points =
            libagx_heap_alloc_points(p, patch, 3);

         DefinePoint(&points[0], 0,
                     FXP_ONE); // V=1 (beginning of Ueq0 edge VW)
         DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU)
         DefinePoint(&points[2], FXP_ONE,
                     0); // U=1 (beginning of Weq0 edge UV)

         if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) {
            ctx.Index = libagx_draw(p, mode, false, patch, 3);

            DefineClockwiseTriangle(&ctx, 0, 1, 2,
                                    /*indexStorageBaseOffset*/ 0);
         } else {
            libagx_draw_points(&ctx, p, patch, 3);
         }

         return;
      }
   }

   // Compute per-TessFactor metadata
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge],
                           &outsideTessFactorCtx[edge]);
   }
   ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor,
                        &insideTessFactorCtx);

   // Compute some initial data.
   int NumPoints = 0;

   // outside edge offsets and storage
   for (int edge = 0; edge < TRI_EDGES; edge++) {
      numPointsForOutsideEdge[edge] = NumPointsForTessFactor(
         outsideTessFactorOdd[edge], outsideTessFactor[edge]);
      NumPoints += numPointsForOutsideEdge[edge];
   }
   NumPoints -= 3; // each edge shares its endpoint with the next edge

   // inside edge offsets
   numPointsForInsideTessFactor =
      NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor);
   {
      int pointCountMin = insideTessFactorOdd ? 4 : 3;
      // max() allows degenerate transition regions when inside TessFactor == 1
      numPointsForInsideTessFactor =
         max(pointCountMin, numPointsForInsideTessFactor);
   }

   insideEdgePointBaseOffset = NumPoints;

   // inside storage, including interior edges above
   {
      int interiorRings = (numPointsForInsideTessFactor >> 1) - 1;
      int even = insideTessFactorOdd ? 0 : 1;
      NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even;
   }

   /* GENERATE POINTS */
   if (mode != LIBAGX_TESS_MODE_COUNT) {
      ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints);

      // Generate exterior ring edge points, clockwise starting from point V
      // (VW, the U==0 edge)
      int pointOffset = 0;
      for (int edge = 0; edge < TRI_EDGES; edge++) {
         int odd = edge & 0x1;
         int endPoint = numPointsForOutsideEdge[edge] - 1;
         // don't include end, since next edge starts with it.
         // NOTE(review): loop variable `p` shadows the `p` args parameter
         // here and in the ring loop below; inherited from the reference
         // tessellator, consider renaming.
         for (int p = 0; p < endPoint; p++, pointOffset++) {
            // whether to reverse point order given we are defining V or U (W
            // implicit): edge0, VW, has V decreasing, so reverse 1D points
            // below edge1, WU, has U increasing, so don't reverse 1D points
            // below edge2, UV, has U decreasing, so reverse 1D points below
            int q = odd ? p : endPoint - p;

            FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge],
                                          outsideTessFactorOdd[edge], q);
            if (edge == 0) {
               DefinePoint(&ctx.Point[pointOffset], 0, fxpParam);
            } else {
               DefinePoint(&ctx.Point[pointOffset], fxpParam,
                           (edge == 2) ? FXP_ONE - fxpParam : 0);
            }
         }
      }

      // Generate interior ring points, clockwise spiralling in
      int numRings = (numPointsForInsideTessFactor >> 1);
      for (int ring = 1; ring < numRings; ring++) {
         int startPoint = ring;
         int endPoint = numPointsForInsideTessFactor - 1 - startPoint;

         for (int edge = 0; edge < TRI_EDGES; edge++) {
            int odd = edge & 0x1;
            int perpendicularAxisPoint = startPoint;
            FXP fxpPerpParam =
               PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd,
                              perpendicularAxisPoint);

            // Map location to the right size in
            // barycentric space. We know this fixed
            // point math won't over/underflow
            fxpPerpParam *= FXP_TWO_THIRDS;
            fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >>
                           FXP_FRACTION_BITS; // get back to n.16

            // don't include end: next edge starts with it.
            for (int p = startPoint; p < endPoint; p++, pointOffset++) {
               // whether to reverse point given we are defining V or U (W
               // implicit): edge0, VW, has V decreasing, so reverse 1D points
               // below edge1, WU, has U increasing, so don't reverse 1D points
               // below edge2, UV, has U decreasing, so reverse 1D points below
               int q = odd ? p : endPoint - (p - startPoint);

               FXP fxpParam =
                  PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q);
               // edge0 VW, has perpendicular parameter U constant
               // edge1 WU, has perpendicular parameter V constant
               // edge2 UV, has perpendicular parameter W constant
               // reciprocal is the rate of change of edge-parallel parameters
               // as they are pushed into the triangle
               const unsigned int deriv = 2;

               // we know this fixed point math won't over/underflow
               FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv;

               DefinePoint(&ctx.Point[pointOffset],
                           edge > 0 ? tmp : fxpPerpParam,
                           edge == 0 ? tmp
                           : edge == 1 ? fxpPerpParam
                                       : FXP_ONE - tmp - fxpPerpParam);
            }
         }
      }
      if (!insideTessFactorOdd) {
         // Last point is the point at the center.
         DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD);
      }
   }

   if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) {
      libagx_draw_points(&ctx, p, patch, NumPoints);
      return;
   }

   {
      // Generate primitives for all the concentric rings, one side at a time
      // for each ring. +1 is so even tess includes the center point, which we
      // want now
      int numRings = ((numPointsForInsideTessFactor + 1) >> 1);

      int NumIndices = 0;
      {
         assert(numRings >= 2 && "invariant");
         int OuterPoints = numPointsForOutsideEdge[0] +
                           numPointsForOutsideEdge[1] +
                           numPointsForOutsideEdge[2];

         // Closed-form index count over all rings (3 indices per triangle).
         int numRings18 = numRings * 18;
         NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) +
                      (3 * OuterPoints) - (numRings18 * (numRings - 1)) +
                      (insideTessFactorOdd ? 3 : 0);
      }

      // Generate the draw and allocate the index buffer now that we know the size
      ctx.Index = libagx_draw(p, mode, false, patch, NumIndices);

      if (mode == LIBAGX_TESS_MODE_COUNT)
         return;

      int insideOffset = insideEdgePointBaseOffset;
      int outsideEdgePointBaseOffset = 0;

      NumIndices = 0;
      for (int ring = 1; ring < numRings; ring++) {
         int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring;
         int edge0InsidePointBaseOffset = insideOffset;
         int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
         for (int edge = 0; edge < TRI_EDGES; edge++) {
            int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge]
                                          : (numPointsForInsideEdge + 2);

            int numTriangles = numPointsForInsideEdge + outsidePoints - 2;

            int insideBaseOffset;
            int outsideBaseOffset;
            if (edge == 2) {
               // The last edge wraps around to the start of the ring; set up
               // index patching so Stitch*() can pretend it sees two plain
               // sequentially-increasing rows of points.
               ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue =
                  insideOffset;
               ctx.IndexPatchCtx.insidePointIndexBadValue =
                  numPointsForInsideEdge - 1;
               ctx.IndexPatchCtx.insidePointIndexReplacementValue =
                  edge0InsidePointBaseOffset;
               ctx.IndexPatchCtx.outsidePointIndexPatchBase =
                  ctx.IndexPatchCtx.insidePointIndexBadValue +
                  1; // past inside patched index range
               ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue =
                  outsideEdgePointBaseOffset -
                  ctx.IndexPatchCtx.outsidePointIndexPatchBase;
               ctx.IndexPatchCtx.outsidePointIndexBadValue =
                  ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints -
                  1;
               ctx.IndexPatchCtx.outsidePointIndexReplacementValue =
                  edge0OutsidePointBaseOffset;
               ctx.bUsingPatchedIndices = true;
               insideBaseOffset = 0;
               outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase;
            } else {
               insideBaseOffset = insideOffset;
               outsideBaseOffset = outsideEdgePointBaseOffset;
            }
            if (ring == 1) {
               // Outermost ring: TessFactors may differ between outside
               // edge and inside ring, so use the transition stitch.
               StitchTransition(
                  &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset,
                  insideTessFactorCtx.numHalfTessFactorPoints,
                  insideTessFactorOdd, outsideBaseOffset,
                  outsideTessFactorCtx[edge].numHalfTessFactorPoints,
                  outsideTessFactorOdd[edge]);
            } else {
               StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED,
                             /*baseIndexOffset: */ NumIndices,
                             numPointsForInsideEdge, insideBaseOffset,
                             outsideBaseOffset);
            }
            if (2 == edge) {
               ctx.bUsingPatchedIndices = false;
            }
            NumIndices += numTriangles * 3;
            outsideEdgePointBaseOffset += outsidePoints - 1;
            insideOffset += numPointsForInsideEdge - 1;
         }
      }
      if (insideTessFactorOdd) {
         // Triangulate center (a single triangle)
         DefineClockwiseTriangle(&ctx, outsideEdgePointBaseOffset,
                                 outsideEdgePointBaseOffset + 1,
                                 outsideEdgePointBaseOffset + 2,
NumIndices); + NumIndices += 3; + } + } +} + +void +libagx_tess_quad(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_partitioning partitioning, + enum libagx_tess_output_primitive output_primitive, uint patch) +{ + global float *factors = tess_factors(p, patch); + + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Ueq1 = factors[2]; + float tessFactor_Veq1 = factors[3]; + + float insideTessFactor_U = factors[4]; + float insideTessFactor_V = factors[5]; + + // TODO: fix designated initializer optimization in NIR + struct CHWTessellator ctx; + ctx.outputPrimitive = output_primitive; + ctx.Point = NULL; + ctx.Index = NULL; + ctx.mode = mode; + ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE; + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + + // Is the patch culled? + if (!(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0)) { + libagx_draw_empty(p, mode, output_primitive, patch); + return; + } + + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + bool outsideTessFactorOdd[QUAD_EDGES]; + bool insideTessFactorOdd[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + + // Clamp edge TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); + tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); + + float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), + max(tessFactor_Ueq1, tessFactor_Veq1)), + 
max(insideTessFactor_U, insideTessFactor_V)); + + insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); + insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); + // Note the above clamps map NaN to lowerBound + + // Process tessFactors + float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, + insideTessFactor_V}; + int edge, axis; + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && + (1.0f != insideTessFactor_f[axis]); + } + } else { + bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; + } + + // Save fixed point TessFactors + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); + } + + if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor[U]) && + (FXP_ONE == insideTessFactor[V]) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Ueq1]) && + (FXP_ONE == outsideTessFactor[Veq1])) { + + /* Just do minimum tess factor */ + if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) { + ctx.Index = libagx_draw(p, mode, false, patch, 6); + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + DefineClockwiseTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); + DefineClockwiseTriangle(&ctx, 1, 2, 3, 
/*indexStorageOffset*/ 3); + } else { + libagx_draw_points(&ctx, p, patch, 4); + } + + global struct libagx_tess_point *points = + libagx_heap_alloc_points(p, patch, 4); + + DefinePoint(&points[0], 0, 0); + DefinePoint(&points[1], FXP_ONE, 0); + DefinePoint(&points[2], FXP_ONE, FXP_ONE); + DefinePoint(&points[3], 0, FXP_ONE); + return; + } + } + + // Compute TessFactor-specific metadata + for (int edge = 0; edge < QUAD_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + + for (int axis = 0; axis < QUAD_AXES; axis++) { + ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], + &insideTessFactorCtx[axis]); + } + + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < QUAD_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 4; + + // inside edge offsets + for (int axis = 0; axis < QUAD_AXES; axis++) { + numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( + insideTessFactorOdd[axis], insideTessFactor[axis]); + int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor[axis] = + max(pointCountMin, numPointsForInsideTessFactor[axis]); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * + (numPointsForInsideTessFactor[V] - 2); + NumPoints += numInteriorPoints; + + if (mode != LIBAGX_TESS_MODE_COUNT) { + ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = edge & 0x1; + // don't include end, since next edge starts with it. 
+ int endPoint = numPointsForOutsideEdge[edge] - 1; + for (int p = 0; p < endPoint; p++, pointOffset++) { + FXP fxpParam; + int q = + ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order + fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + if (odd) { + DefinePoint(&ctx.Point[pointOffset], fxpParam, + (edge == 3) ? FXP_ONE : 0); + } else { + DefinePoint(&ctx.Point[pointOffset], (edge == 2) ? FXP_ONE : 0, + fxpParam); + } + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) + // spiralling toward center + int minNumPointsForTessFactor = + min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); + // note for even tess we aren't counting center point here. + int numRings = (minNumPointsForTessFactor >> 1); + + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 1 - startPoint, + numPointsForInsideTessFactor[V] - 1 - startPoint, + }; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; + int perpendicularAxisPoint = + (edge < 2) ? startPoint : endPoint[odd[0]]; + FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], + insideTessFactorOdd[odd[0]], + perpendicularAxisPoint); + + for (int p = startPoint; p < endPoint[odd[1]]; p++, + pointOffset++) // don't include end: next edge starts with + // it. + { + int q = ((edge == 1) || (edge == 2)) + ? 
p + : endPoint[odd[1]] - (p - startPoint); + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd[1]], + insideTessFactorOdd[odd[1]], q); + if (odd[1]) { + DefinePoint(&ctx.Point[pointOffset], fxpPerpParam, fxpParam); + } else { + DefinePoint(&ctx.Point[pointOffset], fxpParam, fxpPerpParam); + } + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + !insideTessFactorOdd[V]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; + for (int p = startPoint; p <= endPoint; p++, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], + insideTessFactorOdd[U], p); + DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); + } + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + !insideTessFactorOdd[U]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; + for (int p = endPoint; p >= startPoint; p--, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], + insideTessFactorOdd[V], p); + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); + } + } + } + + if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) { + libagx_draw_points(&ctx, p, patch, NumPoints); + return; + } + + /* CONNECTIVITY */ + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring. 
+1 is so even tess includes the center point + int numPointRowsToCenter[QUAD_AXES] = { + (numPointsForInsideTessFactor[U] + 1) >> 1, + (numPointsForInsideTessFactor[V] + 1) >> 1, + }; + + int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); + + /* Calculate # of indices so we can allocate */ + { + /* numPointsForInsideTessFactor >= 3 so numRings >= 2 */ + assert(numRings >= 2); + + /* Handle main case */ + int OuterPoints = + numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; + + int InnerPoints = + numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; + + int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - + (InnerPoints * 18) - (24 * numRings * (numRings - 1)); + + /* Determine major/minor axes */ + bool U_major = + (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); + unsigned M = U_major ? U : V; + unsigned m = U_major ? V : U; + + /* Handle degenerate ring */ + if (insideTessFactorOdd[m]) { + assert(numPointsForInsideTessFactor[M] >= + numPointsForInsideTessFactor[m]); + + NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - + (numPointsForInsideTessFactor[m] >> 1)); + NumIndices += (insideTessFactorOdd[M] ? 6 : 12); + } + + // Generate the draw and allocate the index buffer with the size + ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); + } + + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + int degeneratePointRing[QUAD_AXES] = { + // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, + !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, + }; + + int numPointsForOutsideEdge_[QUAD_EDGES] = { + numPointsForOutsideEdge[Ueq0], + numPointsForOutsideEdge[Veq0], + numPointsForOutsideEdge[Ueq1], + numPointsForOutsideEdge[Veq1], + }; + + int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + int NumIndices = 0; + + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 2 * ring, + numPointsForInsideTessFactor[V] - 2 * ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = (edge + 1) & 0x1; + + int numTriangles = + numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + + // We need to patch the indexing so Stitch() can think it sees 2 + // sequentially increasing rows of points, even though we have + // wrapped around to the end of the inner and outer ring's points, + // so the last point is really the first point for the ring. We make + // it so that when Stitch() calls AddIndex(), that function will do + // any necessary index adjustment. 
+ if (edge == 3) { + if (ring == degeneratePointRing[odd]) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_ + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = + outsideEdgePointBaseOffset + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + edge0OutsidePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge[odd] - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.bUsingPatchedIndices = true; + } + } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert << 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + insideBaseOffset = 
insideEdgePointBaseOffset_; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx[odd].numHalfTessFactorPoints, + insideTessFactorOdd[odd], outsideBaseOffset, + outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge[odd], insideBaseOffset, + outsideBaseOffset); + } + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; + if ((edge == 2) && (ring == degeneratePointRing[odd])) { + insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; + } else { + insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; + } + numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + insideTessFactorOdd[V]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - + (numPointsForInsideTessFactor[V] >> 1)) + << 1) + + (insideTessFactorOdd[U] ? 
1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 2; + ctx.IndexPatchCtx2.cornerCaseBadValue = + ctx.IndexPatchCtx2.baseIndexToInvert; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + outsideEdgePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + StitchRegular( + &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset + 1); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + insideTessFactorOdd[U]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - + (numPointsForInsideTessFactor[U] >> 1)) + << 1) + + (insideTessFactorOdd[V] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = insideTessFactorOdd[V] + ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE + : DIAGONALS_INSIDE_TO_OUTSIDE; + StitchRegular( + &ctx, /*bTrapezoid*/ false, diag, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } + } +} diff --git a/src/asahi/lib/shaders/tessellator.h b/src/asahi/lib/shaders/tessellator.h new file mode 100644 index 00000000000..ec674f43764 --- /dev/null +++ b/src/asahi/lib/shaders/tessellator.h @@ -0,0 +1,124 @@ +/* + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "libagx.h" + +enum libagx_tess_partitioning { + LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD, + LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN, + LIBAGX_TESS_PARTITIONING_INTEGER, +}; + +enum libagx_tess_output_primitive { + LIBAGX_TESS_OUTPUT_POINT, + LIBAGX_TESS_OUTPUT_TRIANGLE_CW, + LIBAGX_TESS_OUTPUT_TRIANGLE_CCW, +}; + +enum libagx_tess_mode { + /* Do not actually tessellate, just write the index counts */ + LIBAGX_TESS_MODE_COUNT, + + /* Tessellate using the count buffers to allocate indices */ + LIBAGX_TESS_MODE_WITH_COUNTS, + + /* Tessellate without count buffers by generating VDM index list words */ + LIBAGX_TESS_MODE_VDM, +}; + +struct libagx_tess_point { + float u; + float v; +}; +AGX_STATIC_ASSERT(sizeof(struct libagx_tess_point) == 8); + +struct libagx_tess_args { + /* Heap to allocate tessellator outputs in */ + GLOBAL(struct agx_geometry_state) heap; + + /* Patch coordinate buffer, indexed as: + * + * coord_allocs[patch_ID] + vertex_in_patch + */ + GLOBAL(struct libagx_tess_point) patch_coord_buffer; + + /* Per-patch index within the heap for the tess coords, written by the + * tessellator based on the allocated memory. + */ + GLOBAL(uint32_t) coord_allocs; + + /* Space for output draws from the tessellator. 
Either API draw calls or + * VDM control words, depending on the mode. */ + GLOBAL(uint32_t) out_draws; + + /* Tessellation control shader output buffer. */ + GLOBAL(float) tcs_buffer; + + /* Count buffer. # of indices per patch written here, then prefix summed. */ + GLOBAL(uint32_t) counts; + + /* Allocated index buffer for all patches, if we're prefix summing counts */ + GLOBAL(uint32_t) index_buffer; + + /* Address of the tess eval invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by tessellator. + */ + GLOBAL(uint32_t) statistic; + + /* Address of the tess control invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by indirect tess + * setup kernel. + */ + GLOBAL(uint32_t) tcs_statistic; + + /* For indirect draws with tessellation, the grid sizes. VS then TCS then + * tess. Allocated by the CPU and written by the tessellation + * setup indirect kernel. + */ + GLOBAL(uint32_t) grids; + + /* For indirect draws, the indirect draw descriptor. */ + GLOBAL(uint32_t) indirect; + + /* For indirect draws, the allocation for the vertex buffer. + * + * TODO: We could move these fields to an indirect setup kernel, not sure if + * it's worth it though... + */ + GLOBAL(uint64_t) vertex_output_buffer_ptr; + + /* For indirect draws, the bitfield of VS outputs */ + uint64_t vertex_outputs; + + /* Bitfield of TCS per-vertex outputs */ + uint64_t tcs_per_vertex_outputs; + + /* Default tess levels used in OpenGL when there is no TCS in the pipeline. + * Unused in Vulkan and OpenGL ES. 
+ */ + float tess_level_outer_default[4]; + float tess_level_inner_default[2]; + + /* Number of vertices in the input patch */ + uint32_t input_patch_size; + + /* Number of vertices in the TCS output patch */ + uint32_t output_patch_size; + + /* Number of patch constants written by TCS */ + uint32_t tcs_patch_constants; + + /* Number of input patches per instance of the VS/TCS */ + uint32_t patches_per_instance; + + /* Stride between tessellation factors in the TCS output buffer. */ + uint32_t tcs_stride_el; + + /* Number of patches being tessellated */ + uint32_t nr_patches; +} PACKED; +AGX_STATIC_ASSERT(sizeof(struct libagx_tess_args) == 40 * 4); diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c index 58835b77d8e..65fc3731e17 100644 --- a/src/gallium/drivers/asahi/agx_query.c +++ b/src/gallium/drivers/asahi/agx_query.c @@ -553,8 +553,8 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx, ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, false, &cb); - struct pipe_grid_info grid = {.block = {1, 1, 1}, .grid = {1, 1, 1}}; - agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, 0); /* take_ownership=true so do not unreference */ ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, true, diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 9b655713e7f..5e26b2696d7 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -16,6 +16,7 @@ #include "asahi/lib/agx_nir_passes.h" #include "asahi/lib/agx_ppp.h" #include "asahi/lib/agx_usc.h" +#include "asahi/lib/shaders/tessellator.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_serialize.h" #include "compiler/shader_enums.h" @@ -32,7 +33,6 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include 
"tessellator/p_tessellator.h" #include "util/bitscan.h" #include "util/bitset.h" #include "util/blend.h" @@ -3047,7 +3047,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs, static uint32_t agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, - void *data, size_t data_size) + uint64_t data) { size_t usc_size = agx_usc_size(12); @@ -3056,8 +3056,7 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); - uint64_t ptr = agx_pool_upload(&batch->pool, data, data_size); - agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &ptr, 8)); + agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &data, 8)); agx_usc_immediates(&b, batch, cs); assert(cs->b.info.scratch_size == 0 && "internal kernels don't spill"); @@ -3093,16 +3092,26 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs, return t.gpu; } -void -agx_launch_with_data(struct agx_batch *batch, const struct pipe_grid_info *info, - meta_shader_builder_t builder, void *key, size_t key_size, - void *data, size_t data_size) +static void +agx_launch_with_uploaded_data(struct agx_batch *batch, + const struct agx_grid *grid, + meta_shader_builder_t builder, void *key, + size_t key_size, uint64_t data) { struct agx_compiled_shader *cs = agx_build_meta_shader_internal( batch->ctx, builder, key, key_size, false, false, 0, true); - uint32_t usc = agx_build_internal_usc(batch, cs, data, data_size); - agx_launch_internal(batch, info, cs, PIPE_SHADER_COMPUTE, usc); + uint32_t usc = agx_build_internal_usc(batch, cs, data); + agx_launch_internal(batch, grid, cs, PIPE_SHADER_COMPUTE, usc); +} + +void +agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, + meta_shader_builder_t builder, void *key, size_t key_size, + void *data, size_t data_size) +{ + uint64_t upload = agx_pool_upload_aligned(&batch->pool, data, data_size, 4); + 
agx_launch_with_uploaded_data(batch, grid, builder, key, key_size, upload); } struct asahi_bg_eot @@ -4053,6 +4062,17 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, 8, &batch->geom_params_bo); } +static uint64_t +agx_indirect_buffer_ptr(struct agx_batch *batch, + const struct pipe_draw_indirect_info *indirect) +{ + assert(indirect->buffer && "drawauto already handled"); + + struct agx_resource *rsrc = agx_resource(indirect->buffer); + agx_batch_reads(batch, rsrc); + return rsrc->bo->ptr.gpu + indirect->offset; +} + static void agx_launch_gs_prerast(struct agx_batch *batch, const struct pipe_draw_info *info, @@ -4084,23 +4104,14 @@ agx_launch_gs_prerast(struct agx_batch *batch, assert(!info->primitive_restart && "should have been lowered"); - struct pipe_grid_info grid_vs = {.block = {1, 1, 1}}; - struct pipe_grid_info grid_gs = {.block = {1, 1, 1}}; - struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo}; + struct agx_grid grid_vs, grid_gs; /* Setup grids */ if (indirect) { - assert(indirect->buffer && "drawauto already handled"); - struct agx_gs_setup_indirect_key key = { .prim = info->mode, }; - const struct pipe_grid_info grid_setup = { - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }; - uint64_t ib = 0; size_t ib_extent = 0; @@ -4109,13 +4120,10 @@ agx_launch_gs_prerast(struct agx_batch *batch, &ib_extent); } - struct agx_resource *rsrc = agx_resource(indirect->buffer); - agx_batch_reads(batch, rsrc); - struct agx_gs_setup_indirect_params gsi = { .index_buffer = ib, .index_buffer_range_el = ib_extent / info->index_size, - .draw = rsrc->bo->ptr.gpu + indirect->offset, + .draw = agx_indirect_buffer_ptr(batch, indirect), .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr, .ia = batch->uniforms.input_assembly, .geom = batch->uniforms.geometry_params, @@ -4123,62 +4131,50 @@ agx_launch_gs_prerast(struct agx_batch *batch, .index_size_B = info->index_size, }; + const struct agx_grid grid_setup = 
agx_grid_direct(1, 1, 1, 1, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_gs_setup_indirect, &key, sizeof(key), &gsi, sizeof(gsi)); - /* Wrap the pool allocation in a fake resource for meta-Gallium use */ - assert(batch->geom_params_bo != NULL); - grid_vs.indirect = &grid_indirect_rsrc.base; - grid_gs.indirect = &grid_indirect_rsrc.base; + uint64_t gp = batch->uniforms.geometry_params; - unsigned param_offs = - (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu); + grid_vs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, vs_grid), 1, 1, 1); - grid_vs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, vs_grid); - - grid_gs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, gs_grid); + grid_gs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, gs_grid), 1, 1, 1); } else { - grid_vs.grid[0] = draws->count; - grid_vs.grid[1] = info->instance_count; - grid_vs.grid[2] = 1; + grid_vs = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); - grid_gs.grid[0] = - u_decomposed_prims_for_vertices(info->mode, draws->count); - grid_gs.grid[1] = info->instance_count; - grid_gs.grid[2] = 1; + grid_gs = agx_grid_direct( + u_decomposed_prims_for_vertices(info->mode, draws->count), + info->instance_count, 1, 64, 1, 1); } /* Launch the vertex shader first */ - agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage); + agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0); /* If there is a count shader, launch it and prefix sum the results. 
*/ if (gs->gs_count) { perf_debug(dev, "Geometry shader count"); - agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY, 0); unsigned words = gs->gs_count_words; - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1024, 1, 1}, - .grid = {gs->gs_count_words, 1, 1}, - }, + struct agx_grid grid = + agx_grid_direct(1024 * gs->gs_count_words, 1, 1, 1024, 1, 1); + + agx_launch(batch, &grid, agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words, sizeof(words)), - NULL, PIPE_SHADER_COMPUTE); + NULL, PIPE_SHADER_COMPUTE, 0); } /* Pre-GS shader */ - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }, - gs->pre_gs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + agx_launch(batch, &grid, gs->pre_gs, NULL, PIPE_SHADER_COMPUTE, 0); /* Pre-rast geometry shader */ - agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY, 0); } static void @@ -4249,10 +4245,8 @@ agx_draw_without_restart(struct agx_batch *batch, }; /* Unroll the index buffer for each draw */ - const struct pipe_grid_info grid_setup = { - .block = {1024, 1, 1}, - .grid = {indirect->draw_count, 1, 1}, - }; + const struct agx_grid grid_setup = + agx_grid_direct(1024 * indirect->draw_count, 1, 1, 1024, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_unroll_restart, &key, sizeof(key), &unroll, sizeof(unroll)); @@ -4310,8 +4304,9 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return true; } - /* TODO: this is sloppy, we should add a VDM kernel for this. */ - if (indirect && ctx->active_queries && ctx->prims_generated[0]) { + /* TODO: this is really sloppy, we should add a VDM kernel for this. 
*/ + if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries && + ctx->prims_generated[0]) { perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query"); return true; } @@ -4333,8 +4328,11 @@ agx_needs_passthrough_gs(struct agx_context *ctx, /* Transform feedback is layered on geometry shaders, so if transform * feedback is used, we need a GS. */ - if (ctx->stage[PIPE_SHADER_VERTEX].shader->has_xfb_info && - ctx->streamout.num_targets) { + struct agx_uncompiled_shader *last_vtx = + ctx->stage[PIPE_SHADER_TESS_EVAL].shader + ?: ctx->stage[PIPE_SHADER_VERTEX].shader; + + if (last_vtx->has_xfb_info && ctx->streamout.num_targets) { *xfb_only = true; return true; } @@ -4343,6 +4341,20 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return false; } +static enum mesa_prim +agx_tess_output_prim(struct agx_uncompiled_shader *tcs, + struct agx_uncompiled_shader *tes) +{ + if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) { + return MESA_PRIM_POINTS; + } else if (TESS_PRIMITIVE_ISOLINES == + MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) { + return MESA_PRIM_LINES; + } else { + return MESA_PRIM_TRIANGLES; + } +} + static struct agx_uncompiled_shader * agx_get_passthrough_gs(struct agx_context *ctx, struct agx_uncompiled_shader *prev_cso, @@ -4350,11 +4362,16 @@ agx_get_passthrough_gs(struct agx_context *ctx, { bool edgeflags = has_edgeflags(ctx, mode); + if (mode == MESA_PRIM_PATCHES) { + mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader, + ctx->stage[MESA_SHADER_TESS_EVAL].shader); + } + /* Only handle the polygon mode when edge flags are in use, because * nir_passthrough_gs doesn't handle transform feedback + polygon mode - * properly. Technically this can break edge flags + transform feedback but - * that's firmly in "doctor, it hurts when I do this" territory, and I'm not - * sure that's even possible to hit. TODO: Reevaluate. + * properly. 
Technically this can break edge flags + transform feedback + * but that's firmly in "doctor, it hurts when I do this" territory, and + * I'm not sure that's even possible to hit. TODO: Reevaluate. */ unsigned poly_mode = edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL; @@ -4525,35 +4542,39 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, unbind_tcs_when_done = true; } - unsigned in_vertices = draws->count; - unsigned in_patches = in_vertices / patch_vertices; + enum tess_primitive_mode mode = + MAX2(tcs->tess.primitive, tes->tess.primitive); + enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); - if (in_patches == 0) - return; + enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL + ? PIPE_TESS_SPACING_EQUAL + : spacing == TESS_SPACING_FRACTIONAL_ODD + ? PIPE_TESS_SPACING_FRACTIONAL_ODD + : PIPE_TESS_SPACING_FRACTIONAL_EVEN; - /* TCS invocation counter increments once per-patch */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], - in_patches); + bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); + enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes); - struct agx_batch *batch = agx_get_compute_batch(ctx); + enum libagx_tess_partitioning partitioning = + (enum libagx_tess_partitioning)pspacing; + + enum libagx_tess_output_primitive prim = + point_mode ? LIBAGX_TESS_OUTPUT_POINT + : !tes->tess.ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_bo *draw_bo = NULL; + bool with_counts = + indirect || ctx->stage[MESA_SHADER_GEOMETRY].shader != NULL; + size_t draw_stride = + ((!with_counts && point_mode) ? 
4 : 6) * sizeof(uint32_t); + + struct agx_batch *batch = agx_get_batch(ctx); agx_batch_init_state(batch); - struct pipe_resource *heap = - pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT, - 1024 * 1024 * 128); - - uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu; - uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu; - - unsigned unrolled_patch_count = in_patches * info->instance_count; - - uint32_t heap_water = 0; - uint32_t tcs_out_offs = heap_water; - heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4); - - agx_batch_writes(batch, agx_resource(heap), 0); - batch->incoherent_writes = true; + if (!batch->cdm.bo) { + batch->cdm = agx_encoder_allocate(batch, dev); + } uint64_t ib = 0; size_t ib_extent = 0; @@ -4573,25 +4594,29 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, agx_upload_draw_params(batch, indirect, draws, info); /* Setup parameters */ - struct agx_tess_params tess_params = { - .tcs_buffer = heap_gpu + tcs_out_offs, + uint64_t geom_state = agx_batch_geometry_state(batch); + assert((tcs->tess.output_stride & 3) == 0 && "must be aligned"); + + struct libagx_tess_args args = { + .heap = geom_state, + .tcs_stride_el = tcs->tess.output_stride / 4, + .statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]), .input_patch_size = patch_vertices, .output_patch_size = tcs->tess.output_patch_size, .tcs_patch_constants = tcs->tess.nr_patch_outputs, .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs, - .patch_coord_buffer = heap_gpu, - .patches_per_instance = in_patches, + .patch_coord_buffer = agx_resource(ctx->heap)->bo->ptr.gpu, }; - memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level, + memcpy(&args.tess_level_outer_default, ctx->default_outer_level, sizeof(ctx->default_outer_level)); - memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level, + memcpy(&args.tess_level_inner_default, 
ctx->default_inner_level, sizeof(ctx->default_inner_level)); - batch->uniforms.tess_params = - agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params)); + struct agx_grid vs_grid, tcs_grid, tess_grid; + unsigned tess_wg_size = 64; - /* Run VS+TCS as compute */ agx_upload_vbos(batch); agx_update_vs(ctx, info->index_size); agx_update_tcs(ctx, info); @@ -4605,153 +4630,184 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs; - unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, - batch->uniforms.vertex_outputs); - uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; - batch->uniforms.vertex_output_buffer_ptr = - agx_pool_upload(&batch->pool, &addr, 8); + if (indirect == NULL) { + unsigned in_patches = draws->count / patch_vertices; + if (in_patches == 0) + return; - struct pipe_grid_info vs_grid = { - .block = {1, 1, 1}, - .grid = {draws->count, info->instance_count, 1}, - }; + /* TCS invocation counter increments once per-patch */ + agx_query_increment_cpu( + ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], + in_patches); - agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX); + unsigned unrolled_patches = in_patches * info->instance_count; - struct pipe_grid_info tcs_grid = { - .block = {tcs->tess.output_patch_size, 1, 1}, - .grid = {in_patches, info->instance_count, 1}, - }; + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * tcs->tess.output_stride; - agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL); + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint32_t draw_offs = alloc; + + if (with_counts) { + alloc += draw_stride; + } else { + /* Padding added because VDM overreads */ + alloc += + (draw_stride * unrolled_patches) + 
(AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = + agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo); + + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = + (uint8_t *)blob.cpu + draw_offs + (draw_stride * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + + unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, + batch->uniforms.vertex_outputs); + uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; + batch->uniforms.vertex_output_buffer_ptr = + agx_pool_upload(&batch->pool, &addr, 8); + + vs_grid = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); + + tcs_grid = agx_grid_direct(in_patches * tcs->tess.output_patch_size, + info->instance_count, 1, + tcs->tess.output_patch_size, 1, 1); + + tess_grid = agx_grid_direct(unrolled_patches, 1, 1, tess_wg_size, 1, 1); + } else if (indirect) { + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = 
args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } + } + + uint64_t state = + agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4); + + if (indirect) { + const struct agx_grid indirect_grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + struct agx_tess_setup_indirect_key indirect_key = { + .point_mode = point_mode, + .with_counts = with_counts, + }; + + agx_launch_with_uploaded_data(batch, &indirect_grid, + agx_nir_tess_setup_indirect, &indirect_key, + sizeof(indirect_key), state); + } + + batch->uniforms.tess_params = state; + + agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX, 0); + agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0); batch->uniforms.vertex_output_buffer_ptr = 0; - agx_flush_all(ctx, "HACK"); - agx_sync_all(ctx, "HACK"); + struct agx_tessellator_key key = { + .prim = mode, + .output_primitive = prim, + .partitioning = partitioning, + }; - /* Setup batch */ - batch = agx_get_batch(ctx); + if (with_counts) { + /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), state); - enum tess_primitive_mode mode = - MAX2(tcs->tess.primitive, tes->tess.primitive); - enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); + /* Prefix sum counts, allocating index buffer space. */ + const struct agx_grid prefix_sum_grid = + agx_grid_direct(1024, 1, 1, 1024, 1, 1); - enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL - ? PIPE_TESS_SPACING_EQUAL - : spacing == TESS_SPACING_FRACTIONAL_ODD - ? 
PIPE_TESS_SPACING_FRACTIONAL_ODD - : PIPE_TESS_SPACING_FRACTIONAL_EVEN; + agx_launch_with_uploaded_data(batch, &prefix_sum_grid, + agx_nir_prefix_sum_tess, NULL, 0, state); - bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); - enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES - : mode == TESS_PRIMITIVE_QUADS - ? MESA_PRIM_QUADS - : MESA_PRIM_TRIANGLES; - enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS - : mode == TESS_PRIMITIVE_ISOLINES - ? MESA_PRIM_LINES - : MESA_PRIM_TRIANGLES; - - struct pipe_tessellator *tess = - p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode); - - struct pipe_tessellator_data data = {0}; - - /* Mem allocate */ - uint32_t patch_coord_offs_offs = heap_water; - tess_params.patch_coord_offs = heap_gpu + heap_water; - heap_water += align(4 * unrolled_patch_count, 4); - - uint32_t draws_off = heap_water; - uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4); - - uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs); - - for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) { - float *addr = - (float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch); - - struct pipe_tessellation_factors factors = { - .outer_tf = {addr[0], addr[1], addr[2], addr[3]}, - .inner_tf = {addr[4], addr[5]}, - }; - p_tessellate(tess, &factors, &data); - - /* Mem allocate indices */ - uint32_t index_off = heap_water; - uint16_t *indices = (uint16_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(*indices) * data.num_indices, 4); - - for (unsigned idx = 0; idx < data.num_indices; ++idx) { - indices[idx] = data.indices[idx]; - } - - /* Mem allocate patch coords */ - heap_water = align(heap_water, 8); - patch_offs[patch] = heap_water / 8; - float *patch_coords = (float *)(heap_cpu + heap_water); - heap_water += align(8 * data.num_domain_points, 4); - - for (unsigned p = 0; p < 
data.num_domain_points; ++p) { - patch_coords[2 * p + 0] = data.domain_points_u[p]; - patch_coords[2 * p + 1] = data.domain_points_v[p]; - } - assert(data.num_indices < 32768); - assert(data.num_domain_points < 8192); - - /* Generate a draw for the patch */ - uint32_t *desc = patch_draws + (patch * 5); - - desc[0] = data.num_indices; /* count */ - desc[1] = 1; /* instance_count */ - desc[2] = index_off / sizeof(*indices); /* start */ - desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */ - desc[4] = 0; /* start_instance */ - - /* TES invocation counter increments once per tessellated vertex */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS], - data.num_domain_points); + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; } - p_tess_destroy(tess); + + /* Now we can tessellate */ + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), state); /* Run TES as VS */ void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader; void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader; ctx->base.bind_vs_state(&ctx->base, tes_cso); ctx->in_tess = true; + ctx->in_generated_vdm = !with_counts; struct pipe_draw_info draw_info = { .mode = out_prim, - .index_size = 2, - .index.resource = heap, + .index_size = with_counts ? 4 : (point_mode ? 0 : 2), + .index.resource = (!with_counts && point_mode) ? NULL : ctx->heap, .instance_count = 1, .view_mask = info->view_mask, }; /* Wrap the pool allocation in a fake resource for meta-Gallium use */ - struct pipe_draw_indirect_info copy_indirect = { - .buffer = heap, - .offset = draws_off, - .stride = 5 * sizeof(uint32_t), - .draw_count = in_patches * info->instance_count, - }; + struct agx_resource indirect_rsrc = {.bo = draw_bo}; - /* Tess param upload is deferred to draw_vbo since the batch may change - * within draw_vbo for various reasons, so we can't upload it to the batch - * upfront. 
- */ - memcpy(&ctx->tess_params, &tess_params, sizeof(tess_params)); + struct pipe_draw_indirect_info copy_indirect = { + .buffer = &indirect_rsrc.base, + .offset = args.out_draws - draw_bo->ptr.gpu, + .stride = draw_stride, + .draw_count = 1, + }; ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1); /* Restore vertex state */ ctx->base.bind_vs_state(&ctx->base, vs_cso); + ctx->in_generated_vdm = false; ctx->in_tess = false; - pipe_resource_reference(&heap, NULL); - if (unbind_tcs_when_done) { ctx->base.bind_tcs_state(&ctx->base, NULL); } @@ -4860,13 +4916,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } - /* TODO: stop cheating */ - if (info->mode == MESA_PRIM_PATCHES && indirect) { - perf_debug_ctx(ctx, "indirect tessellation"); - util_draw_indirect(pctx, info, drawid_offset, indirect); - return; - } - /* TODO: stop cheating */ if (ctx->active_queries && !ctx->active_draw_without_restart && (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || @@ -4878,11 +4927,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } - if (info->mode == MESA_PRIM_PATCHES) { - agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws); - return; - } - bool xfb_passthrough = false; if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) { agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
batch->uniforms.tess_params = agx_pool_upload( - &batch->pool, &ctx->tess_params, sizeof(ctx->tess_params)); - } - if (IS_DIRTY(VERTEX)) { agx_upload_vbos(batch); } @@ -5143,63 +5187,75 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, uint8_t *out = agx_encode_state(batch, batch->vdm.current); - if (info->index_size) { - agx_push(out, VDM_STATE, cfg) - cfg.restart_index_present = true; - - agx_push(out, VDM_STATE_RESTART_INDEX, cfg) - cfg.value = info->restart_index; - } - - agx_push(out, INDEX_LIST, cfg) { - cfg.primitive = agx_primitive_for_pipe(info->mode); - - if (indirect != NULL) { - cfg.indirect_buffer_present = true; - } else { - cfg.instance_count_present = true; - cfg.index_count_present = true; - cfg.start_present = true; - } - - if (info->index_size) { - cfg.restart_enable = info->primitive_restart; - cfg.index_buffer_hi = (ib >> 32); - cfg.index_size = agx_translate_index_size(info->index_size); - cfg.index_buffer_present = true; - cfg.index_buffer_size_present = true; - } - } - - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { - cfg.buffer_lo = ib & BITFIELD_MASK(32); - } - } - - if (indirect) { + if (ctx->in_generated_vdm) { struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; - agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { - cfg.address_hi = address >> 32; - cfg.address_lo = address & BITFIELD_MASK(32); + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = address & BITFIELD_MASK(32); + cfg.target_hi = address >> 32; + cfg.with_return = true; } } else { - agx_push(out, INDEX_LIST_COUNT, cfg) - cfg.count = draws->count; - agx_push(out, INDEX_LIST_INSTANCES, cfg) - cfg.count = info->instance_count; + if (info->index_size && info->primitive_restart) { + agx_push(out, VDM_STATE, cfg) + cfg.restart_index_present = true; - agx_push(out, INDEX_LIST_START, cfg) { - cfg.start = info->index_size ? 
draws->index_bias : draws->start; + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) + cfg.value = info->restart_index; } - } - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { - cfg.size = ib_extent; + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = agx_primitive_for_pipe(info->mode); + + if (indirect != NULL) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (info->index_size) { + cfg.restart_enable = info->primitive_restart; + cfg.index_buffer_hi = (ib >> 32); + cfg.index_size = agx_translate_index_size(info->index_size); + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & BITFIELD_MASK(32); + } + } + + if (indirect) { + struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); + uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; + + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = address >> 32; + cfg.address_lo = address & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) + cfg.count = draws->count; + + agx_push(out, INDEX_LIST_INSTANCES, cfg) + cfg.count = info->instance_count; + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = info->index_size ? 
draws->index_bias : draws->start; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = ib_extent; + } } } @@ -5242,7 +5298,7 @@ agx_texture_barrier(struct pipe_context *pipe, unsigned flags) } void -agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc) { @@ -5253,11 +5309,7 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, uint8_t *out = batch->cdm.current; agx_push(out, CDM_LAUNCH_WORD_0, cfg) { - if (info->indirect) - cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; - else - cfg.mode = AGX_CDM_MODE_DIRECT; - + cfg.mode = grid->mode; cfg.uniform_register_count = cs->b.info.push_count; cfg.preshader_register_count = cs->b.info.nr_preamble_gprs; cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs); @@ -5275,32 +5327,25 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, ; } - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - uint64_t addr = indirect->bo->ptr.gpu + info->indirect_offset; - - agx_push(out, CDM_INDIRECT, cfg) { - cfg.address_hi = addr >> 32; - cfg.address_lo = addr & BITFIELD64_MASK(32); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid->global[0]; + cfg.y = grid->global[1]; + cfg.z = grid->global[2]; } } else { - uint32_t size[3]; - for (unsigned d = 0; d < 3; ++d) { - size[d] = ((info->grid[d] - 1) * info->block[d]) + - (info->last_block[d] ?: info->block[d]); - } - - agx_push(out, CDM_GLOBAL_SIZE, cfg) { - cfg.x = size[0]; - cfg.y = size[1]; - cfg.z = size[2]; + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid->indirect >> 32; + cfg.address_lo = grid->indirect & BITFIELD64_MASK(32); } } - agx_push(out, CDM_LOCAL_SIZE, cfg) { - cfg.x = info->block[0]; - cfg.y = 
info->block[1]; - cfg.z = info->block[2]; + if (grid->mode != AGX_CDM_MODE_INDIRECT_LOCAL) { + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = grid->local[0]; + cfg.y = grid->local[1]; + cfg.z = grid->local[2]; + } } agx_push(out, CDM_BARRIER, cfg) { @@ -5352,9 +5397,9 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, } void -agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, struct agx_linked_shader *linked, - enum pipe_shader_type stage) + enum pipe_shader_type stage, unsigned variable_shared_mem) { struct agx_context *ctx = batch->ctx; @@ -5362,18 +5407,17 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, * available in GPU memory. This is either the indirect buffer, or just a * buffer we upload ourselves if not indirect. */ - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - agx_batch_reads(batch, indirect); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + uint32_t groups[3] = { + grid->global[0] / grid->local[0], + grid->global[1] / grid->local[1], + grid->global[2] / grid->local[2], + }; batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = - indirect->bo->ptr.gpu + info->indirect_offset; + agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4); } else { - static_assert(sizeof(info->grid) == 12, - "matches indirect dispatch buffer"); - - batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = agx_pool_upload_aligned( - &batch->pool, info->grid, sizeof(info->grid), 4); + batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid->indirect; } util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) { @@ -5403,11 +5447,10 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, } #endif - uint32_t usc = - agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, - info->variable_shared_mem, subgroups_per_core); + uint32_t usc = 
agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, + variable_shared_mem, subgroups_per_core); - agx_launch_internal(batch, info, cs, stage, usc); + agx_launch_internal(batch, grid, cs, stage, usc); } static void @@ -5454,7 +5497,29 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) struct agx_compiled_shader *cs = _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data; - agx_launch(batch, info, cs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = { + .local[0] = info->block[0], + .local[1] = info->block[1], + .local[2] = info->block[2], + }; + + if (info->indirect) { + struct agx_resource *indirect = agx_resource(info->indirect); + agx_batch_reads(batch, indirect); + + grid.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + grid.indirect = indirect->bo->ptr.gpu + info->indirect_offset; + } else { + grid.mode = AGX_CDM_MODE_DIRECT; + + for (unsigned d = 0; d < 3; ++d) { + grid.global[d] = ((info->grid[d] - 1) * info->block[d]) + + (info->last_block[d] ?: info->block[d]); + } + } + + agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, + info->variable_shared_mem); /* TODO: Dirty tracking? 
*/ agx_dirty_all(ctx); diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 3d139a3c71b..949e343cecc 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -682,8 +682,8 @@ struct agx_context { struct util_debug_callback debug; bool is_noop; - struct agx_tess_params tess_params; bool in_tess; + bool in_generated_vdm; struct blitter_context *blitter; struct asahi_blitter compute_blitter; @@ -781,19 +781,67 @@ struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder, void *data, size_t data_size); -void agx_launch_with_data(struct agx_batch *batch, - const struct pipe_grid_info *info, +struct agx_grid { + /* Tag for the union */ + enum agx_cdm_mode mode; + + /* If mode != INDIRECT_LOCAL, the local size */ + uint32_t local[3]; + + union { + /* If mode == DIRECT, the global size. This is *not* multiplied by the + * local size, differing from the API definition but matching AGX. 
+ */ + uint32_t global[3]; + + /* Address of the indirect buffer if mode != DIRECT */ + uint64_t indirect; + }; +}; + +static inline const struct agx_grid +agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z, + uint32_t local_x, uint32_t local_y, uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_DIRECT, + .global = {global_x, global_y, global_z}, + .local = {local_x, local_y, local_z}, + }; +} + +static inline const struct agx_grid +agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y, + uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_GLOBAL, + .local = {local_x, local_y, local_z}, + .indirect = indirect, + }; +} + +static inline const struct agx_grid +agx_grid_indirect_local(uint64_t indirect) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_LOCAL, + .indirect = indirect, + }; +} + +void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, meta_shader_builder_t builder, void *key, size_t key_size, void *data, size_t data_size); -void agx_launch_internal(struct agx_batch *batch, - const struct pipe_grid_info *info, +void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc); -void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +void agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, - struct agx_linked_shader *linked, enum pipe_shader_type stage); + struct agx_linked_shader *linked, enum pipe_shader_type stage, + unsigned variable_shared_mem); void agx_init_query_functions(struct pipe_context *ctx);