dzn: Fix triangle-fan emulation

We were completely ignoring the primitive-restart case in the
index-rewrite logic used to emulate triangle fans. Unfortunately, this
case is way more complicated than a regular index rewrite:

- we need to skip all primitive-restart entries when turning the triangle
  fan into a triangle list, which implies serializing the index buffer
  rewrite procedure (at least I didn't find any clever way to parallelize
  things)
- the number of triangles can no longer be extrapolated from the number
  of indices in the original index buffer, thus forcing us to lower
  direct indexed draws into indirect draws and patching the indexCount
  value when the new index buffer is forged

Reviewed-by: Jesse Natalie <jenatali@microsoft.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16971>
This commit is contained in:
Boris Brezillon
2022-06-13 16:22:38 +02:00
committed by Marge Bot
parent 741b5ded49
commit 91f3c7a9fb
5 changed files with 451 additions and 78 deletions

View File

@@ -93,6 +93,8 @@ dzn_nir_indirect_draw_shader(enum dzn_indirect_draw_type type)
"draw_count_triangle_fan",
"indexed_draw_triangle_fan",
"indexed_draw_count_triangle_fan",
"indexed_draw_triangle_fan_prim_restart",
"indexed_draw_count_triangle_fan_prim_restart",
};
assert(type < ARRAY_SIZE(type_str));
@@ -100,15 +102,22 @@ dzn_nir_indirect_draw_shader(enum dzn_indirect_draw_type type)
bool indexed = type == DZN_INDIRECT_INDEXED_DRAW ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT ||
type == DZN_INDIRECT_INDEXED_DRAW_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN;
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_TRIANGLE_FAN_PRIM_RESTART ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN_PRIM_RESTART;
bool triangle_fan = type == DZN_INDIRECT_DRAW_TRIANGLE_FAN ||
type == DZN_INDIRECT_DRAW_COUNT_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN;
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_TRIANGLE_FAN_PRIM_RESTART ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN_PRIM_RESTART;
bool indirect_count = type == DZN_INDIRECT_DRAW_COUNT ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT ||
type == DZN_INDIRECT_DRAW_COUNT_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN;
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN_PRIM_RESTART;
bool prim_restart = type == DZN_INDIRECT_INDEXED_DRAW_TRIANGLE_FAN_PRIM_RESTART ||
type == DZN_INDIRECT_INDEXED_DRAW_COUNT_TRIANGLE_FAN_PRIM_RESTART;
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
dxil_get_nir_compiler_options(),
@@ -130,10 +139,11 @@ dzn_nir_indirect_draw_shader(enum dzn_indirect_draw_type type)
nir_ssa_def *exec_buf_desc =
dzn_nir_create_bo_desc(&b, nir_var_mem_ssbo, 0, 2, "exec_buf", ACCESS_NON_READABLE);
unsigned params_size =
triangle_fan ?
sizeof(struct dzn_indirect_draw_triangle_fan_rewrite_params) :
sizeof(struct dzn_indirect_draw_rewrite_params);
unsigned params_size;
if (triangle_fan)
params_size = sizeof(struct dzn_indirect_draw_triangle_fan_rewrite_params);
else
params_size = sizeof(struct dzn_indirect_draw_rewrite_params);
nir_ssa_def *params =
nir_load_ubo(&b, params_size / 4, 32,
@@ -209,35 +219,64 @@ dzn_nir_indirect_draw_shader(enum dzn_indirect_draw_type type)
nir_iadd(&b, nir_channel(&b, params, 2),
nir_imul(&b, triangle_fan_index_buf_stride, index));
nir_ssa_def *triangle_fan_exec_vals[9] = { 0 };
uint32_t triangle_fan_exec_param_count = 0;
nir_ssa_def *addr_lo_overflow =
nir_ult(&b, triangle_fan_index_buf_addr_lo, nir_channel(&b, params, 2));
nir_ssa_def *triangle_fan_index_buf_addr_hi =
nir_iadd(&b, nir_channel(&b, params, 3),
nir_bcsel(&b, addr_lo_overflow, nir_imm_int(&b, 1), nir_imm_int(&b, 0)));
nir_ssa_def *triangle_fan_exec_vals[] = {
triangle_fan_index_buf_addr_lo,
triangle_fan_index_buf_addr_hi,
indexed ? nir_channel(&b, draw_info1, 2) : nir_imm_int(&b, 0),
triangle_count,
nir_imm_int(&b, 1),
nir_imm_int(&b, 1),
};
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = triangle_fan_index_buf_addr_lo;
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = triangle_fan_index_buf_addr_hi;
assert(sizeof(struct dzn_indirect_triangle_fan_rewrite_index_exec_params) == (ARRAY_SIZE(triangle_fan_exec_vals) * 4));
if (prim_restart) {
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = nir_channel(&b, draw_info1, 2);
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = nir_channel(&b, draw_info1, 0);
uint32_t index_count_offset =
offsetof(struct dzn_indirect_triangle_fan_draw_exec_params, indexed_draw.index_count);
nir_ssa_def *exec_buf_start =
nir_load_ubo(&b, 2, 32,
params_desc, nir_imm_int(&b, 16),
.align_mul = 4, .align_offset = 0, .range_base = 0, .range = ~0);
nir_ssa_def *exec_buf_start_lo =
nir_iadd(&b, nir_imm_int(&b, index_count_offset),
nir_iadd(&b, nir_channel(&b, exec_buf_start, 0),
nir_imul(&b, exec_stride, index)));
addr_lo_overflow = nir_ult(&b, exec_buf_start_lo, nir_channel(&b, exec_buf_start, 0));
nir_ssa_def *exec_buf_start_hi =
nir_iadd(&b, nir_channel(&b, exec_buf_start, 0),
nir_bcsel(&b, addr_lo_overflow, nir_imm_int(&b, 1), nir_imm_int(&b, 0)));
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = exec_buf_start_lo;
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = exec_buf_start_hi;
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = nir_imm_int(&b, 1);
} else {
triangle_fan_exec_vals[triangle_fan_exec_param_count++] =
indexed ? nir_channel(&b, draw_info1, 2) : nir_imm_int(&b, 0);
triangle_fan_exec_vals[triangle_fan_exec_param_count++] =
triangle_count;
}
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = nir_imm_int(&b, 1);
triangle_fan_exec_vals[triangle_fan_exec_param_count++] = nir_imm_int(&b, 1);
unsigned rewrite_index_exec_params =
prim_restart ?
sizeof(struct dzn_indirect_triangle_fan_prim_restart_rewrite_index_exec_params) :
sizeof(struct dzn_indirect_triangle_fan_rewrite_index_exec_params);
nir_ssa_def *triangle_fan_exec_stride =
nir_imm_int(&b, sizeof(struct dzn_indirect_triangle_fan_rewrite_index_exec_params));
nir_imm_int(&b, rewrite_index_exec_params);
nir_ssa_def *triangle_fan_exec_offset =
nir_imul(&b, triangle_fan_exec_stride, index);
nir_store_ssbo(&b, nir_vec(&b, &triangle_fan_exec_vals[0], 4),
triangle_fan_exec_buf_desc, triangle_fan_exec_offset,
.write_mask = 0xf, .access = ACCESS_NON_READABLE, .align_mul = 4);
nir_store_ssbo(&b, nir_vec(&b, &triangle_fan_exec_vals[4], 2),
triangle_fan_exec_buf_desc,
nir_iadd_imm(&b, triangle_fan_exec_offset, 16),
.write_mask = 0x3, .access = ACCESS_NON_READABLE, .align_mul = 4);
for (uint32_t i = 0; i < triangle_fan_exec_param_count; i += 4) {
unsigned comps = MIN2(triangle_fan_exec_param_count - i, 4);
uint32_t mask = (1 << comps) - 1;
nir_store_ssbo(&b, nir_vec(&b, &triangle_fan_exec_vals[i], comps),
triangle_fan_exec_buf_desc,
nir_iadd_imm(&b, triangle_fan_exec_offset, i * 4),
.write_mask = mask, .access = ACCESS_NON_READABLE, .align_mul = 4);
}
nir_ssa_def *ibview_vals[] = {
triangle_fan_index_buf_addr_lo,
@@ -271,6 +310,172 @@ dzn_nir_indirect_draw_shader(enum dzn_indirect_draw_type type)
return b.shader;
}
/*
 * Build an internal compute shader that rewrites a triangle-fan index
 * buffer containing primitive-restart entries into a triangle-list index
 * buffer, and reports how many indices were emitted so the caller can
 * patch the indirect draw parameters.
 *
 * The rewrite is inherently serial (see the pseudo-code below): restart
 * entries must be skipped one at a time, so the loop walks the old index
 * buffer with a single read cursor.
 *
 * old_index_size: byte size of one entry in the source index buffer;
 * must be 2 (16-bit indices) or 4 (32-bit indices). Emitted indices are
 * always stored as 32-bit values (offsets scale by sizeof(uint32_t)).
 */
nir_shader *
dzn_nir_triangle_fan_prim_restart_rewrite_index_shader(uint8_t old_index_size)
{
assert(old_index_size == 2 || old_index_size == 4);
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
dxil_get_nir_compiler_options(),
"dzn_meta_triangle_prim_rewrite_index(old_index_size=%d)",
old_index_size);
b.shader->info.internal = true;
/* Bindings: UBO 0 = rewrite params; SSBO 1 = new index buffer
 * (write-only); SSBO 2 = old index buffer (read-only); SSBO 3 = slot
 * receiving the final index count (write-only).
 */
nir_ssa_def *params_desc =
dzn_nir_create_bo_desc(&b, nir_var_mem_ubo, 0, 0, "params", 0);
nir_ssa_def *new_index_buf_desc =
dzn_nir_create_bo_desc(&b, nir_var_mem_ssbo, 0, 1,
"new_index_buf", ACCESS_NON_READABLE);
nir_ssa_def *old_index_buf_desc =
dzn_nir_create_bo_desc(&b, nir_var_mem_ssbo, 0, 2,
"old_index_buf", ACCESS_NON_WRITEABLE);
nir_ssa_def *new_index_count_ptr_desc =
dzn_nir_create_bo_desc(&b, nir_var_mem_ssbo, 0, 3,
"new_index_count_ptr", ACCESS_NON_READABLE);
nir_ssa_def *params =
nir_load_ubo(&b, sizeof(struct dzn_triangle_fan_prim_restart_rewrite_index_params) / 4, 32,
params_desc, nir_imm_int(&b, 0),
.align_mul = 4, .align_offset = 0, .range_base = 0, .range = ~0);
/* Primitive-restart magic value: all-ones at the source index width. */
nir_ssa_def *prim_restart_val =
nir_imm_int(&b, old_index_size == 2 ? 0xffff : 0xffffffff);
/* Loop state lives in NIR local variables: the read cursor into the old
 * buffer (starts at params channel 0, i.e. firstIndex in the pseudo-code
 * below), the write cursor into the new buffer, and the fan's provoking
 * vertex index (index0). index0 == prim_restart_val means "no starting
 * point yet".
 */
nir_variable *old_index_ptr_var =
nir_local_variable_create(b.impl, glsl_uint_type(), "old_index_ptr_var");
nir_ssa_def *old_index_ptr = nir_channel(&b, params, 0);
nir_store_var(&b, old_index_ptr_var, old_index_ptr, 1);
nir_variable *new_index_ptr_var =
nir_local_variable_create(b.impl, glsl_uint_type(), "new_index_ptr_var");
nir_store_var(&b, new_index_ptr_var, nir_imm_int(&b, 0), 1);
nir_ssa_def *old_index_count = nir_channel(&b, params, 1);
nir_variable *index0_var =
nir_local_variable_create(b.impl, glsl_uint_type(), "index0_var");
nir_store_var(&b, index0_var, prim_restart_val, 1);
/*
 * Filter out all primitive-restart magic values, and generate a triangle list
 * from the triangle fan definition.
 *
 * Basically:
 *
 * new_index_ptr = 0;
 * index0 = restart_prim_value; // 0xffff or 0xffffffff
 * for (old_index_ptr = firstIndex; old_index_ptr < indexCount;) {
 * // If we have no starting-point we need at least 3 vertices,
 * // otherwise we can do with two. If there's not enough vertices
 * // to form a primitive, we just bail out.
 * min_indices = index0 == restart_prim_value ? 3 : 2;
 * if (old_index_ptr + min_indices > firstIndex + indexCount)
 * break;
 *
 * if (index0 == restart_prim_value) {
 * // No starting point, skip all entries until we have a
 * // non-primitive-restart value
 * index0 = old_index_buf[old_index_ptr++];
 * continue;
 * }
 *
 * // If at least one index contains the primitive-restart pattern,
 * // ignore this triangle, and skip the unused entries
 * if (old_index_buf[old_index_ptr + 1] == restart_prim_value) {
 * old_index_ptr += 2;
 * continue;
 * }
 * if (old_index_buf[old_index_ptr] == restart_prim_value) {
 * old_index_ptr++;
 * continue;
 * }
 *
 * // We have a valid primitive, queue it to the new index buffer
 * new_index_buf[new_index_ptr++] = old_index_buf[old_index_ptr];
 * new_index_buf[new_index_ptr++] = old_index_buf[old_index_ptr + 1];
 * new_index_buf[new_index_ptr++] = index0;
 * }
 *
 * expressed in NIR, which admittedly is not super easy to grasp.
 * TODO: Might be a good thing to use the CL compiler we have and turn
 * those shaders into CL kernels.
 */
nir_push_loop(&b);
old_index_ptr = nir_load_var(&b, old_index_ptr_var);
nir_ssa_def *index0 = nir_load_var(&b, index0_var);
/* Need 3 entries when we have no provoking vertex yet, 2 otherwise. */
nir_ssa_def *read_index_count =
nir_bcsel(&b, nir_ieq(&b, index0, prim_restart_val),
nir_imm_int(&b, 3), nir_imm_int(&b, 2));
/* Bail out when fewer than read_index_count entries remain. */
nir_push_if(&b, nir_ult(&b, old_index_count, nir_iadd(&b, old_index_ptr, read_index_count)));
nir_jump(&b, nir_jump_break);
nir_pop_if(&b, NULL);
nir_ssa_def *old_index_offset =
nir_imul_imm(&b, old_index_ptr, old_index_size);
/* No provoking vertex yet: consume one entry and make it index0. */
nir_push_if(&b, nir_ieq(&b, index0, prim_restart_val));
/* For 16-bit indices, load the enclosing dword (offset rounded down to 4)
 * and pick the low or high half based on bit 1 of the byte offset.
 */
nir_ssa_def *index_val =
nir_load_ssbo(&b, 1, 32, old_index_buf_desc,
old_index_size == 2 ? nir_iand_imm(&b, old_index_offset, ~3ULL) : old_index_offset,
.align_mul = 4);
if (old_index_size == 2) {
index_val = nir_bcsel(&b,
nir_ieq_imm(&b, nir_iand_imm(&b, old_index_offset, 0x2), 0),
nir_iand_imm(&b, index_val, 0xffff),
nir_ushr_imm(&b, index_val, 16));
}
nir_store_var(&b, index0_var, index_val, 1);
nir_store_var(&b, old_index_ptr_var, nir_iadd_imm(&b, old_index_ptr, 1), 1);
nir_jump(&b, nir_jump_continue);
nir_pop_if(&b, NULL);
/* Load the next two entries (the triangle's other two vertices). Same
 * dword-aligned load + half-select trick for 16-bit indices.
 */
nir_ssa_def *index12 =
nir_load_ssbo(&b, 2, 32, old_index_buf_desc,
old_index_size == 2 ? nir_iand_imm(&b, old_index_offset, ~3ULL) : old_index_offset,
.align_mul = 4);
if (old_index_size == 2) {
nir_ssa_def *indices[] = {
nir_iand_imm(&b, nir_channel(&b, index12, 0), 0xffff),
nir_ushr_imm(&b, nir_channel(&b, index12, 0), 16),
nir_iand_imm(&b, nir_channel(&b, index12, 1), 0xffff),
};
index12 =
nir_bcsel(&b,
nir_ieq_imm(&b, nir_iand_imm(&b, old_index_offset, 0x2), 0),
nir_vec2(&b, indices[0], indices[1]),
nir_vec2(&b, indices[1], indices[2]));
}
/* Second entry is a restart: drop both entries and reset the fan. */
nir_push_if(&b, nir_ieq(&b, nir_channel(&b, index12, 1), prim_restart_val));
nir_store_var(&b, old_index_ptr_var, nir_iadd_imm(&b, old_index_ptr, 2), 1);
nir_store_var(&b, index0_var, prim_restart_val, 1);
nir_jump(&b, nir_jump_continue);
nir_push_else(&b, NULL);
nir_store_var(&b, old_index_ptr_var, nir_iadd_imm(&b, old_index_ptr, 1), 1);
/* First entry is a restart: drop it and reset the fan. */
nir_push_if(&b, nir_ieq(&b, nir_channel(&b, index12, 0), prim_restart_val));
nir_store_var(&b, index0_var, prim_restart_val, 1);
nir_jump(&b, nir_jump_continue);
nir_push_else(&b, NULL);
/* Valid triangle: emit (idx[n], idx[n+1], index0) — a rotation of the
 * fan triangle (index0, idx[n], idx[n+1]) — as three 32-bit indices.
 */
nir_ssa_def *new_indices =
nir_vec3(&b, nir_channel(&b, index12, 0), nir_channel(&b, index12, 1), index0);
nir_ssa_def *new_index_ptr = nir_load_var(&b, new_index_ptr_var);
nir_ssa_def *new_index_offset = nir_imul_imm(&b, new_index_ptr, sizeof(uint32_t));
nir_store_ssbo(&b, new_indices, new_index_buf_desc,
new_index_offset,
.write_mask = 7, .access = ACCESS_NON_READABLE, .align_mul = 4);
nir_store_var(&b, new_index_ptr_var, nir_iadd_imm(&b, new_index_ptr, 3), 1);
nir_pop_if(&b, NULL);
nir_pop_if(&b, NULL);
nir_pop_loop(&b, NULL);
/* Publish the number of indices written so the draw's indexCount can be
 * patched (direct draws were lowered to indirect for this reason).
 */
nir_store_ssbo(&b, nir_load_var(&b, new_index_ptr_var),
new_index_count_ptr_desc, nir_imm_int(&b, 0),
.write_mask = 1, .access = ACCESS_NON_READABLE, .align_mul = 4);
return b.shader;
}
nir_shader *
dzn_nir_triangle_fan_rewrite_index_shader(uint8_t old_index_size)
{