radeonsi: rewrite the prefix sum computation for shader culling
Instead of storing the vertex mask per wave into LDS and then computing the prefix sum, store 8-bit bitcounts (vertex counts) of the vertex masks into LDS. This allows us to compute the sum using v_sad_u8, which computes a sum of 4 i8vec4 components in one instruction. Each i8vec4 of vertex counts is loaded in parallel threads (one dword per thread) instead of all being loaded in thread 0, and readlane copies them to SGPRs instead of readfirstlane. LDS is no longer initialized before culling. Instead, the counts for inactive waves are masked with AND later. Incorrect old comments are also fixed. This change removes 80 bytes from the code size, and it allows increasing the workgroup size from 128 to 256 (which is the main motivation for this). Now changing the workgroup size with wave64 has no effect on the code size. Switching to wave32 with 8 waves even generates slightly smaller code than wave64 with 4 waves. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
This commit is contained in:
@@ -3293,17 +3293,21 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, uns
|
||||
return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
|
||||
}
|
||||
|
||||
/* If param is i64 and bitwidth <= 32, the return value will be i32.
 *
 * Extract a bit field from "param": shift right by "rshift", optionally mask the
 * result to "bitwidth" bits, and truncate to i32 under the condition stated above.
 *
 * NOTE(review): this text is a rendered diff. Where two near-identical statements
 * follow each other, the one using ctx->i32 / "unsigned mask" appears to be the
 * removed line and the one using LLVMTypeOf(param) / "uint64_t mask" the added
 * line -- confirm against the real file before editing.
 */
|
||||
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
|
||||
unsigned bitwidth)
|
||||
{
|
||||
LLVMValueRef value = param;
|
||||
/* Move the field down to bit 0; skipped when rshift is 0. */
if (rshift)
|
||||
value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), "");
|
||||
value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
|
||||
|
||||
/* Clear the bits above the field. The mask is skipped when rshift + bitwidth >= 32.
 * NOTE(review): for an i64 param with bitwidth < 32 and rshift + bitwidth >= 32, no
 * mask is applied and the truncation below keeps the low 32 bits of the shifted
 * value, which may include bits above the field -- confirm no caller relies on this.
 */
if (rshift + bitwidth < 32) {
|
||||
unsigned mask = (1 << bitwidth) - 1;
|
||||
value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), "");
|
||||
uint64_t mask = (1ull << bitwidth) - 1;
|
||||
value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
|
||||
}
|
||||
|
||||
/* Narrow the result to i32 when the source is i64 and the field is at most 32 bits wide. */
if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
|
||||
value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
|
||||
return value;
|
||||
}
|
||||
|
||||
@@ -4723,64 +4727,6 @@ void ac_build_s_endpgm(struct ac_llvm_context *ctx)
|
||||
LLVMBuildCall(ctx->builder, code, NULL, 0, "");
|
||||
}
|
||||
|
||||
/* Return ac_build_bit_count() of the bits of "mask" that lie below bit position
 * "index", i.e. of mask & ((1 << index) - 1), evaluated in the integer type of "mask".
 *
 * "index" is zero-extended to the type of "mask" before the shift.
 * NOTE(review): LLVM's shl yields poison when the shift amount is >= the bit width,
 * so "index" presumably has to be smaller than the width of "mask" -- confirm
 * against callers.
 */
LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index)
|
||||
{
|
||||
LLVMBuilderRef builder = ctx->builder;
|
||||
LLVMTypeRef type = LLVMTypeOf(mask);
|
||||
|
||||
/* bit = 1 << index: the single bit at position "index". */
LLVMValueRef bit =
|
||||
LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), LLVMBuildZExt(builder, index, type, ""), "");
|
||||
/* bit - 1 sets every bit below position "index". */
LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), "");
|
||||
/* Keep only the mask bits that precede "index". */
LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, "");
|
||||
return ac_build_bit_count(ctx, prefix_mask);
|
||||
}
|
||||
|
||||
/* Compute the prefix sum of the "mask" bit array with 128 elements (bits).
 *
 * mask[0] supplies bits 0..63 and mask[1] supplies bits 64..127; both are combined
 * with 64-bit constants here. "index" is compared against i32 constants, so it is
 * expected to be an i32 value. The result is the sum of the ac_build_bit_count()
 * values of the two halves after each has been restricted to the bits that precede
 * "index".
 */
|
||||
LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2],
|
||||
LLVMValueRef index)
|
||||
{
|
||||
LLVMBuilderRef builder = ctx->builder;
|
||||
#if 0
|
||||
/* Reference version using i128. */
|
||||
LLVMValueRef input_mask =
|
||||
LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");
|
||||
|
||||
return ac_prefix_bitcount(ctx, input_mask, index);
|
||||
#else
|
||||
/* Optimized version using 2 64-bit masks. */
|
||||
LLVMValueRef is_hi, is_0, c64, c128, all_bits;
|
||||
LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2];
|
||||
|
||||
/* Compute the 128-bit prefix mask. */
|
||||
c64 = LLVMConstInt(ctx->i32, 64, 0);
|
||||
c128 = LLVMConstInt(ctx->i32, 128, 0);
|
||||
all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
|
||||
/* The first index that can have non-zero high bits in the prefix mask is 65. */
|
||||
is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, "");
|
||||
is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
|
||||
/* Bit count of the whole low half; used below when index > 64. */
mask_bcnt0 = ac_build_bit_count(ctx, mask[0]);
|
||||
|
||||
/* i == 0 handles the low half (shift = 64 - index), i == 1 the high half (shift = 128 - index). */
for (unsigned i = 0; i < 2; i++) {
|
||||
shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, "");
|
||||
/* For i==0, index==0, the right shift by 64 doesn't give the desired result,
|
||||
* so we handle it by the is_0 select.
|
||||
* For i==1, index==64, same story, so we handle it by the last is_hi select.
|
||||
* For i==0, index==64, we shift by 0, which is what we want.
|
||||
*/
|
||||
/* The shift amount is widened to i64 to match the type of all_bits. */
prefix_mask[i] =
|
||||
LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), "");
|
||||
prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], "");
|
||||
prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]);
|
||||
}
|
||||
|
||||
/* index == 0: no bit precedes the first one, so the low half contributes 0. */
prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], "");
|
||||
/* index > 64: every bit of the low half is in the prefix, so use its full bit count. */
prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], "");
|
||||
/* index <= 64: no bit of the high half is in the prefix, so it contributes 0. */
prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, "");
|
||||
|
||||
return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert triangle strip indices to triangle indices. This is used to decompose
|
||||
* triangle strips into triangles.
|
||||
|
Reference in New Issue
Block a user