amd/common: scan/reduce across waves of a workgroup

Order-aware scan/reduce can trade off LDS traffic against external atomic
memory traffic in producer/consumer compute shaders.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
This commit is contained in:
Nicolai Hähnle
2018-05-23 22:09:27 +02:00
parent 3963402fd3
commit 300876a9a7
2 changed files with 227 additions and 4 deletions

View File

@@ -524,6 +524,42 @@ ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
/* Builds a reduction of src using the operation op and returns the result.
 * NOTE(review): the meaning of cluster_size (presumably the number of lanes
 * reduced together) is not visible in this header; confirm against the
 * implementation. */
LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size);
/**
* Common arguments for a scan/reduce operation that accumulates per-wave
* values across an entire workgroup, while respecting the order of waves.
*
* The enable_* flags and result_* fields come in matching triples
* (reduce / exclusive / inclusive).
* NOTE(review): presumably each result_* field is only written when the
* matching enable_* flag is set; confirm against the implementation.
*/
struct ac_wg_scan {
/* Which kinds of result are requested. */
bool enable_reduce;
bool enable_exclusive;
bool enable_inclusive;
/* Operation used to combine values. */
nir_op op;
LLVMValueRef src; /* clobbered! Do not rely on this value after the scan has been built. */
/* Outputs, one per kind of result (see the enable_* flags above). */
LLVMValueRef result_reduce;
LLVMValueRef result_exclusive;
LLVMValueRef result_inclusive;
/* NOTE(review): purpose not documented in this header; see the implementation. */
LLVMValueRef extra;
/* NOTE(review): presumably the index of the current wave within the workgroup; confirm. */
LLVMValueRef waveidx;
LLVMValueRef numwaves; /* only needed for "reduce" operations */
/* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries.
 * NOTE(review): "value" presumably refers to src; confirm. */
LLVMValueRef scratch;
/* Lower bound on the number of entries that scratch must provide. */
unsigned maxwaves;
};
/* Builds the "top" part of the per-wave workgroup scan/reduce described by ws.
 * NOTE(review): presumably paired with ac_build_wg_wavescan_bottom; what each
 * half does is defined in the implementation, not in this header. */
void
ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds the "bottom" part of the per-wave workgroup scan/reduce described by ws.
 * NOTE(review): presumably must follow ac_build_wg_wavescan_top on the same ws;
 * confirm the required ordering against the implementation. */
void
ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds a per-wave workgroup scan/reduce described by ws in a single call.
 * NOTE(review): presumably equivalent to the _top and _bottom variants called
 * back to back; confirm against the implementation. */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds the "top" part of the workgroup scan/reduce described by ws.
 * NOTE(review): presumably paired with ac_build_wg_scan_bottom; how this differs
 * from the *_wavescan_* variants is defined in the implementation. */
void
ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds the "bottom" part of the workgroup scan/reduce described by ws.
 * NOTE(review): presumably must follow ac_build_wg_scan_top on the same ws;
 * confirm the required ordering against the implementation. */
void
ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds a workgroup scan/reduce described by ws in a single call.
 * NOTE(review): presumably equivalent to ac_build_wg_scan_top followed by
 * ac_build_wg_scan_bottom; confirm against the implementation. */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
/* Builds a swizzle of src and returns the result.
 * NOTE(review): lane0..lane3 presumably select, for each of the four lanes of
 * a quad, which lane's value it receives; confirm against the implementation. */
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3);