radeonsi: cull primitives with async compute for large draw calls
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Acked-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
@@ -10,6 +10,7 @@ C_SOURCES := \
 	si_build_pm4.h \
 	si_clear.c \
 	si_compute.c \
+	si_compute_prim_discard.c \
 	si_compute.h \
 	si_compute_blit.c \
 	si_cp_dma.c \
@@ -26,6 +26,7 @@ files_libradeonsi = files(
   'si_build_pm4.h',
   'si_clear.c',
   'si_compute.c',
+  'si_compute_prim_discard.c',
   'si_compute.h',
   'si_compute_blit.c',
   'si_cp_dma.c',
 1567  src/gallium/drivers/radeonsi/si_compute_prim_discard.c  (new file)
File diff suppressed because it is too large.
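Note: the new file above holds the whole culling implementation and its diff is not shown, so a brief orientation may help. Judging from the rest of this commit (the cs_cull_* key bits, the index_ring fields and the GDS counters added below), the shader runs the application's vertex-shader position calculation per triangle on an async compute queue, rejects triangles that cannot produce pixels (wrong facing, zero area, outside the view volume), and writes surviving indices into the index ring that the subsequent gfx draw consumes. A rough CPU-side model of the per-triangle test — all names and plain-C types here are invented for illustration, not the actual GCN shader:

#include <stdbool.h>

struct vec2 { float x, y; };

/* Signed doubled area of the screen-space triangle: the sign encodes
 * the winding; a zero area means the triangle is degenerate. */
static float twice_signed_area(struct vec2 a, struct vec2 b, struct vec2 c)
{
	return (b.x - a.x) * (c.y - a.y) - (c.x - a.x) * (b.y - a.y);
}

/* Returns true if the triangle survives; survivors get their three
 * indices appended to the output index ring consumed by the draw. */
static bool keep_triangle(struct vec2 p[3], bool cull_front, bool cull_back)
{
	float area = twice_signed_area(p[0], p[1], p[2]);

	if (area == 0.0f)
		return false;                /* degenerate: always culled */
	/* Assuming CCW-positive area == front-facing in this sketch. */
	if ((area > 0.0f) ? cull_front : cull_back)
		return false;                /* face culling */
	return true;                         /* kept */
}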
@@ -249,8 +249,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
 		sdst->TC_L2_dirty = true;

 	/* If it's not a framebuffer fast clear... */
-	if (coher == SI_COHERENCY_SHADER)
+	if (coher == SI_COHERENCY_SHADER) {
 		sctx->num_cp_dma_calls++;
+		si_prim_discard_signal_next_compute_ib_start(sctx);
+	}
 }

 /**
@@ -405,8 +407,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 		si_resource(dst)->TC_L2_dirty = true;

 	/* If it's not a prefetch or GDS copy... */
-	if (dst && src && (dst != src || dst_offset != src_offset))
+	if (dst && src && (dst != src || dst_offset != src_offset)) {
 		sctx->num_cp_dma_calls++;
+		si_prim_discard_signal_next_compute_ib_start(sctx);
+	}
 }

 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
@@ -337,6 +337,7 @@ struct si_log_chunk_cs {
 	struct si_saved_cs *cs;
 	bool dump_bo_list;
 	unsigned gfx_begin, gfx_end;
+	unsigned compute_begin, compute_end;
 };

 static void si_log_chunk_type_cs_destroy(void *data)
@@ -394,6 +395,7 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
 	struct si_context *ctx = chunk->ctx;
 	struct si_saved_cs *scs = chunk->cs;
 	int last_trace_id = -1;
+	int last_compute_trace_id = -1;

 	/* We are expecting that the ddebug pipe has already
 	 * waited for the context, so this buffer should be idle.
@@ -403,8 +405,10 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
 					NULL,
 					PIPE_TRANSFER_UNSYNCHRONIZED |
 					PIPE_TRANSFER_READ);
-	if (map)
+	if (map) {
 		last_trace_id = map[0];
+		last_compute_trace_id = map[1];
+	}

 	if (chunk->gfx_end != chunk->gfx_begin) {
 		if (chunk->gfx_begin == 0) {
@@ -432,6 +436,21 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
 		}
 	}

+	if (chunk->compute_end != chunk->compute_begin) {
+		assert(ctx->prim_discard_compute_cs);
+
+		if (scs->flushed) {
+			ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
+				    chunk->compute_end - chunk->compute_begin,
+				    &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class,
+				    NULL, NULL);
+		} else {
+			si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
+					    chunk->compute_end, &last_compute_trace_id,
+					    map ? 1 : 0, "Compute IB", ctx->chip_class);
+		}
+	}
+
 	if (chunk->dump_bo_list) {
 		fprintf(f, "Flushing. Time: ");
 		util_dump_ns(f, scs->time_flush);
@@ -452,9 +471,14 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,

 	struct si_saved_cs *scs = ctx->current_saved_cs;
 	unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
+	unsigned compute_cur = 0;
+
+	if (ctx->prim_discard_compute_cs)
+		compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;

 	if (!dump_bo_list &&
-	    gfx_cur == scs->gfx_last_dw)
+	    gfx_cur == scs->gfx_last_dw &&
+	    compute_cur == scs->compute_last_dw)
 		return;

 	struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@@ -467,6 +491,10 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
 	chunk->gfx_end = gfx_cur;
 	scs->gfx_last_dw = gfx_cur;

+	chunk->compute_begin = scs->compute_last_dw;
+	chunk->compute_end = compute_cur;
+	scs->compute_last_dw = compute_cur;
+
 	u_log_chunk(log, &si_log_chunk_type_cs, chunk);
 }
@@ -80,7 +80,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
 		   EOP_INT_SEL(int_sel) |
 		   EOP_DATA_SEL(data_sel);

-	if (ctx->chip_class >= GFX9) {
+	if (ctx->chip_class >= GFX9 || cs == ctx->prim_discard_compute_cs) {
 		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
 		 * counters) must immediately precede every timestamp event to
 		 * prevent a GPU hang on GFX9.
@@ -89,6 +89,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
 		 * always do ZPASS_DONE before the timestamp.
 		 */
 		if (ctx->chip_class == GFX9 &&
+		    cs != ctx->prim_discard_compute_cs &&
 		    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
 		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
 		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
@@ -105,14 +106,15 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
 					  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 		}

-		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
+		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
 		radeon_emit(cs, op);
 		radeon_emit(cs, sel);
 		radeon_emit(cs, va);		/* address lo */
 		radeon_emit(cs, va >> 32);	/* address hi */
 		radeon_emit(cs, new_fence);	/* immediate data lo */
 		radeon_emit(cs, 0); /* immediate data hi */
-		radeon_emit(cs, 0); /* unused */
+		if (ctx->chip_class >= GFX9)
+			radeon_emit(cs, 0); /* unused */
 	} else {
 		if (ctx->chip_class == GFX7 ||
 		    ctx->chip_class == GFX8) {
@@ -24,6 +24,8 @@
  */

 #include "si_pipe.h"
+#include "si_build_pm4.h"
+#include "sid.h"

 #include "util/os_time.h"
 #include "util/u_upload_mgr.h"
@@ -134,6 +136,24 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 	if (radeon_emitted(ctx->dma_cs, 0))
 		si_flush_dma_cs(ctx, flags, NULL);

+	if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
+		struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
+		si_compute_signal_gfx(ctx);
+
+		/* Make sure compute shaders are idle before leaving the IB, so that
+		 * the next IB doesn't overwrite GDS that might be in use. */
+		radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) |
+					EVENT_INDEX(4));
+
+		/* Save the GDS prim restart counter if needed. */
+		if (ctx->preserve_prim_restart_gds_at_flush) {
+			si_cp_copy_data(ctx, compute_cs,
+					COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
+					COPY_DATA_GDS, NULL, 4);
+		}
+	}
+
 	if (ctx->has_graphics) {
 		if (!LIST_IS_EMPTY(&ctx->active_queries))
 			si_suspend_queries(ctx);
@@ -168,6 +188,32 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		si_log_hw_flush(ctx);
 	}

+	if (si_compute_prim_discard_enabled(ctx)) {
+		/* The compute IB can start after the previous gfx IB starts. */
+		if (radeon_emitted(ctx->prim_discard_compute_cs, 0) &&
+		    ctx->last_gfx_fence) {
+			ctx->ws->cs_add_fence_dependency(ctx->gfx_cs,
+							 ctx->last_gfx_fence,
+							 RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
+							 RADEON_DEPENDENCY_START_FENCE);
+		}
+
+		/* Remember the last execution barrier. It's in the IB.
+		 * It will signal the start of the next compute IB.
+		 */
+		if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW &&
+		    ctx->last_pkt3_write_data) {
+			*ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
+			ctx->last_pkt3_write_data = NULL;
+
+			si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
+			ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
+			si_resource_reference(&ctx->barrier_buf, NULL);
+
+			ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
+		}
+	}
+
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
 	if (fence)
@@ -175,6 +221,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,

 	ctx->num_gfx_cs_flushes++;

+	if (si_compute_prim_discard_enabled(ctx)) {
+		/* Remember the last execution barrier, which is the last fence
+		 * in this case.
+		 */
+		if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+			ctx->last_pkt3_write_data = NULL;
+			si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
+			ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
+		}
+	}
+
 	/* Check VM faults if needed. */
 	if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
 		/* Use conservative timeout 800ms, after which we won't wait any
@@ -226,6 +283,16 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
 	if (ctx->is_debug)
 		si_begin_gfx_cs_debug(ctx);

+	if (ctx->gds) {
+		ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds,
+				       RADEON_USAGE_READWRITE, 0, 0);
+		if (ctx->gds_oa) {
+			ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa,
+					       RADEON_USAGE_READWRITE, 0, 0);
+		}
+	}
+
 	/* Always invalidate caches at the beginning of IBs, because external
 	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
 	 * buffers.
@@ -352,6 +419,19 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
 	ctx->last_num_tcs_input_cp = -1;
 	ctx->last_ls_hs_config = -1; /* impossible value */

+	ctx->prim_discard_compute_ib_initialized = false;
+
+	/* Compute-based primitive discard:
+	 * The index ring is divided into 2 halves. Switch between the halves
+	 * in the same fashion as doublebuffering.
+	 */
+	if (ctx->index_ring_base)
+		ctx->index_ring_base = 0;
+	else
+		ctx->index_ring_base = ctx->index_ring_size_per_ib;
+
+	ctx->index_ring_offset = 0;
+
 	if (has_clear_state) {
 		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
 		ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
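Note: the half-switch above is the entire lifetime management of the index ring. A minimal self-contained model of that bookkeeping — the struct and function names (index_ring_begin_ib, index_ring_alloc) are invented for this sketch. The ring flips halves on every new gfx IB because the previous IB's compute dispatches may still be writing compacted indices into the old half:

struct index_ring {
	unsigned base;        /* offset of the half used by the current IB */
	unsigned offset;      /* allocation cursor within that half */
	unsigned size_per_ib; /* size of one half */
};

/* Called at the beginning of each gfx IB (mirrors the hunk above). */
static void index_ring_begin_ib(struct index_ring *ring)
{
	/* Flip halves: new allocations must not overwrite output that the
	 * previous IB's culling dispatches may still be producing. */
	ring->base = ring->base ? 0 : ring->size_per_ib;
	ring->offset = 0;
}

/* Returns the ring offset for 'size' bytes of culled-index output,
 * or -1 if this IB's half is exhausted (the driver would then flush). */
static int index_ring_alloc(struct index_ring *ring, unsigned size)
{
	if (ring->offset + size > ring->size_per_ib)
		return -1;
	int r = ring->base + ring->offset;
	ring->offset += size;
	return r;
}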
@@ -80,6 +80,9 @@ static const struct debug_named_value debug_options[] = {
 	{ "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },

 	/* 3D engine options: */
+	{ "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
+	{ "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
+	{ "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
 	{ "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
 	{ "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
 	{ "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
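Note: these entries plug into radeonsi's existing named debug flags, so they should be selectable at runtime the same way as the other flags in this table, e.g. via AMD_DEBUG=pd (or the legacy R600_DEBUG variable) — that usage is an assumption based on how the surrounding flags are driven, not something shown in this diff. In code they are tested with the driver's usual DBG() mask pattern; a tiny helper (name invented here) showing the exact test the draw-time gate performs later in this diff:

static bool pd_forced_on(struct si_screen *sscreen)
{
	/* "pd"/"alwayspd" bypass the pipeline-statistics restriction in
	 * the si_draw_vbo gate below. */
	return sscreen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD));
}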
@@ -255,7 +258,13 @@ static void si_destroy_context(struct pipe_context *context)

 	sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
 	sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
+	sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
 	si_resource_reference(&sctx->eop_bug_scratch, NULL);
+	si_resource_reference(&sctx->index_ring, NULL);
+	si_resource_reference(&sctx->barrier_buf, NULL);
+	si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
+	pb_reference(&sctx->gds, NULL);
+	pb_reference(&sctx->gds_oa, NULL);

 	si_destroy_compiler(&sctx->compiler);
@@ -533,6 +542,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->blitter->skip_viewport_restore = true;

 		si_init_draw_functions(sctx);
+		si_initialize_prim_discard_tunables(sctx);
 	}

 	/* Initialize SDMA functions. */
@@ -554,7 +564,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,

 	if (sctx->chip_class >= GFX9) {
 		sctx->wait_mem_scratch = si_resource(
-			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
+			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
 		if (!sctx->wait_mem_scratch)
 			goto fail;
@@ -39,7 +39,7 @@
 #endif

 #define ATI_VENDOR_ID 0x1002
-
+#define SI_PRIM_DISCARD_DEBUG 0
 #define SI_NOT_QUERY 0xffffffff

 /* The base vertex and primitive restart can be any number, but we must pick
@@ -165,6 +165,9 @@ enum {
 	DBG_ZERO_VRAM,

 	/* 3D engine options: */
+	DBG_ALWAYS_PD,
+	DBG_PD,
+	DBG_NO_PD,
 	DBG_SWITCH_ON_EOP,
 	DBG_NO_OUT_OF_ORDER,
 	DBG_NO_DPBB,
@@ -209,6 +212,7 @@ enum si_coherency {
 };

 struct si_compute;
+struct si_shader_context;
 struct hash_table;
 struct u_suballocator;
@@ -675,6 +679,7 @@ struct si_signed_scissor {
 struct si_viewports {
 	struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
 	struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
+	bool y_inverted;
 };

 struct si_clip_state {
@@ -780,10 +785,12 @@ struct si_saved_cs {
 	struct pipe_reference reference;
 	struct si_context *ctx;
 	struct radeon_saved_cs gfx;
+	struct radeon_saved_cs compute;
 	struct si_resource *trace_buf;
 	unsigned trace_id;

 	unsigned gfx_last_dw;
+	unsigned compute_last_dw;
 	bool flushed;
 	int64_t time_flush;
 };
@@ -839,6 +846,7 @@ struct si_context {
 	struct pipe_debug_callback debug;
 	struct ac_llvm_compiler compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state fixed_func_tcs_shader;
+	/* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
 	struct si_resource *wait_mem_scratch;
 	unsigned wait_mem_number;
 	uint16_t prefetch_L2_mask;
@@ -859,6 +867,31 @@ struct si_context {
 	uint64_t vram;
 	uint64_t gtt;

+	/* Compute-based primitive discard. */
+	unsigned prim_discard_vertex_count_threshold;
+	struct pb_buffer *gds;
+	struct pb_buffer *gds_oa;
+	struct radeon_cmdbuf *prim_discard_compute_cs;
+	unsigned compute_gds_offset;
+	struct si_shader *compute_ib_last_shader;
+	uint32_t compute_rewind_va;
+	unsigned compute_num_prims_in_batch;
+	bool preserve_prim_restart_gds_at_flush;
+	/* index_ring is divided into 2 halves for doublebuffering. */
+	struct si_resource *index_ring;
+	unsigned index_ring_base; /* offset of a per-IB portion */
+	unsigned index_ring_offset; /* offset within a per-IB portion */
+	unsigned index_ring_size_per_ib; /* max available size per IB */
+	bool prim_discard_compute_ib_initialized;
+	/* For tracking the last execution barrier - it can be either
+	 * a WRITE_DATA packet or a fence. */
+	uint32_t *last_pkt3_write_data;
+	struct si_resource *barrier_buf;
+	unsigned barrier_buf_offset;
+	struct pipe_fence_handle *last_ib_barrier_fence;
+	struct si_resource *last_ib_barrier_buf;
+	unsigned last_ib_barrier_buf_offset;
+
 	/* Atoms (direct states). */
 	union si_state_atoms atoms;
 	unsigned dirty_atoms; /* mask */
@@ -895,6 +928,7 @@ struct si_context {
 	struct si_shader_ctx_state vs_shader;
 	struct si_shader_ctx_state tcs_shader;
 	struct si_shader_ctx_state tes_shader;
+	struct si_shader_ctx_state cs_prim_discard_state;
 	struct si_cs_shader_state cs_shader_state;

 	/* shader information */
@@ -963,6 +997,7 @@ struct si_context {
 	/* Emitted draw state. */
 	bool gs_tri_strip_adj_fix:1;
 	bool ls_vgpr_fix:1;
+	bool prim_discard_cs_instancing:1;
 	int last_index_size;
 	int last_base_vertex;
 	int last_start_instance;
@@ -1076,6 +1111,7 @@ struct si_context {
 	/* Maintain the list of active queries for pausing between IBs. */
 	int num_occlusion_queries;
 	int num_perfect_occlusion_queries;
+	int num_pipeline_stat_queries;
 	struct list_head active_queries;
 	unsigned num_cs_dw_queries_suspend;
@@ -1311,6 +1347,26 @@ unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
 					unsigned threadgroups_per_cu);
 void si_init_compute_functions(struct si_context *sctx);

+/* si_compute_prim_discard.c */
+enum si_prim_discard_outcome {
+	SI_PRIM_DISCARD_ENABLED,
+	SI_PRIM_DISCARD_DISABLED,
+	SI_PRIM_DISCARD_DRAW_SPLIT,
+};
+
+void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
+enum si_prim_discard_outcome
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
+				      const struct pipe_draw_info *info);
+void si_compute_signal_gfx(struct si_context *sctx);
+void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
+					  const struct pipe_draw_info *info,
+					  unsigned index_size,
+					  unsigned base_vertex,
+					  uint64_t input_indexbuf_va,
+					  unsigned input_indexbuf_max_elements);
+void si_initialize_prim_discard_tunables(struct si_context *sctx);
+
 /* si_perfcounters.c */
 void si_init_perfcounters(struct si_screen *screen);
 void si_destroy_perfcounters(struct si_screen *screen);
@@ -1748,6 +1804,11 @@ radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
 	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
 }

+static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
+{
+	return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
+}
+
 #define PRINT_ERR(fmt, args...) \
 	fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
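Note: the helper above means "enabled" is simply "the vertex-count threshold is not UINT_MAX". si_initialize_prim_discard_tunables() itself lives in the suppressed new file, so the following is only a guess at its shape — consistent with the helper and with the alwayspd/pd/nopd flags added earlier, but with an invented default value:

/* Hypothetical sketch; the real logic is in si_compute_prim_discard.c,
 * which is not shown in this diff. */
static void sketch_initialize_prim_discard_tunables(struct si_context *sctx)
{
	/* UINT_MAX == disabled, per si_compute_prim_discard_enabled(). */
	sctx->prim_discard_vertex_count_threshold = UINT_MAX;

	if (sctx->screen->debug_flags & DBG(NO_PD))
		return;                 /* "nopd": stay disabled */

	if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
		sctx->prim_discard_vertex_count_threshold = 0; /* every draw */
	else
		/* "pd" or heuristics: only draws above some large vertex
		 * count amortize the extra dispatch (value invented here). */
		sctx->prim_discard_vertex_count_threshold = 6000;
}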
@@ -850,6 +850,9 @@ static void si_query_hw_emit_start(struct si_context *sctx,
 	si_update_occlusion_query_state(sctx, query->b.type, 1);
 	si_update_prims_generated_query_state(sctx, query->b.type, 1);

+	if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+		sctx->num_pipeline_stat_queries++;
+
 	if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
 		si_need_gfx_cs_space(sctx);
@@ -954,6 +957,9 @@ static void si_query_hw_emit_stop(struct si_context *sctx,

 	si_update_occlusion_query_state(sctx, query->b.type, -1);
 	si_update_prims_generated_query_state(sctx, query->b.type, -1);
+
+	if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+		sctx->num_pipeline_stat_queries--;
 }

 static void emit_set_predicate(struct si_context *ctx,
@@ -25,6 +25,7 @@
 #include "util/u_memory.h"
 #include "util/u_string.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_strings.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
@@ -3548,6 +3549,33 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
 	FREE(outputs);
 }

+static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
+						  unsigned max_outputs,
+						  LLVMValueRef *addrs)
+{
+	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
+	LLVMValueRef pos[4] = {};
+
+	assert(info->num_outputs <= max_outputs);
+
+	for (unsigned i = 0; i < info->num_outputs; i++) {
+		if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
+			continue;
+
+		for (unsigned chan = 0; chan < 4; chan++)
+			pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+		break;
+	}
+	assert(pos[0] != NULL);
+
+	/* Return the position output. */
+	LLVMValueRef ret = ctx->return_value;
+	for (unsigned chan = 0; chan < 4; chan++)
+		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
+	ctx->return_value = ret;
+}
+
 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -4518,6 +4546,12 @@ static void create_function(struct si_shader_context *ctx)

 		/* VGPRs */
 		declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
+
+		/* Return values */
+		if (shader->key.opt.vs_as_prim_discard_cs) {
+			for (i = 0; i < 4; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		}
 		break;

 	case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@@ -5317,6 +5351,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo
 			return "Vertex Shader as ES";
 		else if (shader->key.as_ls)
 			return "Vertex Shader as LS";
+		else if (shader->key.opt.vs_as_prim_discard_cs)
+			return "Vertex Shader as Primitive Discard CS";
 		else
 			return "Vertex Shader as VS";
 	case PIPE_SHADER_TESS_CTRL:
@@ -5699,6 +5735,28 @@ static void si_dump_shader_key(unsigned processor, const struct si_shader *shade
 		fprintf(f, "  as_ls = %u\n", key->as_ls);
 		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
 			key->mono.u.vs_export_prim_id);
+		fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n",
+			key->opt.vs_as_prim_discard_cs);
+		fprintf(f, "  opt.cs_prim_type = %s\n",
+			tgsi_primitive_names[key->opt.cs_prim_type]);
+		fprintf(f, "  opt.cs_indexed = %u\n",
+			key->opt.cs_indexed);
+		fprintf(f, "  opt.cs_instancing = %u\n",
+			key->opt.cs_instancing);
+		fprintf(f, "  opt.cs_primitive_restart = %u\n",
+			key->opt.cs_primitive_restart);
+		fprintf(f, "  opt.cs_provoking_vertex_first = %u\n",
+			key->opt.cs_provoking_vertex_first);
+		fprintf(f, "  opt.cs_need_correct_orientation = %u\n",
+			key->opt.cs_need_correct_orientation);
+		fprintf(f, "  opt.cs_cull_front = %u\n",
+			key->opt.cs_cull_front);
+		fprintf(f, "  opt.cs_cull_back = %u\n",
+			key->opt.cs_cull_back);
+		fprintf(f, "  opt.cs_cull_z = %u\n",
+			key->opt.cs_cull_z);
+		fprintf(f, "  opt.cs_halfz_clip_space = %u\n",
+			key->opt.cs_halfz_clip_space);
 		break;

 	case PIPE_SHADER_TESS_CTRL:
@@ -5854,6 +5912,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
 		else if (shader->key.as_es)
 			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+		else if (shader->key.opt.vs_as_prim_discard_cs)
+			ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
 		else
 			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
 		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
@@ -6644,6 +6704,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,

 		si_build_wrapper_function(&ctx, parts + !need_prolog,
 					  1 + need_prolog, need_prolog, 0);
+
+		if (ctx.shader->key.opt.vs_as_prim_discard_cs)
+			si_build_prim_discard_compute_shader(&ctx);
 	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
 		if (sscreen->info.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
@@ -340,6 +340,7 @@ struct si_shader_selector {
 	unsigned type;
 	bool vs_needs_prolog;
 	bool force_correct_derivs_after_kill;
+	bool prim_discard_cs_allowed;
 	unsigned pa_cl_vs_out_cntl;
 	ubyte clipdist_mask;
 	ubyte culldist_mask;
@@ -554,6 +555,19 @@ struct si_shader_key {
 		 * possible, because it's in the "opt" group.
 		 */
 		unsigned prefer_mono:1;
+
+		/* Primitive discard compute shader. */
+		unsigned vs_as_prim_discard_cs:1;
+		unsigned cs_prim_type:4;
+		unsigned cs_indexed:1;
+		unsigned cs_instancing:1;
+		unsigned cs_primitive_restart:1;
+		unsigned cs_provoking_vertex_first:1;
+		unsigned cs_need_correct_orientation:1;
+		unsigned cs_cull_front:1;
+		unsigned cs_cull_back:1;
+		unsigned cs_cull_z:1;
+		unsigned cs_halfz_clip_space:1;
 	} opt;
 };
@@ -857,6 +857,15 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 		return NULL;
 	}

+	if (!state->front_ccw) {
+		rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
+		rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
+	} else {
+		rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
+		rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
+	}
+	rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
+	rs->provoking_vertex_first = state->flatshade_first;
 	rs->scissor_enable = state->scissor;
 	rs->clip_halfz = state->clip_halfz;
 	rs->two_side = state->light_twoside;
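Note: the swap above bakes the front_ccw convention into two plain cull bits, so the compute shader can work in one fixed winding convention and never has to consult front_ccw at runtime. The same mapping in isolation, as an invented helper with its truth table (not driver code):

#include <stdbool.h>

/*   front_ccw | requested cull ||  cull_front | cull_back
 *   ----------+----------------++-------------+----------
 *     false   |     FRONT      ||      1      |    0
 *     false   |     BACK       ||      0      |    1
 *     true    |     FRONT      ||      0      |    1
 *     true    |     BACK       ||      1      |    0
 */
static void effective_cull_bits(bool front_ccw, bool cull_gl_front,
				bool cull_gl_back,
				bool *cull_front, bool *cull_back)
{
	if (!front_ccw) {
		*cull_front = cull_gl_front;
		*cull_back = cull_gl_back;
	} else {
		/* Inverted winding: what GL calls "front" is a back face
		 * in the shader's fixed convention, hence the swap. */
		*cull_back = cull_gl_front;
		*cull_front = cull_gl_back;
	}
}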
@@ -87,6 +87,10 @@ struct si_state_rasterizer {
 	unsigned rasterizer_discard:1;
 	unsigned scissor_enable:1;
 	unsigned clip_halfz:1;
+	unsigned cull_front:1;
+	unsigned cull_back:1;
+	unsigned depth_clamp_any:1;
+	unsigned provoking_vertex_first:1;
 };

 struct si_dsa_stencil_ref_part {
@@ -600,6 +604,7 @@ void si_shader_selector_key_vs(struct si_context *sctx,
 			       struct si_vs_prolog_bits *prolog_key);

 /* si_state_draw.c */
+void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
 void si_emit_cache_flush(struct si_context *sctx);
 void si_trace_emit(struct si_context *sctx);
 void si_init_draw_functions(struct si_context *sctx);
@@ -29,6 +29,7 @@
 #include "util/u_log.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_prim.h"
+#include "util/u_suballoc.h"

 #include "ac_debug.h"
@@ -676,7 +677,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 struct pipe_resource *indexbuf,
 				 unsigned index_size,
 				 unsigned index_offset,
-				 unsigned instance_count)
+				 unsigned instance_count,
+				 bool dispatch_prim_discard_cs,
+				 unsigned original_index_size)
 {
 	struct pipe_draw_indirect_info *indirect = info->indirect;
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -735,13 +738,15 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			sctx->last_index_size = index_size;
 		}

-		index_max_size = (indexbuf->width0 - index_offset) /
-				  index_size;
-		index_va = si_resource(indexbuf)->gpu_address + index_offset;
+		if (original_index_size) {
+			index_max_size = (indexbuf->width0 - index_offset) /
+					  original_index_size;
+			index_va = si_resource(indexbuf)->gpu_address + index_offset;

-		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-					  si_resource(indexbuf),
-					  RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
+			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+						  si_resource(indexbuf),
+						  RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
+		}
 	} else {
 		/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
 		 * so the state must be re-emitted before the next indexed draw.
@@ -828,7 +833,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}

 	/* Base vertex and start instance. */
-	base_vertex = index_size ? info->index_bias : info->start;
+	base_vertex = original_index_size ? info->index_bias : info->start;

 	if (sctx->num_vs_blit_sgprs) {
 		/* Re-emit draw constants after we leave u_blitter. */
@@ -856,6 +861,17 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}

 	if (index_size) {
+		if (dispatch_prim_discard_cs) {
+			index_va += info->start * original_index_size;
+			index_max_size = MIN2(index_max_size, info->count);
+
+			si_dispatch_prim_discard_cs_and_draw(sctx, info,
+							     original_index_size,
+							     base_vertex,
+							     index_va, index_max_size);
+			return;
+		}
+
 		index_va += info->start * index_size;

 		radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
@@ -902,6 +918,33 @@ static void si_emit_surface_sync(struct si_context *sctx,
 	sctx->context_roll = true;
 }

+void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
+{
+	if (!si_compute_prim_discard_enabled(sctx))
+		return;
+
+	if (!sctx->barrier_buf) {
+		u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
+				     &sctx->barrier_buf_offset,
+				     (struct pipe_resource**)&sctx->barrier_buf);
+	}
+
+	/* Emit a placeholder to signal the next compute IB to start.
+	 * See si_compute_prim_discard.c for explanation.
+	 */
+	uint32_t signal = 1;
+	si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
+			 4, V_370_MEM, V_370_ME, &signal);
+
+	sctx->last_pkt3_write_data =
+		&sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
+
+	/* Only the last occurrence of WRITE_DATA will be executed.
+	 * The packet will be enabled in si_flush_gfx_cs.
+	 */
+	*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
+}
+
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
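Note: the function above pairs with the flush-time code earlier in this commit. Each gfx barrier records a WRITE_DATA packet that would raise the "start" flag the next compute IB waits on, then immediately disables it by rewriting its header to a NOP of the same length; si_flush_gfx_cs re-enables only the last such packet, so only the final barrier of the IB signals. A CPU-side toy model of this patch-in-place idea — all names and header encodings below are invented:

#include <stdbool.h>
#include <stdint.h>

#define TOY_HDR_NOP        0x10000000u /* arbitrary toy encodings; both  */
#define TOY_HDR_WRITE_DATA 0x37000000u /* "packets" are 2 dwords long    */

static uint32_t ib[256];       /* command buffer under construction */
static unsigned ib_num_dw;
static uint32_t *last_barrier; /* latest (disabled) barrier packet */

static void emit_barrier_placeholder(uint32_t signal_value)
{
	uint32_t *pkt = &ib[ib_num_dw];
	pkt[0] = TOY_HDR_NOP;   /* disabled: starts life as a NOP */
	pkt[1] = signal_value;  /* payload stays in place either way */
	ib_num_dw += 2;
	last_barrier = pkt;     /* remember it; may be re-enabled later */
}

static void flush_ib(bool start_next_compute_ib_now)
{
	/* Re-enable only the last placeholder, mirroring what
	 * si_flush_gfx_cs does with *ctx->last_pkt3_write_data. */
	if (start_next_compute_ib_now && last_barrier)
		last_barrier[0] = TOY_HDR_WRITE_DATA;
	last_barrier = NULL;
	/* ... submit ib[0..ib_num_dw) and reset ... */
	ib_num_dw = 0;
}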
@@ -919,8 +962,18 @@ void si_emit_cache_flush(struct si_context *sctx)
 	}

 	uint32_t cp_coher_cntl = 0;
-	uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
-					SI_CONTEXT_FLUSH_AND_INV_DB);
+	const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+					      SI_CONTEXT_FLUSH_AND_INV_DB);
+	const bool is_barrier = flush_cb_db ||
+				/* INV_ICACHE == beginning of gfx IB. Checking
+				 * INV_ICACHE fixes corruption for DeusExMD with
+				 * compute-based culling, but I don't know why.
+				 */
+				flags & (SI_CONTEXT_INV_ICACHE |
+					 SI_CONTEXT_PS_PARTIAL_FLUSH |
+					 SI_CONTEXT_VS_PARTIAL_FLUSH) ||
+				(flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
+				 sctx->compute_is_busy);

 	if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
 		sctx->num_cb_cache_flushes++;
@@ -1144,6 +1197,9 @@ void si_emit_cache_flush(struct si_context *sctx)
 	if (cp_coher_cntl)
 		si_emit_surface_sync(sctx, cp_coher_cntl);

+	if (is_barrier)
+		si_prim_discard_signal_next_compute_ib_start(sctx);
+
 	if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
@@ -1260,6 +1316,94 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 			   primitive_restart);
 }

+static bool
+si_all_vs_resources_read_only(struct si_context *sctx,
+			      struct pipe_resource *indexbuf)
+{
+	struct radeon_winsys *ws = sctx->ws;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	/* Index buffer. */
+	if (indexbuf &&
+	    ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
+					RADEON_USAGE_WRITE))
+		return false;
+
+	/* Vertex buffers. */
+	struct si_vertex_elements *velems = sctx->vertex_elements;
+	unsigned num_velems = velems->count;
+
+	for (unsigned i = 0; i < num_velems; i++) {
+		if (!((1 << i) & velems->first_vb_use_mask))
+			continue;
+
+		unsigned vb_index = velems->vertex_buffer_index[i];
+		struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
+		if (!res)
+			continue;
+
+		if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
+						RADEON_USAGE_WRITE))
+			return false;
+	}
+
+	/* Constant and shader buffers. */
+	struct si_descriptors *buffers =
+		&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
+	for (unsigned i = 0; i < buffers->num_active_slots; i++) {
+		unsigned index = buffers->first_active_slot + i;
+		struct pipe_resource *res =
+			sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
+		if (!res)
+			continue;
+
+		if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
+						RADEON_USAGE_WRITE))
+			return false;
+	}
+
+	/* Samplers. */
+	struct si_shader_selector *vs = sctx->vs_shader.cso;
+	if (vs->info.samplers_declared) {
+		unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
+
+		for (unsigned i = 0; i < num_samplers; i++) {
+			struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
+			if (!view)
+				continue;
+
+			if (ws->cs_is_buffer_referenced(cs,
+							si_resource(view->texture)->buf,
+							RADEON_USAGE_WRITE))
+				return false;
+		}
+	}
+
+	/* Images. */
+	if (vs->info.images_declared) {
+		unsigned num_images = util_last_bit(vs->info.images_declared);
+
+		for (unsigned i = 0; i < num_images; i++) {
+			struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
+			if (!res)
+				continue;
+
+			if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
+							RADEON_USAGE_WRITE))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static ALWAYS_INLINE bool pd_msg(const char *s)
+{
+	if (SI_PRIM_DISCARD_DEBUG)
+		printf("PD failed: %s\n", s);
+	return false;
+}
+
 static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
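Note on si_all_vs_resources_read_only() above: the compute queue may begin executing as soon as the previous gfx IB starts (the START_FENCE dependency added in si_flush_gfx_cs), i.e. before packets recorded earlier in the current gfx IB have run. So if any buffer the vertex shader reads still has a write queued in this gfx CS, the runahead culling dispatch could read stale data, and the draw must fall back to the normal gfx path. A self-contained toy model of that hazard check — everything here is invented for illustration:

#include <stdbool.h>
#include <stdint.h>

struct toy_buffer { uint32_t data; bool pending_gfx_write; };

static bool safe_to_cull_on_compute(const struct toy_buffer *vs_inputs,
				    unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		/* Mirrors ws->cs_is_buffer_referenced(..., RADEON_USAGE_WRITE):
		 * a write already queued in the current gfx CS means the
		 * runahead compute dispatch could read the buffer too early. */
		if (vs_inputs[i].pending_gfx_write)
			return false;
	}
	return true;
}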
@@ -1370,9 +1514,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 		}
 	}

-	if (sctx->do_update_shaders && !si_update_shaders(sctx))
-		goto return_cleanup;
-
 	if (index_size) {
 		/* Translate or upload, if needed. */
 		/* 8-bit indices are supported on GFX8. */
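Note on the pd_msg() helper introduced two hunks up: it always returns false, so wrapping each requirement of the big gate below as (condition || pd_msg("reason")) keeps the gate a single boolean expression while logging the first failed requirement when SI_PRIM_DISCARD_DEBUG is 1. The same idiom in isolation, with toy names and printing unconditionally for brevity:

#include <stdbool.h>
#include <stdio.h>

static bool fail(const char *why)
{
	fprintf(stderr, "culling disabled: %s\n", why);
	return false; /* always false: the && chain short-circuits here */
}

static bool can_cull(unsigned instances, bool uses_gs)
{
	/* Each clause either holds, or logs its reason and fails. */
	return (instances == 1 || fail("instancing")) &&
	       (!uses_gs || fail("geometry shader"));
}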
@@ -1425,6 +1566,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 		}
 	}

+	bool dispatch_prim_discard_cs = false;
+	bool prim_discard_cs_instancing = false;
+	unsigned original_index_size = index_size;
+	unsigned direct_count = 0;
+
 	if (info->indirect) {
 		struct pipe_draw_indirect_info *indirect = info->indirect;
@@ -1444,8 +1590,80 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 			si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
 		}
+	} else {
+		direct_count = info->count * instance_count;
 	}

+	/* Determine if we can use the primitive discard compute shader. */
+	if (si_compute_prim_discard_enabled(sctx) &&
+	    /* Multiply by 3 for strips and fans to get the vertex count as triangles. */
+	    direct_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3) >
+	    sctx->prim_discard_vertex_count_threshold &&
+	    (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
+	    (primitive_restart ?
+	     /* Supported prim types with primitive restart: */
+	     (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
+	     /* Disallow instancing with primitive restart: */
+	     (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
+	     /* Supported prim types without primitive restart + allow instancing: */
+	     (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
+			    (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+			    (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
+	     /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
+	     /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
+	     (instance_count == 1 ||
+	      (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
+	      pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
+	    (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
+	    (!sctx->render_cond || pd_msg("render condition")) &&
+	    /* Forced enablement ignores pipeline statistics queries. */
+	    (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
+	     (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
+	     pd_msg("pipestat or primgen query")) &&
+	    (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
+	    (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
+	    (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
+	    (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
+#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
+	    (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
+	    (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
+	    (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
+	    (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
+	    !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
+	    !sctx->vs_shader.cso->so.num_outputs &&
+#else
+	    (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
+#endif
+	    /* Check that all buffers are used for read only, because compute
+	     * dispatches can run ahead. */
+	    (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
+		switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
+		case SI_PRIM_DISCARD_ENABLED:
+			original_index_size = index_size;
+			prim_discard_cs_instancing = instance_count > 1;
+			dispatch_prim_discard_cs = true;
+
+			/* The compute shader changes/lowers the following: */
+			prim = PIPE_PRIM_TRIANGLES;
+			index_size = 4;
+			instance_count = 1;
+			primitive_restart = false;
+			break;
+		case SI_PRIM_DISCARD_DISABLED:
+			break;
+		case SI_PRIM_DISCARD_DRAW_SPLIT:
+			goto return_cleanup;
+		}
+	}
+
+	if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
+		sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
+		sctx->do_update_shaders = true;
+	}
+
+	if (sctx->do_update_shaders && !si_update_shaders(sctx))
+		goto return_cleanup;
+
 	si_need_gfx_cs_space(sctx);

 	if (sctx->bo_list_add_all_gfx_resources)
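Note: the first two clauses of the gate above form the size heuristic. direct_count is vertices times instances, and strip/fan counts are scaled by 3 so both compare against the threshold in triangle-list terms (strips and fans produce roughly one triangle per vertex, while a list needs 3 vertices per triangle). A worked example with an invented threshold value (the real default comes from si_initialize_prim_discard_tunables in the suppressed new file):

#include <stdbool.h>

static bool big_enough_for_culling(unsigned vertex_count,
				   unsigned instance_count,
				   bool is_triangle_list)
{
	const unsigned threshold = 6000; /* invented for illustration */
	unsigned direct_count = vertex_count * instance_count;

	/* x3 converts strip/fan counts into "triangle-list vertex" units. */
	return direct_count * (is_triangle_list ? 1 : 3) > threshold;
}
/* big_enough_for_culling(9000, 1, true)  -> 9000 > 6000:  culled path
 * big_enough_for_culling(1500, 1, false) -> 4500 <= 6000: gfx path  */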
@@ -1507,7 +1725,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 		sctx->dirty_atoms = 0;

 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
-				     instance_count);
+				     instance_count, dispatch_prim_discard_cs,
+				     original_index_size);
 		/* <-- CUs are busy here. */

 		/* Start prefetches after the draw has been started. Both will run
@@ -1527,7 +1746,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 			cik_emit_prefetch_L2(sctx, true);

 		if (!si_upload_graphics_shader_descriptors(sctx))
-			return;
+			goto return_cleanup;

 		si_emit_all_states(sctx, info, prim, instance_count,
 				   primitive_restart, masked_atoms);
@@ -1540,7 +1759,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 		sctx->dirty_atoms = 0;

 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
-				     instance_count);
+				     instance_count, dispatch_prim_discard_cs,
+				     original_index_size);

 		/* Prefetch the remaining shaders after the draw has been
 		 * started. */
@@ -82,6 +82,10 @@
  * Right half:  {1,3,5,7,9,11,13,15}
  */

+/* Important note: We have to use the standard DX positions, because
+ * the primitive discard compute shader relies on them.
+ */
+
 /* 1x MSAA */
 static const uint32_t sample_locs_1x =
 	FILL_SREG( 0, 0,   0, 0,   0, 0,   0, 0); /* S1, S2, S3 fields are not used by 1x */
@@ -1383,6 +1383,8 @@ void si_shader_selector_key_vs(struct si_context *sctx,

 	prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
 	prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
+	prolog_key->unpack_instance_id_from_vertex_id =
+		sctx->prim_discard_cs_instancing;

 	/* Prefer a monolithic shader to allow scheduling divisions around
 	 * VBO loads. */
@@ -1910,8 +1912,11 @@ current_not_ready:

 	/* Compile the main shader part if it doesn't exist. This can happen
 	 * if the initial guess was wrong.
+	 *
+	 * The prim discard CS doesn't need the main shader part.
 	 */
-	if (!is_pure_monolithic) {
+	if (!is_pure_monolithic &&
+	    !key->opt.vs_as_prim_discard_cs) {
 		bool ok;

 		/* Make sure the main shader part is present. This is needed
@@ -1962,9 +1967,10 @@ current_not_ready:
 		is_pure_monolithic ||
 		memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;

+	/* The prim discard CS is always optimized. */
 	shader->is_optimized =
-		!is_pure_monolithic &&
-		memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+		(!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
+		memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;

 	/* If it's an optimized shader, compile it asynchronously. */
 	if (shader->is_optimized && thread_index < 0) {
@@ -2312,6 +2318,15 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		sel->info.uses_kill &&
 		sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL);

+	sel->prim_discard_cs_allowed =
+		sel->type == PIPE_SHADER_VERTEX &&
+		!sel->info.uses_bindless_images &&
+		!sel->info.uses_bindless_samplers &&
+		!sel->info.writes_memory &&
+		!sel->info.writes_viewport_index &&
+		!sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
+		!sel->so.num_outputs;
+
 	/* Set which opcode uses which (i,j) pair. */
 	if (sel->info.uses_persp_opcode_interp_centroid)
 		sel->info.uses_persp_centroid = true;
@@ -381,6 +381,12 @@ static void si_set_viewport_states(struct pipe_context *pctx,
 		scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
 	}

+	if (start_slot == 0) {
+		ctx->viewports.y_inverted =
+			-state->scale[1] + state->translate[1] >
+			state->scale[1] + state->translate[1];
+	}
+
 	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
 	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);