radeonsi: cull primitives with async compute for large draw calls

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Acked-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Marek Olšák
2018-08-14 02:01:18 -04:00
parent 187f1c999f
commit c9b7a37b8f
18 changed files with 2124 additions and 28 deletions
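In short: for a sufficiently large draw, the bound vertex shader is compiled a second time as a compute shader (key.opt.vs_as_prim_discard_cs) that runs on an async compute queue, computes only the vertex positions, culls primitives, and appends the surviving 32-bit indices to a double-buffered index ring from which the graphics queue then draws. A minimal sketch of the draw-time hand-off, using names from the hunks below; draw_is_eligible is a hypothetical stand-in for the long eligibility check in si_draw_vbo:

    if (si_compute_prim_discard_enabled(sctx) &&   /* threshold != UINT_MAX */
        draw_is_eligible(sctx, info)) {            /* hypothetical helper */
        switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
        case SI_PRIM_DISCARD_ENABLED:
            dispatch_prim_discard_cs = true;
            prim = PIPE_PRIM_TRIANGLES;   /* strips/fans are lowered */
            index_size = 4;               /* the CS emits 32-bit indices */
            instance_count = 1;           /* InstanceID is packed into VertexID */
            primitive_restart = false;    /* restart is resolved while culling */
            break;
        case SI_PRIM_DISCARD_DISABLED:
            break;                        /* normal gfx draw path */
        case SI_PRIM_DISCARD_DRAW_SPLIT:
            goto return_cleanup;          /* issued as smaller draws instead */
        }
    }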

src/gallium/drivers/radeonsi/Makefile.sources

@@ -10,6 +10,7 @@ C_SOURCES := \
si_build_pm4.h \
si_clear.c \
si_compute.c \
si_compute_prim_discard.c \
si_compute.h \
si_compute_blit.c \
si_cp_dma.c \

src/gallium/drivers/radeonsi/meson.build

@@ -26,6 +26,7 @@ files_libradeonsi = files(
'si_build_pm4.h',
'si_clear.c',
'si_compute.c',
'si_compute_prim_discard.c',
'si_compute.h',
'si_compute_blit.c',
'si_cp_dma.c',

src/gallium/drivers/radeonsi/si_compute_prim_discard.c (new file; diff suppressed because it is too large)

src/gallium/drivers/radeonsi/si_cp_dma.c

@@ -249,8 +249,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
sdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
if (coher == SI_COHERENCY_SHADER)
if (coher == SI_COHERENCY_SHADER) {
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
/**
@@ -405,8 +407,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
si_resource(dst)->TC_L2_dirty = true;
/* If it's not a prefetch or GDS copy... */
if (dst && src && (dst != src || dst_offset != src_offset))
if (dst && src && (dst != src || dst_offset != src_offset)) {
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,

src/gallium/drivers/radeonsi/si_debug.c

@@ -337,6 +337,7 @@ struct si_log_chunk_cs {
struct si_saved_cs *cs;
bool dump_bo_list;
unsigned gfx_begin, gfx_end;
unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
@@ -394,6 +395,7 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
struct si_context *ctx = chunk->ctx;
struct si_saved_cs *scs = chunk->cs;
int last_trace_id = -1;
int last_compute_trace_id = -1;
/* We are expecting that the ddebug pipe has already
* waited for the context, so this buffer should be idle.
@@ -403,8 +405,10 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
NULL,
PIPE_TRANSFER_UNSYNCHRONIZED |
PIPE_TRANSFER_READ);
if (map)
if (map) {
last_trace_id = map[0];
last_compute_trace_id = map[1];
}
if (chunk->gfx_end != chunk->gfx_begin) {
if (chunk->gfx_begin == 0) {
@@ -432,6 +436,21 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
}
}
if (chunk->compute_end != chunk->compute_begin) {
assert(ctx->prim_discard_compute_cs);
if (scs->flushed) {
ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
chunk->compute_end - chunk->compute_begin,
&last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class,
NULL, NULL);
} else {
si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
chunk->compute_end, &last_compute_trace_id,
map ? 1 : 0, "Compute IB", ctx->chip_class);
}
}
if (chunk->dump_bo_list) {
fprintf(f, "Flushing. Time: ");
util_dump_ns(f, scs->time_flush);
@@ -452,9 +471,14 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
struct si_saved_cs *scs = ctx->current_saved_cs;
unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
unsigned compute_cur = 0;
if (ctx->prim_discard_compute_cs)
compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
if (!dump_bo_list &&
gfx_cur == scs->gfx_last_dw)
gfx_cur == scs->gfx_last_dw &&
compute_cur == scs->compute_last_dw)
return;
struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@@ -467,6 +491,10 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
chunk->gfx_end = gfx_cur;
scs->gfx_last_dw = gfx_cur;
chunk->compute_begin = scs->compute_last_dw;
chunk->compute_end = compute_cur;
scs->compute_last_dw = compute_cur;
u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}

src/gallium/drivers/radeonsi/si_fence.c

@@ -80,7 +80,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
EOP_INT_SEL(int_sel) |
EOP_DATA_SEL(data_sel);
if (ctx->chip_class >= GFX9) {
if (ctx->chip_class >= GFX9 || cs == ctx->prim_discard_compute_cs) {
/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
* counters) must immediately precede every timestamp event to
* prevent a GPU hang on GFX9.
@@ -89,6 +89,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
* always do ZPASS_DONE before the timestamp.
*/
if (ctx->chip_class == GFX9 &&
cs != ctx->prim_discard_compute_cs &&
query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
@@ -105,14 +106,15 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
}
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
radeon_emit(cs, op);
radeon_emit(cs, sel);
radeon_emit(cs, va); /* address lo */
radeon_emit(cs, va >> 32); /* address hi */
radeon_emit(cs, new_fence); /* immediate data lo */
radeon_emit(cs, 0); /* immediate data hi */
radeon_emit(cs, 0); /* unused */
if (ctx->chip_class >= GFX9)
radeon_emit(cs, 0); /* unused */
} else {
if (ctx->chip_class == GFX7 ||
ctx->chip_class == GFX8) {

src/gallium/drivers/radeonsi/si_gfx_cs.c

@@ -24,6 +24,8 @@
*/
#include "si_pipe.h"
#include "si_build_pm4.h"
#include "sid.h"
#include "util/os_time.h"
#include "util/u_upload_mgr.h"
@@ -134,6 +136,24 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
if (radeon_emitted(ctx->dma_cs, 0))
si_flush_dma_cs(ctx, flags, NULL);
if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
si_compute_signal_gfx(ctx);
/* Make sure compute shaders are idle before leaving the IB, so that
* the next IB doesn't overwrite GDS that might be in use. */
radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) |
EVENT_INDEX(4));
/* Save the GDS prim restart counter if needed. */
if (ctx->preserve_prim_restart_gds_at_flush) {
si_cp_copy_data(ctx, compute_cs,
COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
COPY_DATA_GDS, NULL, 4);
}
}
if (ctx->has_graphics) {
if (!LIST_IS_EMPTY(&ctx->active_queries))
si_suspend_queries(ctx);
@@ -168,6 +188,32 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
si_log_hw_flush(ctx);
}
if (si_compute_prim_discard_enabled(ctx)) {
/* The compute IB can start after the previous gfx IB starts. */
if (radeon_emitted(ctx->prim_discard_compute_cs, 0) &&
ctx->last_gfx_fence) {
ctx->ws->cs_add_fence_dependency(ctx->gfx_cs,
ctx->last_gfx_fence,
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
RADEON_DEPENDENCY_START_FENCE);
}
/* Remember the last execution barrier. It's in the IB.
* It will signal the start of the next compute IB.
*/
if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW &&
ctx->last_pkt3_write_data) {
*ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
si_resource_reference(&ctx->barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
}
}
/* Flush the CS. */
ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
if (fence)
@@ -175,6 +221,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
ctx->num_gfx_cs_flushes++;
if (si_compute_prim_discard_enabled(ctx)) {
/* Remember the last execution barrier, which is the last fence
* in this case.
*/
if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
}
}
/* Check VM faults if needed. */
if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
/* Use conservative timeout 800ms, after which we won't wait any
@@ -226,6 +283,16 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
if (ctx->is_debug)
si_begin_gfx_cs_debug(ctx);
if (ctx->gds) {
ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds,
RADEON_USAGE_READWRITE, 0, 0);
if (ctx->gds_oa) {
ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa,
RADEON_USAGE_READWRITE, 0, 0);
}
}
/* Always invalidate caches at the beginning of IBs, because external
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
* buffers.
@@ -352,6 +419,19 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
ctx->last_num_tcs_input_cp = -1;
ctx->last_ls_hs_config = -1; /* impossible value */
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:
* The index ring is divided into 2 halves. Switch between the halves
* in the same fashion as doublebuffering.
*/
if (ctx->index_ring_base)
ctx->index_ring_base = 0;
else
ctx->index_ring_base = ctx->index_ring_size_per_ib;
ctx->index_ring_offset = 0;
if (has_clear_state) {
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
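Note the index-ring handling a few lines up: alternate IBs place their culled indices in [0, index_ring_size_per_ib) and [index_ring_size_per_ib, 2 * index_ring_size_per_ib), so the compute shader can fill one half while draws from the previous IB may still be consuming the other.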

src/gallium/drivers/radeonsi/si_pipe.c

@@ -80,6 +80,9 @@ static const struct debug_named_value debug_options[] = {
{ "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
/* 3D engine options: */
{ "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
{ "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
{ "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
{ "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
{ "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
{ "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
@@ -255,7 +258,13 @@ static void si_destroy_context(struct pipe_context *context)
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->index_ring, NULL);
si_resource_reference(&sctx->barrier_buf, NULL);
si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
pb_reference(&sctx->gds, NULL);
pb_reference(&sctx->gds_oa, NULL);
si_destroy_compiler(&sctx->compiler);
@@ -533,6 +542,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
sctx->blitter->skip_viewport_restore = true;
si_init_draw_functions(sctx);
si_initialize_prim_discard_tunables(sctx);
}
/* Initialize SDMA functions. */
@@ -554,7 +564,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch = si_resource(
pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
if (!sctx->wait_mem_scratch)
goto fail;
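Growing wait_mem_scratch from 4 to 8 bytes matches the new layout documented in si_pipe.h below (offset 0: EOP flush number, offset 4: GDS prim restart counter) and the si_cp_copy_data call in si_flush_gfx_cs above, which snapshots the GDS counter into the second dword at flush time.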

src/gallium/drivers/radeonsi/si_pipe.h

@@ -39,7 +39,7 @@
#endif
#define ATI_VENDOR_ID 0x1002
#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
@@ -165,6 +165,9 @@ enum {
DBG_ZERO_VRAM,
/* 3D engine options: */
DBG_ALWAYS_PD,
DBG_PD,
DBG_NO_PD,
DBG_SWITCH_ON_EOP,
DBG_NO_OUT_OF_ORDER,
DBG_NO_DPBB,
@@ -209,6 +212,7 @@ enum si_coherency {
};
struct si_compute;
struct si_shader_context;
struct hash_table;
struct u_suballocator;
@@ -675,6 +679,7 @@ struct si_signed_scissor {
struct si_viewports {
struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
bool y_inverted;
};
struct si_clip_state {
@@ -780,10 +785,12 @@ struct si_saved_cs {
struct pipe_reference reference;
struct si_context *ctx;
struct radeon_saved_cs gfx;
struct radeon_saved_cs compute;
struct si_resource *trace_buf;
unsigned trace_id;
unsigned gfx_last_dw;
unsigned compute_last_dw;
bool flushed;
int64_t time_flush;
};
@@ -839,6 +846,7 @@ struct si_context {
struct pipe_debug_callback debug;
struct ac_llvm_compiler compiler; /* only non-threaded compilation */
struct si_shader_ctx_state fixed_func_tcs_shader;
/* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
struct si_resource *wait_mem_scratch;
unsigned wait_mem_number;
uint16_t prefetch_L2_mask;
@@ -859,6 +867,31 @@ struct si_context {
uint64_t vram;
uint64_t gtt;
/* Compute-based primitive discard. */
unsigned prim_discard_vertex_count_threshold;
struct pb_buffer *gds;
struct pb_buffer *gds_oa;
struct radeon_cmdbuf *prim_discard_compute_cs;
unsigned compute_gds_offset;
struct si_shader *compute_ib_last_shader;
uint32_t compute_rewind_va;
unsigned compute_num_prims_in_batch;
bool preserve_prim_restart_gds_at_flush;
/* index_ring is divided into 2 halves for doublebuffering. */
struct si_resource *index_ring;
unsigned index_ring_base; /* offset of a per-IB portion */
unsigned index_ring_offset; /* offset within a per-IB portion */
unsigned index_ring_size_per_ib; /* max available size per IB */
bool prim_discard_compute_ib_initialized;
/* For tracking the last execution barrier - it can be either
* a WRITE_DATA packet or a fence. */
uint32_t *last_pkt3_write_data;
struct si_resource *barrier_buf;
unsigned barrier_buf_offset;
struct pipe_fence_handle *last_ib_barrier_fence;
struct si_resource *last_ib_barrier_buf;
unsigned last_ib_barrier_buf_offset;
/* Atoms (direct states). */
union si_state_atoms atoms;
unsigned dirty_atoms; /* mask */
@@ -895,6 +928,7 @@ struct si_context {
struct si_shader_ctx_state vs_shader;
struct si_shader_ctx_state tcs_shader;
struct si_shader_ctx_state tes_shader;
struct si_shader_ctx_state cs_prim_discard_state;
struct si_cs_shader_state cs_shader_state;
/* shader information */
@@ -963,6 +997,7 @@ struct si_context {
/* Emitted draw state. */
bool gs_tri_strip_adj_fix:1;
bool ls_vgpr_fix:1;
bool prim_discard_cs_instancing:1;
int last_index_size;
int last_base_vertex;
int last_start_instance;
@@ -1076,6 +1111,7 @@ struct si_context {
/* Maintain the list of active queries for pausing between IBs. */
int num_occlusion_queries;
int num_perfect_occlusion_queries;
int num_pipeline_stat_queries;
struct list_head active_queries;
unsigned num_cs_dw_queries_suspend;
@@ -1311,6 +1347,26 @@ unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
unsigned threadgroups_per_cu);
void si_init_compute_functions(struct si_context *sctx);
/* si_compute_prim_discard.c */
enum si_prim_discard_outcome {
SI_PRIM_DISCARD_ENABLED,
SI_PRIM_DISCARD_DISABLED,
SI_PRIM_DISCARD_DRAW_SPLIT,
};
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
const struct pipe_draw_info *info);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
unsigned index_size,
unsigned base_vertex,
uint64_t input_indexbuf_va,
unsigned input_indexbuf_max_elements);
void si_initialize_prim_discard_tunables(struct si_context *sctx);
/* si_perfcounters.c */
void si_init_perfcounters(struct si_screen *screen);
void si_destroy_perfcounters(struct si_screen *screen);
@@ -1748,6 +1804,11 @@ radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
}
static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
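A threshold of UINT_MAX doubles as the off switch here; presumably si_initialize_prim_discard_tunables (defined in the suppressed si_compute_prim_discard.c) leaves the threshold at UINT_MAX when the feature is disabled or unsupported and lowers it otherwise, which keeps this test cheap on the hot draw path.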
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)

src/gallium/drivers/radeonsi/si_query.c

@@ -850,6 +850,9 @@ static void si_query_hw_emit_start(struct si_context *sctx,
si_update_occlusion_query_state(sctx, query->b.type, 1);
si_update_prims_generated_query_state(sctx, query->b.type, 1);
if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
sctx->num_pipeline_stat_queries++;
if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
si_need_gfx_cs_space(sctx);
@@ -954,6 +957,9 @@ static void si_query_hw_emit_stop(struct si_context *sctx,
si_update_occlusion_query_state(sctx, query->b.type, -1);
si_update_prims_generated_query_state(sctx, query->b.type, -1);
if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
sctx->num_pipeline_stat_queries--;
}
static void emit_set_predicate(struct si_context *ctx,

src/gallium/drivers/radeonsi/si_shader.c

@@ -25,6 +25,7 @@
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"
@@ -3548,6 +3549,33 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
FREE(outputs);
}
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
unsigned max_outputs,
LLVMValueRef *addrs)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
struct tgsi_shader_info *info = &ctx->shader->selector->info;
LLVMValueRef pos[4] = {};
assert(info->num_outputs <= max_outputs);
for (unsigned i = 0; i < info->num_outputs; i++) {
if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
continue;
for (unsigned chan = 0; chan < 4; chan++)
pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
break;
}
assert(pos[0] != NULL);
/* Return the position output. */
LLVMValueRef ret = ctx->return_value;
for (unsigned chan = 0; chan < 4; chan++)
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
ctx->return_value = ret;
}
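The epilogue above pairs with the four f32 return values that create_function declares below for vs_as_prim_discard_cs: the culling shader consumes nothing but the computed position, so every other vertex shader output is simply dropped.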
static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -4518,6 +4546,12 @@ static void create_function(struct si_shader_context *ctx)
/* VGPRs */
declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
for (i = 0; i < 4; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
}
break;
case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@@ -5317,6 +5351,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
else if (shader->key.opt.vs_as_prim_discard_cs)
return "Vertex Shader as Primitive Discard CS";
else
return "Vertex Shader as VS";
case PIPE_SHADER_TESS_CTRL:
@@ -5699,6 +5735,28 @@ static void si_dump_shader_key(unsigned processor, const struct si_shader *shade
fprintf(f, " as_ls = %u\n", key->as_ls);
fprintf(f, " mono.u.vs_export_prim_id = %u\n",
key->mono.u.vs_export_prim_id);
fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
key->opt.vs_as_prim_discard_cs);
fprintf(f, " opt.cs_prim_type = %s\n",
tgsi_primitive_names[key->opt.cs_prim_type]);
fprintf(f, " opt.cs_indexed = %u\n",
key->opt.cs_indexed);
fprintf(f, " opt.cs_instancing = %u\n",
key->opt.cs_instancing);
fprintf(f, " opt.cs_primitive_restart = %u\n",
key->opt.cs_primitive_restart);
fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
key->opt.cs_provoking_vertex_first);
fprintf(f, " opt.cs_need_correct_orientation = %u\n",
key->opt.cs_need_correct_orientation);
fprintf(f, " opt.cs_cull_front = %u\n",
key->opt.cs_cull_front);
fprintf(f, " opt.cs_cull_back = %u\n",
key->opt.cs_cull_back);
fprintf(f, " opt.cs_cull_z = %u\n",
key->opt.cs_cull_z);
fprintf(f, " opt.cs_halfz_clip_space = %u\n",
key->opt.cs_halfz_clip_space);
break;
case PIPE_SHADER_TESS_CTRL:
@@ -5854,6 +5912,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
else if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
else
ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
bld_base->emit_epilogue = si_tgsi_emit_epilogue;
@@ -6644,6 +6704,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
si_build_wrapper_function(&ctx, parts + !need_prolog,
1 + need_prolog, need_prolog, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;

src/gallium/drivers/radeonsi/si_shader.h

@@ -340,6 +340,7 @@ struct si_shader_selector {
unsigned type;
bool vs_needs_prolog;
bool force_correct_derivs_after_kill;
bool prim_discard_cs_allowed;
unsigned pa_cl_vs_out_cntl;
ubyte clipdist_mask;
ubyte culldist_mask;
@@ -554,6 +555,19 @@ struct si_shader_key {
* possible, because it's in the "opt" group.
*/
unsigned prefer_mono:1;
/* Primitive discard compute shader. */
unsigned vs_as_prim_discard_cs:1;
unsigned cs_prim_type:4;
unsigned cs_indexed:1;
unsigned cs_instancing:1;
unsigned cs_primitive_restart:1;
unsigned cs_provoking_vertex_first:1;
unsigned cs_need_correct_orientation:1;
unsigned cs_cull_front:1;
unsigned cs_cull_back:1;
unsigned cs_cull_z:1;
unsigned cs_halfz_clip_space:1;
} opt;
};

src/gallium/drivers/radeonsi/si_state.c

@@ -857,6 +857,15 @@ static void *si_create_rs_state(struct pipe_context *ctx,
return NULL;
}
if (!state->front_ccw) {
rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
} else {
rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
}
rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
rs->provoking_vertex_first = state->flatshade_first;
rs->scissor_enable = state->scissor;
rs->clip_halfz = state->clip_halfz;
rs->two_side = state->light_twoside;
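Normalizing front_ccw into plain cull_front/cull_back bits gives later code winding-independent culling flags; these presumably feed the new cs_cull_front/cs_cull_back bits added to the shader key in si_shader.h above, so the culling compute shader never needs to know the winding convention itself.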

src/gallium/drivers/radeonsi/si_state.h

@@ -87,6 +87,10 @@ struct si_state_rasterizer {
unsigned rasterizer_discard:1;
unsigned scissor_enable:1;
unsigned clip_halfz:1;
unsigned cull_front:1;
unsigned cull_back:1;
unsigned depth_clamp_any:1;
unsigned provoking_vertex_first:1;
};
struct si_dsa_stencil_ref_part {
@@ -600,6 +604,7 @@ void si_shader_selector_key_vs(struct si_context *sctx,
struct si_vs_prolog_bits *prolog_key);
/* si_state_draw.c */
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_emit_cache_flush(struct si_context *sctx);
void si_trace_emit(struct si_context *sctx);
void si_init_draw_functions(struct si_context *sctx);

src/gallium/drivers/radeonsi/si_state_draw.c

@@ -29,6 +29,7 @@
#include "util/u_log.h"
#include "util/u_upload_mgr.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "ac_debug.h"
@@ -676,7 +677,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
struct pipe_resource *indexbuf,
unsigned index_size,
unsigned index_offset,
unsigned instance_count)
unsigned instance_count,
bool dispatch_prim_discard_cs,
unsigned original_index_size)
{
struct pipe_draw_indirect_info *indirect = info->indirect;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -735,13 +738,15 @@ static void si_emit_draw_packets(struct si_context *sctx,
sctx->last_index_size = index_size;
}
index_max_size = (indexbuf->width0 - index_offset) /
index_size;
index_va = si_resource(indexbuf)->gpu_address + index_offset;
if (original_index_size) {
index_max_size = (indexbuf->width0 - index_offset) /
original_index_size;
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
si_resource(indexbuf),
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
si_resource(indexbuf),
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
}
} else {
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
@@ -828,7 +833,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
/* Base vertex and start instance. */
base_vertex = index_size ? info->index_bias : info->start;
base_vertex = original_index_size ? info->index_bias : info->start;
if (sctx->num_vs_blit_sgprs) {
/* Re-emit draw constants after we leave u_blitter. */
@@ -856,6 +861,17 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
if (index_size) {
if (dispatch_prim_discard_cs) {
index_va += info->start * original_index_size;
index_max_size = MIN2(index_max_size, info->count);
si_dispatch_prim_discard_cs_and_draw(sctx, info,
original_index_size,
base_vertex,
index_va, index_max_size);
return;
}
index_va += info->start * index_size;
radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
@@ -902,6 +918,33 @@ static void si_emit_surface_sync(struct si_context *sctx,
sctx->context_roll = true;
}
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
if (!si_compute_prim_discard_enabled(sctx))
return;
if (!sctx->barrier_buf) {
u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
&sctx->barrier_buf_offset,
(struct pipe_resource**)&sctx->barrier_buf);
}
/* Emit a placeholder to signal the next compute IB to start.
* See si_compute_prim_discard.c for explanation.
*/
uint32_t signal = 1;
si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
4, V_370_MEM, V_370_ME, &signal);
sctx->last_pkt3_write_data =
&sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
/* Only the last occurrence of WRITE_DATA will be executed.
* The packet will be enabled in si_flush_gfx_cs.
*/
*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
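Taken together with si_flush_gfx_cs above, the mechanism is: every potential barrier point in the gfx IB is emitted as a WRITE_DATA packet disguised as a NOP, and only the last one per IB is armed. PKT3(PKT3_NOP, 3, 0) has the same body size as PKT3(PKT3_WRITE_DATA, 3, 0), so the CP skips the four payload dwords as NOP padding until the header is patched. A rough timeline; the compute-side wait on barrier_buf lives in the suppressed si_compute_prim_discard.c, so that half is an assumption:

    /*
     * gfx IB N:       ... [NOP + WRITE_DATA body] ... [NOP + body] ... flush
     *                                                        |
     * at flush:       si_flush_gfx_cs patches only this last placeholder from
     *                 PKT3_NOP to PKT3_WRITE_DATA, storing signal = 1 into
     *                 barrier_buf; earlier placeholders stay NOPs.
     *
     * compute IB N+1: assumed to wait (WAIT_REG_MEM) on barrier_buf, so its
     *                 culling work cannot start before gfx IB N has executed
     *                 past its final barrier.
     */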
void si_emit_cache_flush(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -919,8 +962,18 @@ void si_emit_cache_flush(struct si_context *sctx)
}
uint32_t cp_coher_cntl = 0;
uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB);
const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB);
const bool is_barrier = flush_cb_db ||
/* INV_ICACHE == beginning of gfx IB. Checking
* INV_ICACHE fixes corruption for DeusExMD with
* compute-based culling, but I don't know why.
*/
flags & (SI_CONTEXT_INV_ICACHE |
SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_VS_PARTIAL_FLUSH) ||
(flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
sctx->compute_is_busy);
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
sctx->num_cb_cache_flushes++;
@@ -1144,6 +1197,9 @@ void si_emit_cache_flush(struct si_context *sctx)
if (cp_coher_cntl)
si_emit_surface_sync(sctx, cp_coher_cntl);
if (is_barrier)
si_prim_discard_signal_next_compute_ib_start(sctx);
if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
@@ -1260,6 +1316,94 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
primitive_restart);
}
static bool
si_all_vs_resources_read_only(struct si_context *sctx,
struct pipe_resource *indexbuf)
{
struct radeon_winsys *ws = sctx->ws;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
/* Index buffer. */
if (indexbuf &&
ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
RADEON_USAGE_WRITE))
return false;
/* Vertex buffers. */
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned num_velems = velems->count;
for (unsigned i = 0; i < num_velems; i++) {
if (!((1 << i) & velems->first_vb_use_mask))
continue;
unsigned vb_index = velems->vertex_buffer_index[i];
struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
/* Constant and shader buffers. */
struct si_descriptors *buffers =
&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
for (unsigned i = 0; i < buffers->num_active_slots; i++) {
unsigned index = buffers->first_active_slot + i;
struct pipe_resource *res =
sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
/* Samplers. */
struct si_shader_selector *vs = sctx->vs_shader.cso;
if (vs->info.samplers_declared) {
unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
for (unsigned i = 0; i < num_samplers; i++) {
struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
if (!view)
continue;
if (ws->cs_is_buffer_referenced(cs,
si_resource(view->texture)->buf,
RADEON_USAGE_WRITE))
return false;
}
}
/* Images. */
if (vs->info.images_declared) {
unsigned num_images = util_last_bit(vs->info.images_declared);
for (unsigned i = 0; i < num_images; i++) {
struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
}
return true;
}
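This walk exists because, as the comment in si_draw_vbo below puts it, compute dispatches can run ahead: the culling CS for this draw may execute before earlier gfx work in the same IB finishes, so any VS-visible buffer that is also referenced for write within the IB would turn into a read-after-write hazard. In that case the draw simply takes the normal graphics path.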
static ALWAYS_INLINE bool pd_msg(const char *s)
{
if (SI_PRIM_DISCARD_DEBUG)
printf("PD failed: %s\n", s);
return false;
}
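Note the idiom this enables in the big condition below: pd_msg always returns false, so each (condition || pd_msg("reason")) clause fails the whole check while printing the first reason that did not hold, and only when SI_PRIM_DISCARD_DEBUG is set; with the default of 0, the ALWAYS_INLINE body collapses to "return false" and release builds pay nothing for the logging.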
static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
@@ -1370,9 +1514,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
}
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
goto return_cleanup;
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on GFX8. */
@@ -1425,6 +1566,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
}
}
bool dispatch_prim_discard_cs = false;
bool prim_discard_cs_instancing = false;
unsigned original_index_size = index_size;
unsigned direct_count = 0;
if (info->indirect) {
struct pipe_draw_indirect_info *indirect = info->indirect;
@@ -1444,8 +1590,80 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
}
}
} else {
direct_count = info->count * instance_count;
}
/* Determine if we can use the primitive discard compute shader. */
if (si_compute_prim_discard_enabled(sctx) &&
/* Multiply by 3 for strips and fans to get the vertex count as triangles. */
direct_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3) >
sctx->prim_discard_vertex_count_threshold &&
(!info->count_from_stream_output || pd_msg("draw_opaque")) &&
(primitive_restart ?
/* Supported prim types with primitive restart: */
(prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
/* Disallow instancing with primitive restart: */
(instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
/* Supported prim types without primitive restart + allow instancing: */
(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
(1 << PIPE_PRIM_TRIANGLE_STRIP) |
(1 << PIPE_PRIM_TRIANGLE_FAN)) &&
/* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
/* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
(instance_count == 1 ||
(instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
(info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
(!sctx->render_cond || pd_msg("render condition")) &&
/* Forced enablement ignores pipeline statistics queries. */
(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
pd_msg("pipestat or primgen query")) &&
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
(!sctx->tes_shader.cso || pd_msg("uses tess")) &&
(!sctx->gs_shader.cso || pd_msg("uses GS")) &&
(!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
(!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
(!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
(!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
(!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
!sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
!sctx->vs_shader.cso->so.num_outputs &&
#else
(sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
#endif
/* Check that all buffers are used for read only, because compute
* dispatches can run ahead. */
(si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
case SI_PRIM_DISCARD_ENABLED:
original_index_size = index_size;
prim_discard_cs_instancing = instance_count > 1;
dispatch_prim_discard_cs = true;
/* The compute shader changes/lowers the following: */
prim = PIPE_PRIM_TRIANGLES;
index_size = 4;
instance_count = 1;
primitive_restart = false;
break;
case SI_PRIM_DISCARD_DISABLED:
break;
case SI_PRIM_DISCARD_DRAW_SPLIT:
goto return_cleanup;
}
}
if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
sctx->do_update_shaders = true;
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
goto return_cleanup;
si_need_gfx_cs_space(sctx);
if (sctx->bo_list_add_all_gfx_resources)
@@ -1507,7 +1725,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
instance_count);
instance_count, dispatch_prim_discard_cs,
original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
@@ -1527,7 +1746,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
cik_emit_prefetch_L2(sctx, true);
if (!si_upload_graphics_shader_descriptors(sctx))
return;
goto return_cleanup;
si_emit_all_states(sctx, info, prim, instance_count,
primitive_restart, masked_atoms);
@@ -1540,7 +1759,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
instance_count);
instance_count, dispatch_prim_discard_cs,
original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */

src/gallium/drivers/radeonsi/si_state_msaa.c

@@ -82,6 +82,10 @@
* Right half: {1,3,5,7,9,11,13,15}
*/
/* Important note: We have to use the standard DX positions, because
* the primitive discard compute shader relies on them.
*/
/* 1x MSAA */
static const uint32_t sample_locs_1x =
FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */

src/gallium/drivers/radeonsi/si_state_shaders.c

@@ -1383,6 +1383,8 @@ void si_shader_selector_key_vs(struct si_context *sctx,
prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
prolog_key->unpack_instance_id_from_vertex_id =
sctx->prim_discard_cs_instancing;
/* Prefer a monolithic shader to allow scheduling divisions around
* VBO loads. */
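This connects to the instancing limits in si_draw_vbo: on the culling path, InstanceID is packed into VertexID (which is why instancing there requires 16-bit indices and at most USHRT_MAX instances), and the VS prolog has to unpack the two values again.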
@@ -1910,8 +1912,11 @@ current_not_ready:
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong.
*
* The prim discard CS doesn't need the main shader part.
*/
if (!is_pure_monolithic) {
if (!is_pure_monolithic &&
!key->opt.vs_as_prim_discard_cs) {
bool ok;
/* Make sure the main shader part is present. This is needed
@@ -1962,9 +1967,10 @@ current_not_ready:
is_pure_monolithic ||
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* The prim discard CS is always optimized. */
shader->is_optimized =
!is_pure_monolithic &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
(!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* If it's an optimized shader, compile it asynchronously. */
if (shader->is_optimized && thread_index < 0) {
@@ -2312,6 +2318,15 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->info.uses_kill &&
sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL);
sel->prim_discard_cs_allowed =
sel->type == PIPE_SHADER_VERTEX &&
!sel->info.uses_bindless_images &&
!sel->info.uses_bindless_samplers &&
!sel->info.writes_memory &&
!sel->info.writes_viewport_index &&
!sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
!sel->so.num_outputs;
/* Set which opcode uses which (i,j) pair. */
if (sel->info.uses_persp_opcode_interp_centroid)
sel->info.uses_persp_centroid = true;

src/gallium/drivers/radeonsi/si_state_viewport.c

@@ -381,6 +381,12 @@ static void si_set_viewport_states(struct pipe_context *pctx,
scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
}
if (start_slot == 0) {
ctx->viewports.y_inverted =
-state->scale[1] + state->translate[1] >
state->scale[1] + state->translate[1];
}
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);