radeonsi: cull primitives with async compute for large draw calls

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Acked-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Marek Olšák
2018-08-14 02:01:18 -04:00
parent 187f1c999f
commit c9b7a37b8f
18 changed files with 2124 additions and 28 deletions
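In short: for a sufficiently large draw, the bound vertex shader is compiled a second time as a compute shader (key.opt.vs_as_prim_discard_cs) that runs on an async compute queue, computes only the vertex positions, culls primitives, and appends the surviving 32-bit indices to a double-buffered index ring from which the graphics queue then draws. A minimal sketch of the draw-time hand-off, using names from the hunks below; draw_is_eligible is a hypothetical stand-in for the long eligibility check in si_draw_vbo:

    if (si_compute_prim_discard_enabled(sctx) &&   /* threshold != UINT_MAX */
        draw_is_eligible(sctx, info)) {            /* hypothetical helper */
        switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
        case SI_PRIM_DISCARD_ENABLED:
            dispatch_prim_discard_cs = true;
            prim = PIPE_PRIM_TRIANGLES;   /* strips/fans are lowered */
            index_size = 4;               /* the CS emits 32-bit indices */
            instance_count = 1;           /* InstanceID is packed into VertexID */
            primitive_restart = false;    /* restart is resolved while culling */
            break;
        case SI_PRIM_DISCARD_DISABLED:
            break;                        /* normal gfx draw path */
        case SI_PRIM_DISCARD_DRAW_SPLIT:
            goto return_cleanup;          /* issued as smaller draws instead */
        }
    }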

src/gallium/drivers/radeonsi/Makefile.sources

@@ -10,6 +10,7 @@ C_SOURCES := \
si_build_pm4.h \
si_clear.c \
si_compute.c \
si_compute_prim_discard.c \
si_compute.h \
si_compute_blit.c \
si_cp_dma.c \

src/gallium/drivers/radeonsi/meson.build

@@ -26,6 +26,7 @@ files_libradeonsi = files(
'si_build_pm4.h',
'si_clear.c',
'si_compute.c',
'si_compute_prim_discard.c',
'si_compute.h',
'si_compute_blit.c',
'si_cp_dma.c',

src/gallium/drivers/radeonsi/si_compute_prim_discard.c (new file; diff suppressed because it is too large)

src/gallium/drivers/radeonsi/si_cp_dma.c

@@ -249,8 +249,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
sdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
if (coher == SI_COHERENCY_SHADER)
if (coher == SI_COHERENCY_SHADER) {
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
/**
@@ -405,8 +407,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
si_resource(dst)->TC_L2_dirty = true;
/* If it's not a prefetch or GDS copy... */
if (dst && src && (dst != src || dst_offset != src_offset))
if (dst && src && (dst != src || dst_offset != src_offset)) {
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,

src/gallium/drivers/radeonsi/si_debug.c

@@ -337,6 +337,7 @@ struct si_log_chunk_cs {
struct si_saved_cs *cs;
bool dump_bo_list;
unsigned gfx_begin, gfx_end;
unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
@@ -394,6 +395,7 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
struct si_context *ctx = chunk->ctx;
struct si_saved_cs *scs = chunk->cs;
int last_trace_id = -1;
int last_compute_trace_id = -1;
/* We are expecting that the ddebug pipe has already
* waited for the context, so this buffer should be idle.
@@ -403,8 +405,10 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
NULL,
PIPE_TRANSFER_UNSYNCHRONIZED |
PIPE_TRANSFER_READ);
if (map)
if (map) {
last_trace_id = map[0];
last_compute_trace_id = map[1];
}
if (chunk->gfx_end != chunk->gfx_begin) {
if (chunk->gfx_begin == 0) {
@@ -432,6 +436,21 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
}
}
if (chunk->compute_end != chunk->compute_begin) {
assert(ctx->prim_discard_compute_cs);
if (scs->flushed) {
ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
chunk->compute_end - chunk->compute_begin,
&last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class,
NULL, NULL);
} else {
si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
chunk->compute_end, &last_compute_trace_id,
map ? 1 : 0, "Compute IB", ctx->chip_class);
}
}
if (chunk->dump_bo_list) {
fprintf(f, "Flushing. Time: ");
util_dump_ns(f, scs->time_flush);
@@ -452,9 +471,14 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
struct si_saved_cs *scs = ctx->current_saved_cs;
unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
unsigned compute_cur = 0;
if (ctx->prim_discard_compute_cs)
compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
if (!dump_bo_list &&
gfx_cur == scs->gfx_last_dw)
gfx_cur == scs->gfx_last_dw &&
compute_cur == scs->compute_last_dw)
return;
struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@@ -467,6 +491,10 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
chunk->gfx_end = gfx_cur;
scs->gfx_last_dw = gfx_cur;
chunk->compute_begin = scs->compute_last_dw;
chunk->compute_end = compute_cur;
scs->compute_last_dw = compute_cur;
u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}

src/gallium/drivers/radeonsi/si_fence.c

@@ -80,7 +80,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
EOP_INT_SEL(int_sel) |
EOP_DATA_SEL(data_sel);
if (ctx->chip_class >= GFX9) {
if (ctx->chip_class >= GFX9 || cs == ctx->prim_discard_compute_cs) {
/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
* counters) must immediately precede every timestamp event to
* prevent a GPU hang on GFX9.
@@ -89,6 +89,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
* always do ZPASS_DONE before the timestamp.
*/
if (ctx->chip_class == GFX9 &&
cs != ctx->prim_discard_compute_cs &&
query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
@@ -105,14 +106,15 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
}
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
radeon_emit(cs, op);
radeon_emit(cs, sel);
radeon_emit(cs, va); /* address lo */
radeon_emit(cs, va >> 32); /* address hi */
radeon_emit(cs, new_fence); /* immediate data lo */
radeon_emit(cs, 0); /* immediate data hi */
radeon_emit(cs, 0); /* unused */
if (ctx->chip_class >= GFX9)
radeon_emit(cs, 0); /* unused */
} else {
if (ctx->chip_class == GFX7 ||
ctx->chip_class == GFX8) {

src/gallium/drivers/radeonsi/si_gfx_cs.c

@@ -24,6 +24,8 @@
*/
#include "si_pipe.h"
#include "si_build_pm4.h"
#include "sid.h"
#include "util/os_time.h"
#include "util/u_upload_mgr.h"
@@ -134,6 +136,24 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
if (radeon_emitted(ctx->dma_cs, 0))
si_flush_dma_cs(ctx, flags, NULL);
if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
si_compute_signal_gfx(ctx);
/* Make sure compute shaders are idle before leaving the IB, so that
* the next IB doesn't overwrite GDS that might be in use. */
radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) |
EVENT_INDEX(4));
/* Save the GDS prim restart counter if needed. */
if (ctx->preserve_prim_restart_gds_at_flush) {
si_cp_copy_data(ctx, compute_cs,
COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
COPY_DATA_GDS, NULL, 4);
}
}
if (ctx->has_graphics) {
if (!LIST_IS_EMPTY(&ctx->active_queries))
si_suspend_queries(ctx);
@@ -168,6 +188,32 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
si_log_hw_flush(ctx);
}
if (si_compute_prim_discard_enabled(ctx)) {
/* The compute IB can start after the previous gfx IB starts. */
if (radeon_emitted(ctx->prim_discard_compute_cs, 0) &&
ctx->last_gfx_fence) {
ctx->ws->cs_add_fence_dependency(ctx->gfx_cs,
ctx->last_gfx_fence,
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
RADEON_DEPENDENCY_START_FENCE);
}
/* Remember the last execution barrier. It's in the IB.
* It will signal the start of the next compute IB.
*/
if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW &&
ctx->last_pkt3_write_data) {
*ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
si_resource_reference(&ctx->barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
}
}
/* Flush the CS. */
ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
if (fence)
@@ -175,6 +221,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
ctx->num_gfx_cs_flushes++;
if (si_compute_prim_discard_enabled(ctx)) {
/* Remember the last execution barrier, which is the last fence
* in this case.
*/
if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
}
}
/* Check VM faults if needed. */
if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
/* Use conservative timeout 800ms, after which we won't wait any
@@ -226,6 +283,16 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
if (ctx->is_debug)
si_begin_gfx_cs_debug(ctx);
if (ctx->gds) {
ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds,
RADEON_USAGE_READWRITE, 0, 0);
if (ctx->gds_oa) {
ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa,
RADEON_USAGE_READWRITE, 0, 0);
}
}
/* Always invalidate caches at the beginning of IBs, because external
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
* buffers.
@@ -352,6 +419,19 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
ctx->last_num_tcs_input_cp = -1;
ctx->last_ls_hs_config = -1; /* impossible value */
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:
* The index ring is divided into 2 halves. Switch between the halves
* in the same fashion as doublebuffering.
*/
if (ctx->index_ring_base)
ctx->index_ring_base = 0;
else
ctx->index_ring_base = ctx->index_ring_size_per_ib;
ctx->index_ring_offset = 0;
if (has_clear_state) {
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
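Note the index-ring handling a few lines up: alternate IBs place their culled indices in [0, index_ring_size_per_ib) and [index_ring_size_per_ib, 2 * index_ring_size_per_ib), so the compute shader can fill one half while draws from the previous IB may still be consuming the other.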

src/gallium/drivers/radeonsi/si_pipe.c

@@ -80,6 +80,9 @@ static const struct debug_named_value debug_options[] = {
{ "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
/* 3D engine options: */
{ "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
{ "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
{ "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
{ "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
{ "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
{ "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
@@ -255,7 +258,13 @@ static void si_destroy_context(struct pipe_context *context)
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->index_ring, NULL);
si_resource_reference(&sctx->barrier_buf, NULL);
si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
pb_reference(&sctx->gds, NULL);
pb_reference(&sctx->gds_oa, NULL);
si_destroy_compiler(&sctx->compiler);
@@ -533,6 +542,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
sctx->blitter->skip_viewport_restore = true;
si_init_draw_functions(sctx);
si_initialize_prim_discard_tunables(sctx);
}
/* Initialize SDMA functions. */
@@ -554,7 +564,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch = si_resource(
pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
if (!sctx->wait_mem_scratch)
goto fail;
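Growing wait_mem_scratch from 4 to 8 bytes matches the new layout documented in si_pipe.h below (offset 0: EOP flush number, offset 4: GDS prim restart counter) and the si_cp_copy_data call in si_flush_gfx_cs above, which snapshots the GDS counter into the second dword at flush time.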

src/gallium/drivers/radeonsi/si_pipe.h

@@ -39,7 +39,7 @@
#endif
#define ATI_VENDOR_ID 0x1002
#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
@@ -165,6 +165,9 @@ enum {
DBG_ZERO_VRAM,
/* 3D engine options: */
DBG_ALWAYS_PD,
DBG_PD,
DBG_NO_PD,
DBG_SWITCH_ON_EOP,
DBG_NO_OUT_OF_ORDER,
DBG_NO_DPBB,
@@ -209,6 +212,7 @@ enum si_coherency {
};
struct si_compute;
struct si_shader_context;
struct hash_table;
struct u_suballocator;
@@ -675,6 +679,7 @@ struct si_signed_scissor {
struct si_viewports {
struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
bool y_inverted;
};
struct si_clip_state {
@@ -780,10 +785,12 @@ struct si_saved_cs {
struct pipe_reference reference;
struct si_context *ctx;
struct radeon_saved_cs gfx;
struct radeon_saved_cs compute;
struct si_resource *trace_buf;
unsigned trace_id;
unsigned gfx_last_dw;
unsigned compute_last_dw;
bool flushed;
int64_t time_flush;
};
@@ -839,6 +846,7 @@ struct si_context {
struct pipe_debug_callback debug;
struct ac_llvm_compiler compiler; /* only non-threaded compilation */
struct si_shader_ctx_state fixed_func_tcs_shader;
/* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
struct si_resource *wait_mem_scratch;
unsigned wait_mem_number;
uint16_t prefetch_L2_mask;
@@ -859,6 +867,31 @@ struct si_context {
uint64_t vram;
uint64_t gtt;
/* Compute-based primitive discard. */
unsigned prim_discard_vertex_count_threshold;
struct pb_buffer *gds;
struct pb_buffer *gds_oa;
struct radeon_cmdbuf *prim_discard_compute_cs;
unsigned compute_gds_offset;
struct si_shader *compute_ib_last_shader;
uint32_t compute_rewind_va;
unsigned compute_num_prims_in_batch;
bool preserve_prim_restart_gds_at_flush;
/* index_ring is divided into 2 halves for doublebuffering. */
struct si_resource *index_ring;
unsigned index_ring_base; /* offset of a per-IB portion */
unsigned index_ring_offset; /* offset within a per-IB portion */
unsigned index_ring_size_per_ib; /* max available size per IB */
bool prim_discard_compute_ib_initialized;
/* For tracking the last execution barrier - it can be either
* a WRITE_DATA packet or a fence. */
uint32_t *last_pkt3_write_data;
struct si_resource *barrier_buf;
unsigned barrier_buf_offset;
struct pipe_fence_handle *last_ib_barrier_fence;
struct si_resource *last_ib_barrier_buf;
unsigned last_ib_barrier_buf_offset;
/* Atoms (direct states). */
union si_state_atoms atoms;
unsigned dirty_atoms; /* mask */
@@ -895,6 +928,7 @@ struct si_context {
struct si_shader_ctx_state vs_shader;
struct si_shader_ctx_state tcs_shader;
struct si_shader_ctx_state tes_shader;
struct si_shader_ctx_state cs_prim_discard_state;
struct si_cs_shader_state cs_shader_state;
/* shader information */
@@ -963,6 +997,7 @@ struct si_context {
/* Emitted draw state. */
bool gs_tri_strip_adj_fix:1;
bool ls_vgpr_fix:1;
bool prim_discard_cs_instancing:1;
int last_index_size;
int last_base_vertex;
int last_start_instance;
@@ -1076,6 +1111,7 @@ struct si_context {
/* Maintain the list of active queries for pausing between IBs. */
int num_occlusion_queries;
int num_perfect_occlusion_queries;
int num_pipeline_stat_queries;
struct list_head active_queries;
unsigned num_cs_dw_queries_suspend;
@@ -1311,6 +1347,26 @@ unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
unsigned threadgroups_per_cu);
void si_init_compute_functions(struct si_context *sctx);
/* si_compute_prim_discard.c */
enum si_prim_discard_outcome {
SI_PRIM_DISCARD_ENABLED,
SI_PRIM_DISCARD_DISABLED,
SI_PRIM_DISCARD_DRAW_SPLIT,
};
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
const struct pipe_draw_info *info);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
unsigned index_size,
unsigned base_vertex,
uint64_t input_indexbuf_va,
unsigned input_indexbuf_max_elements);
void si_initialize_prim_discard_tunables(struct si_context *sctx);
/* si_perfcounters.c */
void si_init_perfcounters(struct si_screen *screen);
void si_destroy_perfcounters(struct si_screen *screen);
@@ -1748,6 +1804,11 @@ radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
}
static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
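A threshold of UINT_MAX doubles as the off switch here; presumably si_initialize_prim_discard_tunables (defined in the suppressed si_compute_prim_discard.c) leaves the threshold at UINT_MAX when the feature is disabled or unsupported and lowers it otherwise, which keeps this test cheap on the hot draw path.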
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)

src/gallium/drivers/radeonsi/si_query.c

@@ -850,6 +850,9 @@ static void si_query_hw_emit_start(struct si_context *sctx,
si_update_occlusion_query_state(sctx, query->b.type, 1);
si_update_prims_generated_query_state(sctx, query->b.type, 1);
if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
sctx->num_pipeline_stat_queries++;
if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
si_need_gfx_cs_space(sctx);
@@ -954,6 +957,9 @@ static void si_query_hw_emit_stop(struct si_context *sctx,
si_update_occlusion_query_state(sctx, query->b.type, -1);
si_update_prims_generated_query_state(sctx, query->b.type, -1);
if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
sctx->num_pipeline_stat_queries--;
}
static void emit_set_predicate(struct si_context *ctx,

src/gallium/drivers/radeonsi/si_shader.c

@@ -25,6 +25,7 @@
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"
@@ -3548,6 +3549,33 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
FREE(outputs);
}
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
unsigned max_outputs,
LLVMValueRef *addrs)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
struct tgsi_shader_info *info = &ctx->shader->selector->info;
LLVMValueRef pos[4] = {};
assert(info->num_outputs <= max_outputs);
for (unsigned i = 0; i < info->num_outputs; i++) {
if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
continue;
for (unsigned chan = 0; chan < 4; chan++)
pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
break;
}
assert(pos[0] != NULL);
/* Return the position output. */
LLVMValueRef ret = ctx->return_value;
for (unsigned chan = 0; chan < 4; chan++)
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
ctx->return_value = ret;
}
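The epilogue above pairs with the four f32 return values that create_function declares below for vs_as_prim_discard_cs: the culling shader consumes nothing but the computed position, so every other vertex shader output is simply dropped.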
static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -4518,6 +4546,12 @@ static void create_function(struct si_shader_context *ctx)
/* VGPRs */
declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
for (i = 0; i < 4; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
}
break;
case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@@ -5317,6 +5351,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
else if (shader->key.opt.vs_as_prim_discard_cs)
return "Vertex Shader as Primitive Discard CS";
else
return "Vertex Shader as VS";
case PIPE_SHADER_TESS_CTRL:
@@ -5699,6 +5735,28 @@ static void si_dump_shader_key(unsigned processor, const struct si_shader *shade
fprintf(f, " as_ls = %u\n", key->as_ls);
fprintf(f, " mono.u.vs_export_prim_id = %u\n",
key->mono.u.vs_export_prim_id);
fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
key->opt.vs_as_prim_discard_cs);
fprintf(f, " opt.cs_prim_type = %s\n",
tgsi_primitive_names[key->opt.cs_prim_type]);
fprintf(f, " opt.cs_indexed = %u\n",
key->opt.cs_indexed);
fprintf(f, " opt.cs_instancing = %u\n",
key->opt.cs_instancing);
fprintf(f, " opt.cs_primitive_restart = %u\n",
key->opt.cs_primitive_restart);
fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
key->opt.cs_provoking_vertex_first);
fprintf(f, " opt.cs_need_correct_orientation = %u\n",
key->opt.cs_need_correct_orientation);
fprintf(f, " opt.cs_cull_front = %u\n",
key->opt.cs_cull_front);
fprintf(f, " opt.cs_cull_back = %u\n",
key->opt.cs_cull_back);
fprintf(f, " opt.cs_cull_z = %u\n",
key->opt.cs_cull_z);
fprintf(f, " opt.cs_halfz_clip_space = %u\n",
key->opt.cs_halfz_clip_space);
break;
case PIPE_SHADER_TESS_CTRL:
@@ -5854,6 +5912,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
else if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
else
ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
bld_base->emit_epilogue = si_tgsi_emit_epilogue;
@@ -6644,6 +6704,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
si_build_wrapper_function(&ctx, parts + !need_prolog,
1 + need_prolog, need_prolog, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;

src/gallium/drivers/radeonsi/si_shader.h

@@ -340,6 +340,7 @@ struct si_shader_selector {
unsigned type;
bool vs_needs_prolog;
bool force_correct_derivs_after_kill;
bool prim_discard_cs_allowed;
unsigned pa_cl_vs_out_cntl;
ubyte clipdist_mask;
ubyte culldist_mask;
@@ -554,6 +555,19 @@ struct si_shader_key {
* possible, because it's in the "opt" group.
*/
unsigned prefer_mono:1;
/* Primitive discard compute shader. */
unsigned vs_as_prim_discard_cs:1;
unsigned cs_prim_type:4;
unsigned cs_indexed:1;
unsigned cs_instancing:1;
unsigned cs_primitive_restart:1;
unsigned cs_provoking_vertex_first:1;
unsigned cs_need_correct_orientation:1;
unsigned cs_cull_front:1;
unsigned cs_cull_back:1;
unsigned cs_cull_z:1;
unsigned cs_halfz_clip_space:1;
} opt;
};

src/gallium/drivers/radeonsi/si_state.c

@@ -857,6 +857,15 @@ static void *si_create_rs_state(struct pipe_context *ctx,
return NULL;
}
if (!state->front_ccw) {
rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
} else {
rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
}
rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
rs->provoking_vertex_first = state->flatshade_first;
rs->scissor_enable = state->scissor;
rs->clip_halfz = state->clip_halfz;
rs->two_side = state->light_twoside;
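Normalizing front_ccw into plain cull_front/cull_back bits gives later code winding-independent culling flags; these presumably feed the new cs_cull_front/cs_cull_back bits added to the shader key in si_shader.h above, so the culling compute shader never needs to know the winding convention itself.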

src/gallium/drivers/radeonsi/si_state.h

@@ -87,6 +87,10 @@ struct si_state_rasterizer {
unsigned rasterizer_discard:1;
unsigned scissor_enable:1;
unsigned clip_halfz:1;
unsigned cull_front:1;
unsigned cull_back:1;
unsigned depth_clamp_any:1;
unsigned provoking_vertex_first:1;
};
struct si_dsa_stencil_ref_part {
@@ -600,6 +604,7 @@ void si_shader_selector_key_vs(struct si_context *sctx,
struct si_vs_prolog_bits *prolog_key);
/* si_state_draw.c */
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_emit_cache_flush(struct si_context *sctx);
void si_trace_emit(struct si_context *sctx);
void si_init_draw_functions(struct si_context *sctx);

src/gallium/drivers/radeonsi/si_state_draw.c

@@ -29,6 +29,7 @@
#include "util/u_log.h"
#include "util/u_upload_mgr.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "ac_debug.h"
@@ -676,7 +677,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
struct pipe_resource *indexbuf,
unsigned index_size,
unsigned index_offset,
unsigned instance_count)
unsigned instance_count,
bool dispatch_prim_discard_cs,
unsigned original_index_size)
{
struct pipe_draw_indirect_info *indirect = info->indirect;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -735,13 +738,15 @@ static void si_emit_draw_packets(struct si_context *sctx,
sctx->last_index_size = index_size;
}
index_max_size = (indexbuf->width0 - index_offset) /
index_size;
index_va = si_resource(indexbuf)->gpu_address + index_offset;
if (original_index_size) {
index_max_size = (indexbuf->width0 - index_offset) /
original_index_size;
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
si_resource(indexbuf),
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
si_resource(indexbuf),
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
}
} else {
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
@@ -828,7 +833,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
/* Base vertex and start instance. */
base_vertex = index_size ? info->index_bias : info->start;
base_vertex = original_index_size ? info->index_bias : info->start;
if (sctx->num_vs_blit_sgprs) {
/* Re-emit draw constants after we leave u_blitter. */
@@ -856,6 +861,17 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
if (index_size) {
if (dispatch_prim_discard_cs) {
index_va += info->start * original_index_size;
index_max_size = MIN2(index_max_size, info->count);
si_dispatch_prim_discard_cs_and_draw(sctx, info,
original_index_size,
base_vertex,
index_va, index_max_size);
return;
}
index_va += info->start * index_size;
radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
@@ -902,6 +918,33 @@ static void si_emit_surface_sync(struct si_context *sctx,
sctx->context_roll = true;
}
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
if (!si_compute_prim_discard_enabled(sctx))
return;
if (!sctx->barrier_buf) {
u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
&sctx->barrier_buf_offset,
(struct pipe_resource**)&sctx->barrier_buf);
}
/* Emit a placeholder to signal the next compute IB to start.
* See si_compute_prim_discard.c for explanation.
*/
uint32_t signal = 1;
si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
4, V_370_MEM, V_370_ME, &signal);
sctx->last_pkt3_write_data =
&sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
/* Only the last occurrence of WRITE_DATA will be executed.
* The packet will be enabled in si_flush_gfx_cs.
*/
*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
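Taken together with si_flush_gfx_cs above, the mechanism is: every potential barrier point in the gfx IB is emitted as a WRITE_DATA packet disguised as a NOP, and only the last one per IB is armed. PKT3(PKT3_NOP, 3, 0) has the same body size as PKT3(PKT3_WRITE_DATA, 3, 0), so the CP skips the four payload dwords as NOP padding until the header is patched. A rough timeline; the compute-side wait on barrier_buf lives in the suppressed si_compute_prim_discard.c, so that half is an assumption:

    /*
     * gfx IB N:       ... [NOP + WRITE_DATA body] ... [NOP + body] ... flush
     *                                                        |
     * at flush:       si_flush_gfx_cs patches only this last placeholder from
     *                 PKT3_NOP to PKT3_WRITE_DATA, storing signal = 1 into
     *                 barrier_buf; earlier placeholders stay NOPs.
     *
     * compute IB N+1: assumed to wait (WAIT_REG_MEM) on barrier_buf, so its
     *                 culling work cannot start before gfx IB N has executed
     *                 past its final barrier.
     */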
void si_emit_cache_flush(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -919,8 +962,18 @@ void si_emit_cache_flush(struct si_context *sctx)
}
uint32_t cp_coher_cntl = 0;
uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB);
const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB);
const bool is_barrier = flush_cb_db ||
/* INV_ICACHE == beginning of gfx IB. Checking
* INV_ICACHE fixes corruption for DeusExMD with
* compute-based culling, but I don't know why.
*/
flags & (SI_CONTEXT_INV_ICACHE |
SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_VS_PARTIAL_FLUSH) ||
(flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
sctx->compute_is_busy);
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
sctx->num_cb_cache_flushes++;
@@ -1144,6 +1197,9 @@ void si_emit_cache_flush(struct si_context *sctx)
if (cp_coher_cntl)
si_emit_surface_sync(sctx, cp_coher_cntl);
if (is_barrier)
si_prim_discard_signal_next_compute_ib_start(sctx);
if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
@@ -1260,6 +1316,94 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
primitive_restart);
}
static bool
si_all_vs_resources_read_only(struct si_context *sctx,
struct pipe_resource *indexbuf)
{
struct radeon_winsys *ws = sctx->ws;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
/* Index buffer. */
if (indexbuf &&
ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
RADEON_USAGE_WRITE))
return false;
/* Vertex buffers. */
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned num_velems = velems->count;
for (unsigned i = 0; i < num_velems; i++) {
if (!((1 << i) & velems->first_vb_use_mask))
continue;
unsigned vb_index = velems->vertex_buffer_index[i];
struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
/* Constant and shader buffers. */
struct si_descriptors *buffers =
&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
for (unsigned i = 0; i < buffers->num_active_slots; i++) {
unsigned index = buffers->first_active_slot + i;
struct pipe_resource *res =
sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
/* Samplers. */
struct si_shader_selector *vs = sctx->vs_shader.cso;
if (vs->info.samplers_declared) {
unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
for (unsigned i = 0; i < num_samplers; i++) {
struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
if (!view)
continue;
if (ws->cs_is_buffer_referenced(cs,
si_resource(view->texture)->buf,
RADEON_USAGE_WRITE))
return false;
}
}
/* Images. */
if (vs->info.images_declared) {
unsigned num_images = util_last_bit(vs->info.images_declared);
for (unsigned i = 0; i < num_images; i++) {
struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
RADEON_USAGE_WRITE))
return false;
}
}
return true;
}
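This walk exists because, as the comment in si_draw_vbo below puts it, compute dispatches can run ahead: the culling CS for this draw may execute before earlier gfx work in the same IB finishes, so any VS-visible buffer that is also referenced for write within the IB would turn into a read-after-write hazard. In that case the draw simply takes the normal graphics path.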
static ALWAYS_INLINE bool pd_msg(const char *s)
{
if (SI_PRIM_DISCARD_DEBUG)
printf("PD failed: %s\n", s);
return false;
}
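Note the idiom this enables in the big condition below: pd_msg always returns false, so each (condition || pd_msg("reason")) clause fails the whole check while printing the first reason that did not hold, and only when SI_PRIM_DISCARD_DEBUG is set; with the default of 0, the ALWAYS_INLINE body collapses to "return false" and release builds pay nothing for the logging.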
static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
@@ -1370,9 +1514,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
}
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
goto return_cleanup;
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on GFX8. */
@@ -1425,6 +1566,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
}
}
bool dispatch_prim_discard_cs = false;
bool prim_discard_cs_instancing = false;
unsigned original_index_size = index_size;
unsigned direct_count = 0;
if (info->indirect) {
struct pipe_draw_indirect_info *indirect = info->indirect;
@@ -1444,8 +1590,80 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
}
}
} else {
direct_count = info->count * instance_count;
}
/* Determine if we can use the primitive discard compute shader. */
if (si_compute_prim_discard_enabled(sctx) &&
/* Multiply by 3 for strips and fans to get the vertex count as triangles. */
direct_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3) >
sctx->prim_discard_vertex_count_threshold &&
(!info->count_from_stream_output || pd_msg("draw_opaque")) &&
(primitive_restart ?
/* Supported prim types with primitive restart: */
(prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
/* Disallow instancing with primitive restart: */
(instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
/* Supported prim types without primitive restart + allow instancing: */
(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
(1 << PIPE_PRIM_TRIANGLE_STRIP) |
(1 << PIPE_PRIM_TRIANGLE_FAN)) &&
/* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
/* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
(instance_count == 1 ||
(instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
(info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
(!sctx->render_cond || pd_msg("render condition")) &&
/* Forced enablement ignores pipeline statistics queries. */
(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
pd_msg("pipestat or primgen query")) &&
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
(!sctx->tes_shader.cso || pd_msg("uses tess")) &&
(!sctx->gs_shader.cso || pd_msg("uses GS")) &&
(!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
(!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
(!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
(!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
(!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
!sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
!sctx->vs_shader.cso->so.num_outputs &&
#else
(sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
#endif
/* Check that all buffers are used for read only, because compute
* dispatches can run ahead. */
(si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
case SI_PRIM_DISCARD_ENABLED:
original_index_size = index_size;
prim_discard_cs_instancing = instance_count > 1;
dispatch_prim_discard_cs = true;
/* The compute shader changes/lowers the following: */
prim = PIPE_PRIM_TRIANGLES;
index_size = 4;
instance_count = 1;
primitive_restart = false;
break;
case SI_PRIM_DISCARD_DISABLED:
break;
case SI_PRIM_DISCARD_DRAW_SPLIT:
goto return_cleanup;
}
}
if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
sctx->do_update_shaders = true;
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
goto return_cleanup;
si_need_gfx_cs_space(sctx);
if (sctx->bo_list_add_all_gfx_resources)
@@ -1507,7 +1725,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
instance_count);
instance_count, dispatch_prim_discard_cs,
original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
@@ -1527,7 +1746,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
cik_emit_prefetch_L2(sctx, true);
if (!si_upload_graphics_shader_descriptors(sctx))
return;
goto return_cleanup;
si_emit_all_states(sctx, info, prim, instance_count,
primitive_restart, masked_atoms);
@@ -1540,7 +1759,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
instance_count);
instance_count, dispatch_prim_discard_cs,
original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */

src/gallium/drivers/radeonsi/si_state_msaa.c

@@ -82,6 +82,10 @@
* Right half: {1,3,5,7,9,11,13,15}
*/
/* Important note: We have to use the standard DX positions, because
* the primitive discard compute shader relies on them.
*/
/* 1x MSAA */
static const uint32_t sample_locs_1x =
FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */

src/gallium/drivers/radeonsi/si_state_shaders.c

@@ -1383,6 +1383,8 @@ void si_shader_selector_key_vs(struct si_context *sctx,
prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
prolog_key->unpack_instance_id_from_vertex_id =
sctx->prim_discard_cs_instancing;
/* Prefer a monolithic shader to allow scheduling divisions around
* VBO loads. */
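This connects to the instancing limits in si_draw_vbo: on the culling path, InstanceID is packed into VertexID (which is why instancing there requires 16-bit indices and at most USHRT_MAX instances), and the VS prolog has to unpack the two values again.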
@@ -1910,8 +1912,11 @@ current_not_ready:
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong.
*
* The prim discard CS doesn't need the main shader part.
*/
if (!is_pure_monolithic) {
if (!is_pure_monolithic &&
!key->opt.vs_as_prim_discard_cs) {
bool ok;
/* Make sure the main shader part is present. This is needed
@@ -1962,9 +1967,10 @@ current_not_ready:
is_pure_monolithic ||
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* The prim discard CS is always optimized. */
shader->is_optimized =
!is_pure_monolithic &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
(!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* If it's an optimized shader, compile it asynchronously. */
if (shader->is_optimized && thread_index < 0) {
@@ -2312,6 +2318,15 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->info.uses_kill &&
sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL);
sel->prim_discard_cs_allowed =
sel->type == PIPE_SHADER_VERTEX &&
!sel->info.uses_bindless_images &&
!sel->info.uses_bindless_samplers &&
!sel->info.writes_memory &&
!sel->info.writes_viewport_index &&
!sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
!sel->so.num_outputs;
/* Set which opcode uses which (i,j) pair. */
if (sel->info.uses_persp_opcode_interp_centroid)
sel->info.uses_persp_centroid = true;

src/gallium/drivers/radeonsi/si_state_viewport.c

@@ -381,6 +381,12 @@ static void si_set_viewport_states(struct pipe_context *pctx,
scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
}
if (start_slot == 0) {
ctx->viewports.y_inverted =
-state->scale[1] + state->translate[1] >
state->scale[1] + state->translate[1];
}
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);