anv: Add support for the PMA fix on Broadwell

This helps Dota 2 on Broadwell by 8-9%.  I also hacked up the driver and
used the Sascha "shadowmapping" demo to get some results.  Setting
uses_kill to true dropped the framerate on the demo by 25-30%.  Enabling
the PMA fix brought it back up to around 90% of the original framerate.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Nanley Chery <nanley.g.chery@intel.com>
This commit is contained in:
Jason Ekstrand
2016-12-06 17:52:14 -08:00
parent 62bba4ba2d
commit e8d52dab48
9 changed files with 221 additions and 2 deletions

View File

@@ -12,5 +12,4 @@ Performance:
- Compressed multisample support
- Pushing pieces of UBOs?
- Enable guardband clipping
- pma stall workaround
- Use soft-pin to avoid relocations

View File

@@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
state->restart_index = UINT32_MAX;
state->dynamic = default_dynamic_state;
state->need_query_wa = true;
state->pma_fix_enabled = false;
state->hiz_enabled = false;
if (state->attachments != NULL) {
vk_free(&cmd_buffer->pool->alloc, state->attachments);

View File

@@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable);
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct gen_l3_config *l3_config,

View File

@@ -1168,6 +1168,20 @@ struct anv_cmd_state {
struct anv_dynamic_state dynamic;
bool need_query_wa;
/**
* Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
* of any command buffer it is disabled by disabling it in EndCommandBuffer
* and before invoking the secondary in ExecuteCommands.
*/
bool pma_fix_enabled;
/**
* Whether or not we know for certain that HiZ is enabled for the current
* subpass. If, for whatever reason, we are unsure as to whether HiZ is
* enabled or not, this will be false.
*/
bool hiz_enabled;
/**
* Array length is anv_cmd_state::pass::attachment_count. Array content is
* valid only when recording a render pass instance.
@@ -1471,8 +1485,11 @@ struct anv_pipeline {
uint32_t cs_right_mask;
bool writes_depth;
bool depth_test_enable;
bool writes_stencil;
bool depth_clamp_enable;
bool kill_pixel;
struct {
uint32_t sf[7];

View File

@@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer->state.dirty = 0;
}
void
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable)
{
/* The NP PMA fix doesn't exist on gen7 */
}
void genX(CmdSetEvent)(
VkCommandBuffer commandBuffer,
VkEvent event,

View File

@@ -154,6 +154,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
#endif
void
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
{
#if GEN_GEN == 8
if (cmd_buffer->state.pma_fix_enabled == enable)
return;
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.DepthCacheFlushEnable = true;
pc.CommandStreamerStallEnable = true;
pc.RenderTargetCacheFlushEnable = true;
}
uint32_t cache_mode;
anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
.NPPMAFixEnable = enable,
.NPEarlyZFailsDisable = enable,
.NPPMAFixEnableMask = true,
.NPEarlyZFailsDisableMask = true);
anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = GENX(CACHE_MODE_1_num);
lri.DataDWord = cache_mode;
}
/* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
* Flush bits is often necessary. We do it regardless because it's easier.
* The render cache flush is also necessary if stencil writes are enabled.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.DepthStallEnable = true;
pc.DepthCacheFlushEnable = true;
pc.RenderTargetCacheFlushEnable = true;
}
cmd_buffer->state.pma_fix_enabled = enable;
#endif /* GEN_GEN == 8 */
}
static inline bool
want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
{
assert(GEN_GEN == 8);
/* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
*
* SW must set this bit in order to enable this fix when following
* expression is TRUE.
*
* 3DSTATE_WM::ForceThreadDispatch != 1 &&
* !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
* (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
* (3DSTATE_PS_EXTRA::PixelShaderValid) &&
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
* 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
* (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
* 3DSTATE_WM::ForceKillPix != ForceOff &&
* ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
* (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
* (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
*/
/* These are always true:
* 3DSTATE_WM::ForceThreadDispatch != 1 &&
* !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
*/
/* We only enable the PMA fix if we know for certain that HiZ is enabled.
* If we don't know whether HiZ is enabled or not, we disable the PMA fix
* and there is no harm.
*
* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
* 3DSTATE_DEPTH_BUFFER::HIZ Enable
*/
if (!cmd_buffer->state.hiz_enabled)
return false;
/* 3DSTATE_PS_EXTRA::PixelShaderValid */
struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
return false;
/* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
if (wm_prog_data->early_fragment_tests)
return false;
/* We never use anv_pipeline for HiZ ops so this is trivially true:
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
* 3DSTATE_WM_HZ_OP::StencilBufferClear)
*/
/* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
if (!pipeline->depth_test_enable)
return false;
/* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
* 3DSTATE_WM::ForceKillPix != ForceOff &&
* ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
* (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
* (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
*/
return (pipeline->kill_pixel && (pipeline->writes_depth ||
pipeline->writes_stencil)) ||
wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}
void
genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
{
@@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
ANV_CMD_DIRTY_RENDER_TARGETS |
ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
@@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
pipeline->gen8.wm_depth_stencil);
genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
want_depth_pma_fix(cmd_buffer));
}
#else
if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {

View File

@@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
/* BLORP doesn't do anything fancy with depth such as discards, so we want
* the PMA fix off. Also, off is always the safe option.
*/
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
blorp_exec(batch, params);
cmd_buffer->state.vb_dirty = ~0;

View File

@@ -637,6 +637,11 @@ genX(EndCommandBuffer)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
/* We want every command buffer to start with the PMA fix in a known state,
* so we disable it at the end of the command buffer.
*/
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
anv_cmd_buffer_end_batch_buffer(cmd_buffer);
@@ -654,6 +659,11 @@ genX(CmdExecuteCommands)(
assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
/* The secondary command buffers will assume that the PMA fix is disabled
* when they begin executing. Make sure this is true.
*/
genX(cmd_buffer_enable_pma_fix)(primary, false);
for (uint32_t i = 0; i < commandBufferCount; i++) {
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
@@ -2227,7 +2237,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const bool has_stencil =
image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
/* FIXME: Implement the PMA stall W/A */
cmd_buffer->state.hiz_enabled = has_hiz;
/* FIXME: Width and Height are wrong */
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
@@ -2465,6 +2476,8 @@ void genX(CmdEndRenderPass)(
anv_cmd_buffer_resolve_subpass(cmd_buffer);
cmd_buffer->state.hiz_enabled = false;
#ifndef NDEBUG
anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
#endif

View File

@@ -455,6 +455,10 @@ emit_rs_state(struct anv_pipeline *pipeline,
*/
#if GEN_GEN >= 8
raster.DXMultisampleRasterizationEnable = true;
/* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
* computations. If we ever set this bit to a different value, they will
* need to be updated accordingly.
*/
raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
raster.ForceMultisampling = false;
#else
@@ -664,6 +668,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
* to make sure it's initialized to something useful.
*/
pipeline->writes_stencil = false;
pipeline->writes_depth = false;
pipeline->depth_test_enable = false;
memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
return;
}
@@ -722,6 +728,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
depth_stencil.DepthBufferWriteEnable = false;
pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
#if GEN_GEN <= 7
GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
#else
@@ -1443,6 +1452,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
}
#endif
static void
compute_kill_pixel(struct anv_pipeline *pipeline,
const VkPipelineMultisampleStateCreateInfo *ms_info,
const struct anv_subpass *subpass)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
pipeline->kill_pixel = false;
return;
}
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
/* This computes the KillPixel portion of the computation for whether or
* not we want to enable the PMA fix on gen8. It's given by this chunk of
* the giant formula:
*
* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
*
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
* 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
* of an alpha test.
*/
pipeline->kill_pixel =
subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
wm_prog_data->uses_omask ||
(ms_info && ms_info->alphaToCoverageEnable);
}
static VkResult
genX(graphics_pipeline_create)(
VkDevice _device,
@@ -1480,6 +1521,7 @@ genX(graphics_pipeline_create)(
emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
pCreateInfo->pMultisampleState);
compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
emit_urb_setup(pipeline);