anv: Add support for the PMA fix on Broadwell
This helps Dota 2 on Broadwell by 8-9%. I also hacked up the driver and used the Sascha "shadowmapping" demo to get some results. Setting uses_kill to true dropped the framerate on the demo by 25-30%. Enabling the PMA fix brought it back up to around 90% of the original framerate. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Nanley Chery <nanley.g.chery@intel.com>
This commit is contained in:
@@ -12,5 +12,4 @@ Performance:
|
||||
- Compressed multisample support
|
||||
- Pushing pieces of UBOs?
|
||||
- Enable guardband clipping
|
||||
- pma stall workaround
|
||||
- Use soft-pin to avoid relocations
|
||||
|
@@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
|
||||
state->restart_index = UINT32_MAX;
|
||||
state->dynamic = default_dynamic_state;
|
||||
state->need_query_wa = true;
|
||||
state->pma_fix_enabled = false;
|
||||
state->hiz_enabled = false;
|
||||
|
||||
if (state->attachments != NULL) {
|
||||
vk_free(&cmd_buffer->pool->alloc, state->attachments);
|
||||
|
@@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
|
||||
|
||||
void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
|
||||
|
||||
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
|
||||
bool enable);
|
||||
|
||||
void
|
||||
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
|
||||
const struct gen_l3_config *l3_config,
|
||||
|
@@ -1168,6 +1168,20 @@ struct anv_cmd_state {
|
||||
struct anv_dynamic_state dynamic;
|
||||
bool need_query_wa;
|
||||
|
||||
/**
|
||||
* Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
|
||||
* of any command buffer it is disabled by disabling it in EndCommandBuffer
|
||||
* and before invoking the secondary in ExecuteCommands.
|
||||
*/
|
||||
bool pma_fix_enabled;
|
||||
|
||||
/**
|
||||
* Whether or not we know for certain that HiZ is enabled for the current
|
||||
* subpass. If, for whatever reason, we are unsure as to whether HiZ is
|
||||
* enabled or not, this will be false.
|
||||
*/
|
||||
bool hiz_enabled;
|
||||
|
||||
/**
|
||||
* Array length is anv_cmd_state::pass::attachment_count. Array content is
|
||||
* valid only when recording a render pass instance.
|
||||
@@ -1471,8 +1485,11 @@ struct anv_pipeline {
|
||||
|
||||
uint32_t cs_right_mask;
|
||||
|
||||
bool writes_depth;
|
||||
bool depth_test_enable;
|
||||
bool writes_stencil;
|
||||
bool depth_clamp_enable;
|
||||
bool kill_pixel;
|
||||
|
||||
struct {
|
||||
uint32_t sf[7];
|
||||
|
@@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
cmd_buffer->state.dirty = 0;
|
||||
}
|
||||
|
||||
void
|
||||
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
|
||||
bool enable)
|
||||
{
|
||||
/* The NP PMA fix doesn't exist on gen7 */
|
||||
}
|
||||
|
||||
void genX(CmdSetEvent)(
|
||||
VkCommandBuffer commandBuffer,
|
||||
VkEvent event,
|
||||
|
@@ -154,6 +154,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
|
||||
|
||||
#endif
|
||||
|
||||
void
|
||||
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
|
||||
{
|
||||
#if GEN_GEN == 8
|
||||
if (cmd_buffer->state.pma_fix_enabled == enable)
|
||||
return;
|
||||
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.DepthCacheFlushEnable = true;
|
||||
pc.CommandStreamerStallEnable = true;
|
||||
pc.RenderTargetCacheFlushEnable = true;
|
||||
}
|
||||
|
||||
uint32_t cache_mode;
|
||||
anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
|
||||
.NPPMAFixEnable = enable,
|
||||
.NPEarlyZFailsDisable = enable,
|
||||
.NPPMAFixEnableMask = true,
|
||||
.NPEarlyZFailsDisableMask = true);
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
|
||||
lri.RegisterOffset = GENX(CACHE_MODE_1_num);
|
||||
lri.DataDWord = cache_mode;
|
||||
}
|
||||
|
||||
/* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
|
||||
* Flush bits is often necessary. We do it regardless because it's easier.
|
||||
* The render cache flush is also necessary if stencil writes are enabled.
|
||||
*/
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.DepthStallEnable = true;
|
||||
pc.DepthCacheFlushEnable = true;
|
||||
pc.RenderTargetCacheFlushEnable = true;
|
||||
}
|
||||
|
||||
cmd_buffer->state.pma_fix_enabled = enable;
|
||||
#endif /* GEN_GEN == 8 */
|
||||
}
|
||||
|
||||
static inline bool
|
||||
want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
assert(GEN_GEN == 8);
|
||||
|
||||
/* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
|
||||
*
|
||||
* SW must set this bit in order to enable this fix when following
|
||||
* expression is TRUE.
|
||||
*
|
||||
* 3DSTATE_WM::ForceThreadDispatch != 1 &&
|
||||
* !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
|
||||
* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
|
||||
* (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
|
||||
* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
|
||||
* (3DSTATE_PS_EXTRA::PixelShaderValid) &&
|
||||
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
|
||||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
|
||||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
|
||||
* 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
|
||||
* (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
|
||||
* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
|
||||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
|
||||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
|
||||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
|
||||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
|
||||
* 3DSTATE_WM::ForceKillPix != ForceOff &&
|
||||
* ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
|
||||
* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
|
||||
* (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
|
||||
* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
|
||||
* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
|
||||
* (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
|
||||
*/
|
||||
|
||||
/* These are always true:
|
||||
* 3DSTATE_WM::ForceThreadDispatch != 1 &&
|
||||
* !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
|
||||
*/
|
||||
|
||||
/* We only enable the PMA fix if we know for certain that HiZ is enabled.
|
||||
* If we don't know whether HiZ is enabled or not, we disable the PMA fix
|
||||
* and there is no harm.
|
||||
*
|
||||
* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
|
||||
* 3DSTATE_DEPTH_BUFFER::HIZ Enable
|
||||
*/
|
||||
if (!cmd_buffer->state.hiz_enabled)
|
||||
return false;
|
||||
|
||||
/* 3DSTATE_PS_EXTRA::PixelShaderValid */
|
||||
struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
|
||||
return false;
|
||||
|
||||
/* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
if (wm_prog_data->early_fragment_tests)
|
||||
return false;
|
||||
|
||||
/* We never use anv_pipeline for HiZ ops so this is trivially true:
|
||||
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
|
||||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
|
||||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
|
||||
* 3DSTATE_WM_HZ_OP::StencilBufferClear)
|
||||
*/
|
||||
|
||||
/* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
|
||||
if (!pipeline->depth_test_enable)
|
||||
return false;
|
||||
|
||||
/* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
|
||||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
|
||||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
|
||||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
|
||||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
|
||||
* 3DSTATE_WM::ForceKillPix != ForceOff &&
|
||||
* ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
|
||||
* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
|
||||
* (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
|
||||
* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
|
||||
* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
|
||||
* (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
|
||||
*/
|
||||
return (pipeline->kill_pixel && (pipeline->writes_depth ||
|
||||
pipeline->writes_stencil)) ||
|
||||
wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
|
||||
}
|
||||
|
||||
void
|
||||
genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
@@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
|
||||
if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
|
||||
ANV_CMD_DIRTY_RENDER_TARGETS |
|
||||
ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
|
||||
ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
|
||||
uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
|
||||
@@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
|
||||
pipeline->gen8.wm_depth_stencil);
|
||||
|
||||
genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
|
||||
want_depth_pma_fix(cmd_buffer));
|
||||
}
|
||||
#else
|
||||
if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
|
||||
|
@@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
|
||||
|
||||
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
|
||||
|
||||
/* BLORP doesn't do anything fancy with depth such as discards, so we want
|
||||
* the PMA fix off. Also, off is always the safe option.
|
||||
*/
|
||||
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
|
||||
|
||||
blorp_exec(batch, params);
|
||||
|
||||
cmd_buffer->state.vb_dirty = ~0;
|
||||
|
@@ -637,6 +637,11 @@ genX(EndCommandBuffer)(
|
||||
{
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
|
||||
/* We want every command buffer to start with the PMA fix in a known state,
|
||||
* so we disable it at the end of the command buffer.
|
||||
*/
|
||||
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
|
||||
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
anv_cmd_buffer_end_batch_buffer(cmd_buffer);
|
||||
@@ -654,6 +659,11 @@ genX(CmdExecuteCommands)(
|
||||
|
||||
assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
|
||||
|
||||
/* The secondary command buffers will assume that the PMA fix is disabled
|
||||
* when they begin executing. Make sure this is true.
|
||||
*/
|
||||
genX(cmd_buffer_enable_pma_fix)(primary, false);
|
||||
|
||||
for (uint32_t i = 0; i < commandBufferCount; i++) {
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
|
||||
|
||||
@@ -2227,7 +2237,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
|
||||
const bool has_stencil =
|
||||
image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
|
||||
|
||||
/* FIXME: Implement the PMA stall W/A */
|
||||
cmd_buffer->state.hiz_enabled = has_hiz;
|
||||
|
||||
/* FIXME: Width and Height are wrong */
|
||||
|
||||
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
|
||||
@@ -2465,6 +2476,8 @@ void genX(CmdEndRenderPass)(
|
||||
|
||||
anv_cmd_buffer_resolve_subpass(cmd_buffer);
|
||||
|
||||
cmd_buffer->state.hiz_enabled = false;
|
||||
|
||||
#ifndef NDEBUG
|
||||
anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
|
||||
#endif
|
||||
|
@@ -455,6 +455,10 @@ emit_rs_state(struct anv_pipeline *pipeline,
|
||||
*/
|
||||
#if GEN_GEN >= 8
|
||||
raster.DXMultisampleRasterizationEnable = true;
|
||||
/* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
|
||||
* computations. If we ever set this bit to a different value, they will
|
||||
* need to be updated accordingly.
|
||||
*/
|
||||
raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
|
||||
raster.ForceMultisampling = false;
|
||||
#else
|
||||
@@ -664,6 +668,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
|
||||
* to make sure it's initialized to something useful.
|
||||
*/
|
||||
pipeline->writes_stencil = false;
|
||||
pipeline->writes_depth = false;
|
||||
pipeline->depth_test_enable = false;
|
||||
memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
|
||||
return;
|
||||
}
|
||||
@@ -722,6 +728,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
|
||||
if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
|
||||
depth_stencil.DepthBufferWriteEnable = false;
|
||||
|
||||
pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
|
||||
pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
|
||||
|
||||
#if GEN_GEN <= 7
|
||||
GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
|
||||
#else
|
||||
@@ -1443,6 +1452,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
compute_kill_pixel(struct anv_pipeline *pipeline,
|
||||
const VkPipelineMultisampleStateCreateInfo *ms_info,
|
||||
const struct anv_subpass *subpass)
|
||||
{
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
|
||||
pipeline->kill_pixel = false;
|
||||
return;
|
||||
}
|
||||
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
|
||||
/* This computes the KillPixel portion of the computation for whether or
|
||||
* not we want to enable the PMA fix on gen8. It's given by this chunk of
|
||||
* the giant formula:
|
||||
*
|
||||
* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
|
||||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
|
||||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
|
||||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
|
||||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
|
||||
*
|
||||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
|
||||
* 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
|
||||
* of an alpha test.
|
||||
*/
|
||||
pipeline->kill_pixel =
|
||||
subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
|
||||
wm_prog_data->uses_omask ||
|
||||
(ms_info && ms_info->alphaToCoverageEnable);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
genX(graphics_pipeline_create)(
|
||||
VkDevice _device,
|
||||
@@ -1480,6 +1521,7 @@ genX(graphics_pipeline_create)(
|
||||
emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
|
||||
emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
|
||||
pCreateInfo->pMultisampleState);
|
||||
compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
|
||||
|
||||
emit_urb_setup(pipeline);
|
||||
|
||||
|
Reference in New Issue
Block a user