radv: Implement binning on GFX9.
Overall it neither helps nor hurts much: the deferred demo gets a 1% improvement and some games see a 3% decrease, so I don't think this should be enabled by default. But with the code upstream it is easier to experiment with it. v2: Remove initializing the registers from si_emit_config. Reviewed-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
@@ -1042,6 +1042,21 @@ radv_emit_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer,
|
||||
pipeline->graphics.vtx_reuse_depth);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer,
|
||||
struct radv_pipeline *pipeline)
|
||||
{
|
||||
struct radeon_winsys_cs *cs = cmd_buffer->cs;
|
||||
|
||||
if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
|
||||
return;
|
||||
|
||||
radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
|
||||
pipeline->graphics.bin.pa_sc_binner_cntl_0);
|
||||
radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
|
||||
pipeline->graphics.bin.db_dfsm_control);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
@@ -1059,6 +1074,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
|
||||
radv_emit_geometry_shader(cmd_buffer, pipeline);
|
||||
radv_emit_fragment_shader(cmd_buffer, pipeline);
|
||||
radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline);
|
||||
radv_emit_binning_state(cmd_buffer, pipeline);
|
||||
|
||||
cmd_buffer->scratch_size_needed =
|
||||
MAX2(cmd_buffer->scratch_size_needed,
|
||||
|
@@ -2002,6 +2002,329 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
struct radv_bin_size_entry {
|
||||
unsigned bpp;
|
||||
VkExtent2D extent;
|
||||
};
|
||||
|
||||
static VkExtent2D
|
||||
radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
|
||||
{
|
||||
static const struct radv_bin_size_entry color_size_table[][3][9] = {
|
||||
{
|
||||
/* One RB / SE */
|
||||
{
|
||||
/* One shader engine */
|
||||
{ 0, {128, 128}},
|
||||
{ 1, { 64, 128}},
|
||||
{ 2, { 32, 128}},
|
||||
{ 3, { 16, 128}},
|
||||
{ 17, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Two shader engines */
|
||||
{ 0, {128, 128}},
|
||||
{ 2, { 64, 128}},
|
||||
{ 3, { 32, 128}},
|
||||
{ 5, { 16, 128}},
|
||||
{ 17, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Four shader engines */
|
||||
{ 0, {128, 128}},
|
||||
{ 3, { 64, 128}},
|
||||
{ 5, { 16, 128}},
|
||||
{ 17, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
{
|
||||
/* Two RB / SE */
|
||||
{
|
||||
/* One shader engine */
|
||||
{ 0, {128, 128}},
|
||||
{ 2, { 64, 128}},
|
||||
{ 3, { 32, 128}},
|
||||
{ 5, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Two shader engines */
|
||||
{ 0, {128, 128}},
|
||||
{ 3, { 64, 128}},
|
||||
{ 5, { 32, 128}},
|
||||
{ 9, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Four shader engines */
|
||||
{ 0, {256, 256}},
|
||||
{ 2, {128, 256}},
|
||||
{ 3, {128, 128}},
|
||||
{ 5, { 64, 128}},
|
||||
{ 9, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
{
|
||||
/* Four RB / SE */
|
||||
{
|
||||
/* One shader engine */
|
||||
{ 0, {128, 256}},
|
||||
{ 2, {128, 128}},
|
||||
{ 3, { 64, 128}},
|
||||
{ 5, { 32, 128}},
|
||||
{ 9, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Two shader engines */
|
||||
{ 0, {256, 256}},
|
||||
{ 2, {128, 256}},
|
||||
{ 3, {128, 128}},
|
||||
{ 5, { 64, 128}},
|
||||
{ 9, { 32, 128}},
|
||||
{ 17, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
/* Four shader engines */
|
||||
{ 0, {256, 512}},
|
||||
{ 2, {256, 256}},
|
||||
{ 3, {128, 256}},
|
||||
{ 5, {128, 128}},
|
||||
{ 9, { 64, 128}},
|
||||
{ 17, { 16, 128}},
|
||||
{ 33, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
};
|
||||
static const struct radv_bin_size_entry ds_size_table[][3][9] = {
|
||||
{
|
||||
// One RB / SE
|
||||
{
|
||||
// One shader engine
|
||||
{ 0, {128, 256}},
|
||||
{ 2, {128, 128}},
|
||||
{ 4, { 64, 128}},
|
||||
{ 7, { 32, 128}},
|
||||
{ 13, { 16, 128}},
|
||||
{ 49, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Two shader engines
|
||||
{ 0, {256, 256}},
|
||||
{ 2, {128, 256}},
|
||||
{ 4, {128, 128}},
|
||||
{ 7, { 64, 128}},
|
||||
{ 13, { 32, 128}},
|
||||
{ 25, { 16, 128}},
|
||||
{ 49, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Four shader engines
|
||||
{ 0, {256, 512}},
|
||||
{ 2, {256, 256}},
|
||||
{ 4, {128, 256}},
|
||||
{ 7, {128, 128}},
|
||||
{ 13, { 64, 128}},
|
||||
{ 25, { 16, 128}},
|
||||
{ 49, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Two RB / SE
|
||||
{
|
||||
// One shader engine
|
||||
{ 0, {256, 256}},
|
||||
{ 2, {128, 256}},
|
||||
{ 4, {128, 128}},
|
||||
{ 7, { 64, 128}},
|
||||
{ 13, { 32, 128}},
|
||||
{ 25, { 16, 128}},
|
||||
{ 97, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Two shader engines
|
||||
{ 0, {256, 512}},
|
||||
{ 2, {256, 256}},
|
||||
{ 4, {128, 256}},
|
||||
{ 7, {128, 128}},
|
||||
{ 13, { 64, 128}},
|
||||
{ 25, { 32, 128}},
|
||||
{ 49, { 16, 128}},
|
||||
{ 97, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Four shader engines
|
||||
{ 0, {512, 512}},
|
||||
{ 2, {256, 512}},
|
||||
{ 4, {256, 256}},
|
||||
{ 7, {128, 256}},
|
||||
{ 13, {128, 128}},
|
||||
{ 25, { 64, 128}},
|
||||
{ 49, { 16, 128}},
|
||||
{ 97, { 0, 0}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Four RB / SE
|
||||
{
|
||||
// One shader engine
|
||||
{ 0, {256, 512}},
|
||||
{ 2, {256, 256}},
|
||||
{ 4, {128, 256}},
|
||||
{ 7, {128, 128}},
|
||||
{ 13, { 64, 128}},
|
||||
{ 25, { 32, 128}},
|
||||
{ 49, { 16, 128}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Two shader engines
|
||||
{ 0, {512, 512}},
|
||||
{ 2, {256, 512}},
|
||||
{ 4, {256, 256}},
|
||||
{ 7, {128, 256}},
|
||||
{ 13, {128, 128}},
|
||||
{ 25, { 64, 128}},
|
||||
{ 49, { 32, 128}},
|
||||
{ 97, { 16, 128}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
{
|
||||
// Four shader engines
|
||||
{ 0, {512, 512}},
|
||||
{ 4, {256, 512}},
|
||||
{ 7, {256, 256}},
|
||||
{ 13, {128, 256}},
|
||||
{ 25, {128, 128}},
|
||||
{ 49, { 64, 128}},
|
||||
{ 97, { 16, 128}},
|
||||
{ UINT_MAX, { 0, 0}},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
|
||||
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
|
||||
VkExtent2D extent = {512, 512};
|
||||
|
||||
unsigned log_num_rb_per_se =
|
||||
util_logbase2_ceil(pipeline->device->physical_device->rad_info.num_render_backends /
|
||||
pipeline->device->physical_device->rad_info.max_se);
|
||||
unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);
|
||||
|
||||
unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_mode_cntl_1);
|
||||
unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
|
||||
unsigned effective_samples = total_samples;
|
||||
unsigned cb_target_mask = pipeline->graphics.blend.cb_target_mask;
|
||||
unsigned color_bytes_per_pixel = 0;
|
||||
|
||||
for (unsigned i = 0; i < subpass->color_count; i++) {
|
||||
if (!(cb_target_mask & (0xf << (i * 4))))
|
||||
continue;
|
||||
|
||||
if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
|
||||
continue;
|
||||
|
||||
VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
|
||||
color_bytes_per_pixel += vk_format_get_blocksize(format);
|
||||
}
|
||||
|
||||
/* MSAA images typically don't use all samples all the time. */
|
||||
if (effective_samples >= 2 && ps_iter_samples <= 1)
|
||||
effective_samples = 2;
|
||||
color_bytes_per_pixel *= effective_samples;
|
||||
|
||||
const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
|
||||
while(color_entry->bpp <= color_bytes_per_pixel)
|
||||
++color_entry;
|
||||
|
||||
extent = color_entry->extent;
|
||||
|
||||
if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
|
||||
struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment.attachment;
|
||||
|
||||
/* Coefficients taken from AMDVLK */
|
||||
unsigned depth_coeff = vk_format_is_depth(attachment->format) ? 5 : 0;
|
||||
unsigned stencil_coeff = vk_format_is_stencil(attachment->format) ? 1 : 0;
|
||||
unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
|
||||
|
||||
const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
|
||||
while(ds_entry->bpp <= ds_bytes_per_pixel)
|
||||
++ds_entry;
|
||||
|
||||
extent.width = MIN2(extent.width, ds_entry->extent.width);
|
||||
extent.height = MIN2(extent.height, ds_entry->extent.height);
|
||||
}
|
||||
|
||||
return extent;
|
||||
}
|
||||
|
||||
static void
|
||||
radv_compute_binning_state(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
|
||||
{
|
||||
pipeline->graphics.bin.pa_sc_binner_cntl_0 =
|
||||
S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
|
||||
S_028C44_DISABLE_START_OF_PRIM(1);
|
||||
pipeline->graphics.bin.db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF);
|
||||
|
||||
if (!pipeline->device->pbb_allowed)
|
||||
return;
|
||||
|
||||
VkExtent2D bin_size = radv_compute_bin_size(pipeline, pCreateInfo);
|
||||
if (!bin_size.width || !bin_size.height)
|
||||
return;
|
||||
|
||||
unsigned context_states_per_bin; /* allowed range: [1, 6] */
|
||||
unsigned persistent_states_per_bin; /* allowed range: [1, 32] */
|
||||
unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
|
||||
|
||||
switch (pipeline->device->physical_device->rad_info.family) {
|
||||
case CHIP_VEGA10:
|
||||
context_states_per_bin = 1;
|
||||
persistent_states_per_bin = 1;
|
||||
fpovs_per_batch = 63;
|
||||
break;
|
||||
case CHIP_RAVEN:
|
||||
context_states_per_bin = 6;
|
||||
persistent_states_per_bin = 32;
|
||||
fpovs_per_batch = 63;
|
||||
break;
|
||||
default:
|
||||
unreachable("unhandled family while determining binning state.");
|
||||
}
|
||||
|
||||
pipeline->graphics.bin.pa_sc_binner_cntl_0 =
|
||||
S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
|
||||
S_028C44_BIN_SIZE_X(bin_size.width == 16) |
|
||||
S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
|
||||
S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
|
||||
S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
|
||||
S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) |
|
||||
S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) |
|
||||
S_028C44_DISABLE_START_OF_PRIM(1) |
|
||||
S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
|
||||
S_028C44_OPTIMAL_BIN_SELECTION(1);
|
||||
|
||||
/* DFSM is not implemented yet */
|
||||
assert(!pipeline->device->dfsm_allowed);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
radv_pipeline_init(struct radv_pipeline *pipeline,
|
||||
@@ -2290,6 +2613,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
|
||||
radv_dump_pipeline_stats(device, pipeline);
|
||||
}
|
||||
|
||||
radv_compute_binning_state(pipeline, pCreateInfo);
|
||||
|
||||
result = radv_pipeline_scratch_init(device, pipeline);
|
||||
return result;
|
||||
}
|
||||
|
@@ -583,6 +583,7 @@ struct radv_device {
|
||||
|
||||
bool llvm_supports_spill;
|
||||
bool has_distributed_tess;
|
||||
bool pbb_allowed;
|
||||
bool dfsm_allowed;
|
||||
uint32_t tess_offchip_block_dw_size;
|
||||
uint32_t scratch_waves;
|
||||
@@ -1165,6 +1166,11 @@ struct radv_vs_state {
|
||||
uint32_t vgt_reuse_off;
|
||||
};
|
||||
|
||||
/*
 * Binner register values stored per pipeline; they are written to the
 * PA_SC_BINNER_CNTL_0 and DB_DFSM_CONTROL context registers.
 */
struct radv_binning_state {
	/* Value for PA_SC_BINNER_CNTL_0. */
	uint32_t pa_sc_binner_cntl_0;
	/* Value for DB_DFSM_CONTROL. */
	uint32_t db_dfsm_control;
};
|
||||
|
||||
#define SI_GS_PER_ES 128
|
||||
|
||||
struct radv_pipeline {
|
||||
@@ -1193,6 +1199,7 @@ struct radv_pipeline {
|
||||
struct radv_tessellation_state tess;
|
||||
struct radv_gs_state gs;
|
||||
struct radv_vs_state vs;
|
||||
struct radv_binning_state bin;
|
||||
uint32_t db_shader_control;
|
||||
uint32_t shader_z_format;
|
||||
unsigned prim;
|
||||
|
@@ -518,12 +518,6 @@ si_emit_config(struct radv_physical_device *physical_device,
|
||||
assert(0);
|
||||
}
|
||||
|
||||
radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
|
||||
S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
|
||||
/* TODO: Enable the binner: */
|
||||
radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
|
||||
S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
|
||||
S_028C44_DISABLE_START_OF_PRIM(1));
|
||||
radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
|
||||
S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
|
||||
S_028C48_MAX_PRIM_PER_BATCH(1023));
|
||||
|
Reference in New Issue
Block a user