iris: Use more efficient binding table pointer formats on Icelake+.

Skylake and older use a 15:5 binding table pointer format, which means
our binder can be at most 64kB in size.  Each binding table within the
binder must be aligned to 32B.

XeHP uses a new 20:5 binding table format, which allows us to increase
the binder size to 1MB while retaining the nice 32B alignment.  Larger
binders mean fewer stalls as we update the base address for the binder.

Icelake and Tigerlake can either use the 15:5 format or an 18:8 format.
18:8 mode requires the base of each binding table to be aligned to 256B
instead of 32B, but it gives us a maximum binder size of 512kB.

We can store 64 binding table entries in a 256B chunk (256B / 4B = 64),
but only 8 entries in a 32B chunk (32B / 4B = 8).  Assuming that most
binding tables have fewer than 64 entries, this means that with the 18:8
format, we're likely to be able to fit 2048 (512KB / 256B) tables into
a buffer before needing to allocate a new one and stall.

Technically, the old format could also store 2048 binding tables per
buffer (64KB / 32B = 2048).  However, tables that needed more
than 8 entries would need multiple 32B chunks.  A single table would
take multiple aligned chunks, while with the larger 256B format, it
could fit in a single one.

This cuts binder resets by 6.3% on a Shadow of Mordor benchmark trace.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14507>
This commit is contained in:
Kenneth Graunke
2019-06-25 13:16:50 -07:00
committed by Marge Bot
parent a83c91a261
commit db34c71513
6 changed files with 74 additions and 24 deletions

View File

@@ -53,15 +53,10 @@
#include "iris_bufmgr.h"
#include "iris_context.h"
#define BTP_ALIGNMENT 32
/* Avoid using offset 0, tools consider it NULL */
#define INIT_INSERT_POINT BTP_ALIGNMENT
/* Check whether the binder has room for another binding table of
 * `size` bytes, using the per-generation binder size chosen at
 * context init rather than a fixed compile-time constant.
 */
static bool
binder_has_space(struct iris_binder *binder, unsigned size)
{
   /* NOTE(review): the scraped diff lost its +/- markers, leaving both the
    * stale `IRIS_BINDER_SIZE` comparison and the patched one in the body;
    * only the post-patch line (binder->size) is kept here.
    */
   return binder->insert_point + size <= binder->size;
}
static void
@@ -74,10 +69,12 @@ binder_realloc(struct iris_context *ice)
if (binder->bo)
iris_bo_unreference(binder->bo);
binder->bo = iris_bo_alloc(bufmgr, "binder", IRIS_BINDER_SIZE, 1,
IRIS_MEMZONE_BINDER, 0);
binder->bo = iris_bo_alloc(bufmgr, "binder", binder->size, 1,
IRIS_MEMZONE_BINDER, 4096);
binder->map = iris_bo_map(NULL, binder->bo, MAP_WRITE);
binder->insert_point = INIT_INSERT_POINT;
/* Avoid using offset 0 - tools consider it NULL. */
binder->insert_point = binder->alignment;
/* Allocating a new binder requires changing Surface State Base Address,
* which also invalidates all our previous binding tables - each entry
@@ -95,7 +92,8 @@ binder_insert(struct iris_binder *binder, unsigned size)
{
uint32_t offset = binder->insert_point;
binder->insert_point = align(binder->insert_point + size, BTP_ALIGNMENT);
binder->insert_point =
align(binder->insert_point + size, binder->alignment);
return offset;
}
@@ -141,7 +139,7 @@ iris_binder_reserve_3d(struct iris_context *ice)
continue;
/* Round up the size so our next table has an aligned starting offset */
sizes[stage] = align(shaders[stage]->bt.size_bytes, BTP_ALIGNMENT);
sizes[stage] = align(shaders[stage]->bt.size_bytes, binder->alignment);
}
/* Make space for the new binding tables...this may take two tries. */
@@ -152,7 +150,7 @@ iris_binder_reserve_3d(struct iris_context *ice)
total_size += sizes[stage];
}
assert(total_size < IRIS_BINDER_SIZE);
assert(total_size < binder->size);
if (total_size == 0)
return;
@@ -201,7 +199,31 @@ iris_binder_reserve_compute(struct iris_context *ice)
/* Initialize the context's binder state, selecting the binding table
 * pointer format (alignment + maximum binder size) appropriate for the
 * hardware generation, then allocate the initial binder buffer.
 */
void
iris_init_binder(struct iris_context *ice)
{
struct iris_screen *screen = (void *) ice->ctx.screen;
const struct intel_device_info *devinfo = &screen->devinfo;
memset(&ice->state.binder, 0, sizeof(struct iris_binder));
/* We use different binding table pointer formats on various generations.
*
* - The 20:5 format gives us an alignment of 32B and max size of 1024kB.
* - The 18:8 format gives us an alignment of 256B and max size of 512kB.
* - The 15:5 format gives us an alignment of 32B and max size of 64kB.
*
* XeHP and later use the 20:5 format. Icelake and Tigerlake use 18:8
* in iris, but can use 15:5 if desired. Older platforms require 15:5.
*/
if (devinfo->verx10 >= 125) {
ice->state.binder.alignment = 32;
ice->state.binder.size = 1024 * 1024;
} else if (devinfo->ver >= 11) {
ice->state.binder.alignment = 256;
ice->state.binder.size = 512 * 1024;
} else {
ice->state.binder.alignment = 32;
ice->state.binder.size = 64 * 1024;
}
/* Allocates the binder BO and maps it; also sets the initial
* insert_point (presumably skipping offset 0, which tools treat as
* NULL — see binder_realloc).
*/
binder_realloc(ice);
}

View File

@@ -39,6 +39,12 @@ struct iris_binder
struct iris_bo *bo;
void *map;
/** Required alignment for each binding table in bytes */
uint32_t alignment;
/** Binding table size in bytes */
uint32_t size;
/** Insert new entries at this offset (in bytes) */
uint32_t insert_point;

View File

@@ -154,7 +154,7 @@ blorp_alloc_binding_table(struct blorp_batch *blorp_batch,
unsigned num_entries,
unsigned state_size,
unsigned state_alignment,
uint32_t *bt_offset,
uint32_t *out_bt_offset,
uint32_t *surface_offsets,
void **surface_maps)
{
@@ -162,8 +162,11 @@ blorp_alloc_binding_table(struct blorp_batch *blorp_batch,
struct iris_binder *binder = &ice->state.binder;
struct iris_batch *batch = blorp_batch->driver_batch;
*bt_offset = iris_binder_reserve(ice, num_entries * sizeof(uint32_t));
uint32_t *bt_map = binder->map + *bt_offset;
unsigned bt_offset =
iris_binder_reserve(ice, num_entries * sizeof(uint32_t));
uint32_t *bt_map = binder->map + bt_offset;
*out_bt_offset = bt_offset;
for (unsigned i = 0; i < num_entries; i++) {
surface_maps[i] = stream_state(batch, ice->state.surface_uploader,
@@ -181,7 +184,8 @@ static uint32_t
/* Convert a byte offset into the binder into the value programmed in a
 * binding table pointer field.  On Icelake/Tigerlake (18:8 mode) the
 * pointer is stored in the same bit-location as 15:5 mode, so the
 * 256B-aligned offset must be shifted right by 3 bits.
 */
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset)
{
   /* NOTE(review): the stripped diff markers left the pre-patch
    * `return offset;` ahead of the patched return, making the real one
    * unreachable; only the post-patch line is kept here.
    *
    * See IRIS_BT_OFFSET_SHIFT in iris_state.c
    */
   return offset >> ((GFX_VER >= 11 && GFX_VERx10 < 125) ? 3 : 0);
}
static void *

View File

@@ -2422,7 +2422,6 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
STATIC_ASSERT(IRIS_MEMZONE_SHADER_START == 0ull);
const uint64_t _4GB = 1ull << 32;
const uint64_t _2GB = 1ul << 31;
const uint64_t _1GB = 1ul << 30;
/* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
@@ -2430,11 +2429,12 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SHADER],
PAGE_SIZE, _4GB_minus_1 - PAGE_SIZE);
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_BINDER],
IRIS_MEMZONE_BINDER_START, _1GB - IRIS_BINDLESS_SIZE);
IRIS_MEMZONE_BINDER_START, IRIS_BINDER_ZONE_SIZE);
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_BINDLESS],
IRIS_MEMZONE_BINDLESS_START, IRIS_BINDLESS_SIZE);
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
IRIS_MEMZONE_SURFACE_START, _4GB_minus_1 - _1GB);
IRIS_MEMZONE_SURFACE_START, _4GB_minus_1 -
IRIS_BINDER_ZONE_SIZE - IRIS_BINDLESS_SIZE);
/* TODO: Why does limiting to 2GB help some state items on gfx12?
* - CC Viewport Pointer
* - Blend State Pointer

View File

@@ -83,13 +83,13 @@ enum iris_memory_zone {
/* Intentionally exclude single buffer "zones" */
#define IRIS_MEMZONE_COUNT (IRIS_MEMZONE_OTHER + 1)
#define IRIS_BINDER_SIZE (64 * 1024)
#define IRIS_BINDLESS_SIZE (8 * 1024 * 1024)
#define IRIS_BINDER_ZONE_SIZE ((1ull << 30) - IRIS_BINDLESS_SIZE)
#define IRIS_MEMZONE_SHADER_START (0ull * (1ull << 32))
#define IRIS_MEMZONE_BINDER_START (1ull * (1ull << 32))
#define IRIS_MEMZONE_BINDLESS_START (IRIS_MEMZONE_BINDER_START + (1ull << 30) - IRIS_BINDLESS_SIZE)
#define IRIS_MEMZONE_SURFACE_START (IRIS_MEMZONE_BINDER_START + (1ull << 30))
#define IRIS_MEMZONE_BINDLESS_START (IRIS_MEMZONE_BINDER_START + IRIS_BINDER_ZONE_SIZE)
#define IRIS_MEMZONE_SURFACE_START (IRIS_MEMZONE_BINDLESS_START + IRIS_BINDLESS_SIZE)
#define IRIS_MEMZONE_DYNAMIC_START (2ull * (1ull << 32))
#define IRIS_MEMZONE_OTHER_START (3ull * (1ull << 32))

View File

@@ -1015,6 +1015,22 @@ iris_init_common_context(struct iris_batch *batch)
reg.EnabledTexelOffsetPrecisionFix = 1;
reg.EnabledTexelOffsetPrecisionFixMask = 1;
}
#endif
/* Select 256B-aligned binding table mode on Icelake through Tigerlake,
* which gives us larger binding table pointers, at the cost of higher
* alignment requirements (bits 18:8 are valid instead of 15:5). When
* using this mode, we have to shift binding table pointers by 3 bits,
* as they're still stored in the same bit-location in the field.
*/
#if GFX_VER >= 11 && GFX_VERx10 < 125
iris_emit_reg(batch, GENX(GT_MODE), reg) {
reg.BindingTableAlignment = BTP_18_8;
reg.BindingTableAlignmentMask = true;
}
#define IRIS_BT_OFFSET_SHIFT 3
#else
#define IRIS_BT_OFFSET_SHIFT 0
#endif
}
@@ -6022,7 +6038,8 @@ iris_upload_dirty_render_state(struct iris_context *ice,
<< stage)) {
iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
ptr._3DCommandSubOpcode = 38 + stage;
ptr.PointertoVSBindingTable = binder->bt_offset[stage];
ptr.PointertoVSBindingTable =
binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
}
}
}
@@ -7236,7 +7253,8 @@ iris_upload_gpgpu_walker(struct iris_context *ice,
KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data,
dispatch.simd_size);
idd.SamplerStatePointer = shs->sampler_table.offset;
idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
idd.BindingTablePointer =
binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
}