intel/perf: add performance query layout using MI_SRM
For all supported generations we add a layout describing which registers to store in order to implement an MI_RPC replacement. This is needed because on Gen12 we have to snapshot the OAG registers to get correct values for the perf equations: there, the MI_RPC instruction captures the OAR registers, which do not have all the information we need. v2: Fix commented code for debug (Marcin) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6518>
This commit is contained in:

committed by
Marge Bot

parent
f32d1bf529
commit
8750f43a90
@@ -1027,9 +1027,16 @@ gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
|
|||||||
&result->unslice_frequency[1]);
|
&result->unslice_frequency[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool
|
||||||
|
can_use_mi_rpc_bc_counters(const struct gen_device_info *devinfo)
|
||||||
|
{
|
||||||
|
return devinfo->gen <= 11;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
|
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
|
||||||
const struct gen_perf_query_info *query,
|
const struct gen_perf_query_info *query,
|
||||||
|
const struct gen_device_info *devinfo,
|
||||||
const uint32_t *start,
|
const uint32_t *start,
|
||||||
const uint32_t *end)
|
const uint32_t *end)
|
||||||
{
|
{
|
||||||
@@ -1061,16 +1068,18 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
|
|||||||
result->accumulator + query->a_offset + 32 + i);
|
result->accumulator + query->a_offset + 32 + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 8x 32bit B counters */
|
if (can_use_mi_rpc_bc_counters(devinfo)) {
|
||||||
for (i = 0; i < 8; i++) {
|
/* 8x 32bit B counters */
|
||||||
accumulate_uint32(start + 48 + i, end + 48 + i,
|
for (i = 0; i < 8; i++) {
|
||||||
result->accumulator + query->b_offset + i);
|
accumulate_uint32(start + 48 + i, end + 48 + i,
|
||||||
}
|
result->accumulator + query->b_offset + i);
|
||||||
|
}
|
||||||
|
|
||||||
/* 8x 32bit C counters... */
|
/* 8x 32bit C counters... */
|
||||||
for (i = 0; i < 8; i++) {
|
for (i = 0; i < 8; i++) {
|
||||||
accumulate_uint32(start + 56 + i, end + 56 + i,
|
accumulate_uint32(start + 56 + i, end + 56 + i,
|
||||||
result->accumulator + query->c_offset + i);
|
result->accumulator + query->c_offset + i);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -1138,6 +1147,10 @@ query_accumulator_offset(const struct gen_perf_query_info *query,
|
|||||||
switch (type) {
|
switch (type) {
|
||||||
case GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
|
case GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
|
||||||
return query->perfcnt_offset + index;
|
return query->perfcnt_offset + index;
|
||||||
|
case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
|
||||||
|
return query->b_offset + index;
|
||||||
|
case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
|
||||||
|
return query->c_offset + index;
|
||||||
default:
|
default:
|
||||||
unreachable("Invalid register type");
|
unreachable("Invalid register type");
|
||||||
return 0;
|
return 0;
|
||||||
@@ -1166,7 +1179,7 @@ gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result,
|
|||||||
* unrelated deltas, so don't accumulate the begin/end reports here.
|
* unrelated deltas, so don't accumulate the begin/end reports here.
|
||||||
*/
|
*/
|
||||||
if (!no_oa_accumulate) {
|
if (!no_oa_accumulate) {
|
||||||
gen_perf_query_result_accumulate(result, query,
|
gen_perf_query_result_accumulate(result, query, devinfo,
|
||||||
start + field->location,
|
start + field->location,
|
||||||
end + field->location);
|
end + field->location);
|
||||||
}
|
}
|
||||||
@@ -1205,6 +1218,35 @@ gen_perf_query_result_clear(struct gen_perf_query_result *result)
|
|||||||
result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
|
result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
gen_perf_query_result_print_fields(const struct gen_perf_query_info *query,
|
||||||
|
const struct gen_device_info *devinfo,
|
||||||
|
const void *data)
|
||||||
|
{
|
||||||
|
const struct gen_perf_query_field_layout *layout = &query->perf->query_layout;
|
||||||
|
|
||||||
|
for (uint32_t r = 0; r < layout->n_fields; r++) {
|
||||||
|
const struct gen_perf_query_field *field = &layout->fields[r];
|
||||||
|
const uint32_t *value32 = data + field->location;
|
||||||
|
|
||||||
|
switch (field->type) {
|
||||||
|
case GEN_PERF_QUERY_FIELD_TYPE_MI_RPC:
|
||||||
|
fprintf(stderr, "MI_RPC:\n");
|
||||||
|
fprintf(stderr, " TS: 0x%08x\n", *(value32 + 1));
|
||||||
|
fprintf(stderr, " CLK: 0x%08x\n", *(value32 + 3));
|
||||||
|
break;
|
||||||
|
case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
|
||||||
|
fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32);
|
||||||
|
break;
|
||||||
|
case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
|
||||||
|
fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
gen_perf_compare_query_names(const void *v1, const void *v2)
|
gen_perf_compare_query_names(const void *v1, const void *v2)
|
||||||
{
|
{
|
||||||
@@ -1252,6 +1294,8 @@ gen_perf_init_query_fields(struct gen_perf_config *perf_cfg,
|
|||||||
/* MI_RPC requires a 64byte alignment. */
|
/* MI_RPC requires a 64byte alignment. */
|
||||||
layout->alignment = 64;
|
layout->alignment = 64;
|
||||||
|
|
||||||
|
layout->fields = rzalloc_array(perf_cfg, struct gen_perf_query_field, 5 + 16);
|
||||||
|
|
||||||
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_MI_RPC,
|
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_MI_RPC,
|
||||||
0, 256, 0);
|
0, 256, 0);
|
||||||
|
|
||||||
@@ -1280,6 +1324,28 @@ gen_perf_init_query_fields(struct gen_perf_config *perf_cfg,
|
|||||||
GEN9_RPSTAT0, 4, 0);
|
GEN9_RPSTAT0, 4, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!can_use_mi_rpc_bc_counters(devinfo)) {
|
||||||
|
if (devinfo->gen >= 8 && devinfo->gen <= 11) {
|
||||||
|
for (uint32_t i = 0; i < GEN8_N_OA_PERF_B32; i++) {
|
||||||
|
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
|
||||||
|
GEN8_OA_PERF_B32(i), 4, i);
|
||||||
|
}
|
||||||
|
for (uint32_t i = 0; i < GEN8_N_OA_PERF_C32; i++) {
|
||||||
|
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
|
||||||
|
GEN8_OA_PERF_C32(i), 4, i);
|
||||||
|
}
|
||||||
|
} else if (devinfo->gen == 12) {
|
||||||
|
for (uint32_t i = 0; i < GEN12_N_OAG_PERF_B32; i++) {
|
||||||
|
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
|
||||||
|
GEN12_OAG_PERF_B32(i), 4, i);
|
||||||
|
}
|
||||||
|
for (uint32_t i = 0; i < GEN12_N_OAG_PERF_C32; i++) {
|
||||||
|
add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
|
||||||
|
GEN12_OAG_PERF_C32(i), 4, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Align the whole package to 64bytes so that 2 snapshots can be put
|
/* Align the whole package to 64bytes so that 2 snapshots can be put
|
||||||
* together without extract alignment for the user.
|
* together without extract alignment for the user.
|
||||||
*/
|
*/
|
||||||
|
@@ -266,7 +266,9 @@ struct gen_perf_query_field_layout {
|
|||||||
enum gen_perf_query_field_type {
|
enum gen_perf_query_field_type {
|
||||||
GEN_PERF_QUERY_FIELD_TYPE_MI_RPC,
|
GEN_PERF_QUERY_FIELD_TYPE_MI_RPC,
|
||||||
GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
|
GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
|
||||||
GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT
|
GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
|
||||||
|
GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
|
||||||
|
GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
|
||||||
} type;
|
} type;
|
||||||
|
|
||||||
/* Index of register in the given type (for instance A31 or B2,
|
/* Index of register in the given type (for instance A31 or B2,
|
||||||
@@ -431,6 +433,7 @@ void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
|
|||||||
*/
|
*/
|
||||||
void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
|
void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
|
||||||
const struct gen_perf_query_info *query,
|
const struct gen_perf_query_info *query,
|
||||||
|
const struct gen_device_info *devinfo,
|
||||||
const uint32_t *start,
|
const uint32_t *start,
|
||||||
const uint32_t *end);
|
const uint32_t *end);
|
||||||
|
|
||||||
@@ -446,6 +449,12 @@ void gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *resul
|
|||||||
|
|
||||||
void gen_perf_query_result_clear(struct gen_perf_query_result *result);
|
void gen_perf_query_result_clear(struct gen_perf_query_result *result);
|
||||||
|
|
||||||
|
/** Debug helper printing out query data.
|
||||||
|
*/
|
||||||
|
void gen_perf_query_result_print_fields(const struct gen_perf_query_info *query,
|
||||||
|
const struct gen_device_info *devinfo,
|
||||||
|
const void *data);
|
||||||
|
|
||||||
static inline size_t
|
static inline size_t
|
||||||
gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
|
gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter)
|
||||||
{
|
{
|
||||||
|
@@ -1309,6 +1309,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
|
|||||||
if (add) {
|
if (add) {
|
||||||
gen_perf_query_result_accumulate(&query->oa.result,
|
gen_perf_query_result_accumulate(&query->oa.result,
|
||||||
query->queryinfo,
|
query->queryinfo,
|
||||||
|
devinfo,
|
||||||
last, report);
|
last, report);
|
||||||
} else {
|
} else {
|
||||||
/* We're not adding the delta because we've identified it's not
|
/* We're not adding the delta because we've identified it's not
|
||||||
@@ -1337,7 +1338,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
|
|||||||
end:
|
end:
|
||||||
|
|
||||||
gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
|
gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
|
||||||
last, end);
|
devinfo, last, end);
|
||||||
|
|
||||||
query->oa.results_accumulated = true;
|
query->oa.results_accumulated = true;
|
||||||
drop_from_unaccumulated_query_list(perf_ctx, query);
|
drop_from_unaccumulated_query_list(perf_ctx, query);
|
||||||
|
@@ -46,6 +46,32 @@
|
|||||||
#define PERF_CNT_2_DW0 0x91c0
|
#define PERF_CNT_2_DW0 0x91c0
|
||||||
#define PERF_CNT_VALUE_MASK ((1ull << 44) - 1)
|
#define PERF_CNT_VALUE_MASK ((1ull << 44) - 1)
|
||||||
|
|
||||||
|
/* Global OA perf counters */
|
||||||
|
/* Gen7: number of 32-bit A counters, and the offset of counter idx
 * (4-byte stride starting at 0x2800).
 * NOTE(review): the offsets in this group look like MMIO register
 * addresses — confirm against the hardware documentation.
 */
#define GEN7_N_OA_PERF_A32 44
|
||||||
|
#define GEN7_OA_PERF_A32(idx) (0x2800 + (idx) * 4)
|
||||||
|
|
||||||
|
/* Gen8 group: counts of A64/A32/B32/C32 counters followed by the offset
 * macros for each kind. The B32/C32 macros are what the query layout code
 * passes as the register of its SRM_OA_B / SRM_OA_C fields in its Gen8-11
 * branch.
 */
#define GEN8_OA_PERF_TICKS 0x2910
|
||||||
|
#define GEN8_N_OA_PERF_A64 32
|
||||||
|
#define GEN8_N_OA_PERF_A32 4
|
||||||
|
#define GEN8_N_OA_PERF_B32 8
|
||||||
|
#define GEN8_N_OA_PERF_C32 8
|
||||||
|
/* Each A64 counter takes 8 bytes: _LDW is the first dword, _UDW the one
 * 4 bytes after it (presumably the low and high halves — TODO confirm).
 */
#define GEN8_OA_PERF_A64_LDW(idx) (0x2800 + (idx) * 8)
|
||||||
|
#define GEN8_OA_PERF_A64_UDW(idx) (0x2800 + (idx) * 8 + 4)
|
||||||
|
#define GEN8_OA_PERF_A32(idx) (0x2900 + (idx) * 4)
|
||||||
|
#define GEN8_OA_PERF_B32(idx) (0x2920 + (idx) * 4)
|
||||||
|
#define GEN8_OA_PERF_C32(idx) (0x2940 + (idx) * 4)
|
||||||
|
|
||||||
|
/* Gen12 OAG group, same organisation as the Gen8 one but at different
 * offsets. The B32/C32 macros are what the query layout code passes as the
 * register of its SRM_OA_B / SRM_OA_C fields when devinfo->gen == 12.
 */
#define GEN12_OAG_PERF_TICKS 0xda90
|
||||||
|
#define GEN12_N_OAG_PERF_A64 32
|
||||||
|
#define GEN12_N_OAG_PERF_A32 4
|
||||||
|
#define GEN12_N_OAG_PERF_B32 8
|
||||||
|
#define GEN12_N_OAG_PERF_C32 8
|
||||||
|
#define GEN12_OAG_PERF_A64_LDW(idx) (0xd980 + (idx) * 8)
|
||||||
|
#define GEN12_OAG_PERF_A64_UDW(idx) (0xd980 + (idx) * 8 + 4)
|
||||||
|
#define GEN12_OAG_PERF_A32(idx) (0xda80 + (idx) * 4)
|
||||||
|
#define GEN12_OAG_PERF_B32(idx) (0xda94 + (idx) * 4)
|
||||||
|
#define GEN12_OAG_PERF_C32(idx) (0xdab4 + (idx) * 4)
|
||||||
|
|
||||||
/* Pipeline statistic counters */
|
/* Pipeline statistic counters */
|
||||||
#define IA_VERTICES_COUNT 0x2310
|
#define IA_VERTICES_COUNT 0x2310
|
||||||
#define IA_PRIMITIVES_COUNT 0x2318
|
#define IA_PRIMITIVES_COUNT 0x2318
|
||||||
|
@@ -518,7 +518,8 @@ VkResult genX(GetQueryPoolResults)(
|
|||||||
const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true);
|
const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true);
|
||||||
struct gen_perf_query_result result;
|
struct gen_perf_query_result result;
|
||||||
gen_perf_query_result_clear(&result);
|
gen_perf_query_result_clear(&result);
|
||||||
gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end);
|
gen_perf_query_result_accumulate(&result, pool->pass_query[p],
|
||||||
|
&device->info, begin, end);
|
||||||
anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
|
anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -536,7 +537,8 @@ VkResult genX(GetQueryPoolResults)(
|
|||||||
const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true);
|
const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true);
|
||||||
struct gen_perf_query_result result;
|
struct gen_perf_query_result result;
|
||||||
gen_perf_query_result_clear(&result);
|
gen_perf_query_result_clear(&result);
|
||||||
gen_perf_query_result_accumulate(&result, query, oa_begin, oa_end);
|
gen_perf_query_result_accumulate(&result, query, &device->info,
|
||||||
|
oa_begin, oa_end);
|
||||||
gen_perf_query_result_read_frequencies(&result, &device->info,
|
gen_perf_query_result_read_frequencies(&result, &device->info,
|
||||||
oa_begin, oa_end);
|
oa_begin, oa_end);
|
||||||
gen_perf_query_result_read_gt_frequency(&result, &device->info,
|
gen_perf_query_result_read_gt_frequency(&result, &device->info,
|
||||||
|
Reference in New Issue
Block a user