From a6e980e9bf6c33f4166b423ead0d221c76c2bcde Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 4 Jul 2019 20:34:28 +0300 Subject: [PATCH] intel/perf: prep work to enable new perf counters Those are not part of the OA reports and need some additional scaffolding. Those counters are only available when doing queries as we need to emit MI_SRMs to record them. Equations making use of those counters are not there yet, they will come in a follow up commit updating a bunch of oa-*.xml files. v2: Fix typo v3: Use PERF_CNT_VALUE_MASK (Marcin) Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/perf/gen_perf.c | 13 ++++++++++++ src/intel/perf/gen_perf.h | 17 +++++++++++++--- src/intel/perf/gen_perf.py | 34 +++++++++++++++++++++----------- src/intel/perf/gen_perf_mdapi.c | 10 ++++++++++ src/intel/perf/gen_perf_mdapi.h | 35 --------------------------------- src/intel/perf/gen_perf_query.c | 4 ++-- src/intel/vulkan/anv_perf.c | 4 ++-- src/intel/vulkan/genX_query.c | 16 +++++++-------- 8 files changed, 72 insertions(+), 61 deletions(-) diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c index 4530bb02e33..85cc4ec1338 100644 --- a/src/intel/perf/gen_perf.c +++ b/src/intel/perf/gen_perf.c @@ -423,6 +423,7 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000; perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency; perf->sys_vars.revision = devinfo->revision; + perf->sys_vars.query_mode = true; compute_topology_builtins(perf, devinfo); return true; @@ -1117,6 +1118,18 @@ gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result, result->gt_frequency[1] *= 1000000ULL; } +void +gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint64_t *start, + const uint64_t *end) +{ + for (uint32_t i = 0; i < 2; i++) { +
result->accumulator[query->perfcnt_offset + i] = + (end[i] - start[i]) & PERF_CNT_VALUE_MASK; /* Mask the delta, not each sample: if the counter wraps between start and end, the masked end is smaller than the masked start and an unmasked subtraction would store a huge 64-bit value. Assumes PERF_CNT_VALUE_MASK is a contiguous low-bit mask. */ + } +} + void gen_perf_query_result_clear(struct gen_perf_query_result *result) { diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h index 4348c731109..bbc87495821 100644 --- a/src/intel/perf/gen_perf.h +++ b/src/intel/perf/gen_perf.h @@ -108,8 +108,10 @@ struct gen_pipeline_stat { * 1 timestamp, 45 A counters, 8 B counters and 8 C counters. * For Gen8+ * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters + * + * Plus 2 PERF_CNT registers. */ -#define MAX_OA_REPORT_COUNTERS 62 +#define MAX_OA_REPORT_COUNTERS (62 + 2) /* * When currently allocate only one page for pipeline statistics queries. Here @@ -180,10 +182,10 @@ struct gen_perf_query_counter { union { uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const uint64_t *accumulator); + const struct gen_perf_query_result *results); float (*oa_counter_read_float)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const uint64_t *accumulator); + const struct gen_perf_query_result *results); struct gen_pipeline_stat pipeline_stat; }; }; @@ -231,6 +233,7 @@ struct gen_perf_query_info { int a_offset; int b_offset; int c_offset; + int perfcnt_offset; struct gen_perf_registers config; }; @@ -282,6 +285,7 @@ struct gen_perf_config { uint64_t gt_min_freq; /** $GpuMinFrequency */ uint64_t gt_max_freq; /** $GpuMaxFrequency */ uint64_t revision; /** $SkuRevisionId */ + bool query_mode; /** $QueryMode */ } sys_vars; /* OA metric sets, indexed by GUID, as know by Mesa at build time, to @@ -370,6 +374,13 @@ void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *resul const uint32_t start, const uint32_t end); +/** Store the deltas (end - start) of the two PERFCNT registers in the result accumulator at query->perfcnt_offset. 
+ */ +void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint64_t *start, + const uint64_t *end); + /** Accumulate the delta between 2 OA reports into result for a given query. */ void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, diff --git a/src/intel/perf/gen_perf.py b/src/intel/perf/gen_perf.py index c1e2b2a54b3..b35b6a0482c 100644 --- a/src/intel/perf/gen_perf.py +++ b/src/intel/perf/gen_perf.py @@ -94,7 +94,15 @@ def emit_fsub(tmp_id, args): def emit_read(tmp_id, args): type = args[1].lower() - c("uint64_t tmp{0} = accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0])) + c("uint64_t tmp{0} = results->accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0])) + return tmp_id + 1 + +def emit_read_reg(tmp_id, args): + offsets = { + 'PERFCNT1': 0, + 'PERFCNT2': 1, + } + c("uint64_t tmp{0} = results->accumulator[query->perfcnt_offset + {1}];".format(tmp_id, offsets[args[0]])) return tmp_id + 1 def emit_uadd(tmp_id, args): @@ -144,6 +152,7 @@ ops["FMAX"] = (2, emit_fmax) ops["FMUL"] = (2, emit_fmul) ops["FSUB"] = (2, emit_fsub) ops["READ"] = (2, emit_read) +ops["READ_REG"] = (1, emit_read_reg) ops["UADD"] = (2, emit_uadd) ops["UDIV"] = (2, emit_udiv) ops["UMUL"] = (2, emit_umul) @@ -193,6 +202,7 @@ hw_vars["$GpuTimestampFrequency"] = "perf->sys_vars.timestamp_frequency" hw_vars["$GpuMinFrequency"] = "perf->sys_vars.gt_min_freq" hw_vars["$GpuMaxFrequency"] = "perf->sys_vars.gt_max_freq" hw_vars["$SkuRevisionId"] = "perf->sys_vars.revision" +hw_vars["$QueryMode"] = "perf->sys_vars.query_mode" def output_rpn_equation_code(set, counter, equation): c("/* RPN equation: " + equation + " */") @@ -214,7 +224,7 @@ def output_rpn_equation_code(set, counter, equation): operand = hw_vars[operand] elif operand in set.counter_vars: reference = set.counter_vars[operand] - operand = set.read_funcs[operand[1:]] + "(perf, query, accumulator)" + operand = 
set.read_funcs[operand[1:]] + "(perf, query, results)" else: raise Exception("Failed to resolve variable " + operand + " in equation " + equation + " for " + set.name + " :: " + counter.get('name')); args.append(operand) @@ -234,7 +244,7 @@ def output_rpn_equation_code(set, counter, equation): if value in hw_vars: value = hw_vars[value] if value in set.counter_vars: - value = set.read_funcs[value[1:]] + "(perf, query, accumulator)" + value = set.read_funcs[value[1:]] + "(perf, query, results)" c("\nreturn " + value + ";") @@ -288,7 +298,7 @@ def output_counter_read(gen, set, counter): c(counter.read_sym + "(UNUSED struct gen_perf_config *perf,\n") c_indent(len(counter.read_sym) + 1) c("const struct gen_perf_query_info *query,\n") - c("const uint64_t *accumulator)\n") + c("const struct gen_perf_query_result *results)\n") c_outdent(len(counter.read_sym) + 1) c("{") @@ -729,19 +739,21 @@ def main(): query->oa_format = I915_OA_FORMAT_A45_B8_C8; /* Accumulation buffer offsets... */ query->gpu_time_offset = 0; - query->a_offset = 1; - query->b_offset = 46; - query->c_offset = 54; + query->a_offset = query->gpu_time_offset + 1; + query->b_offset = query->a_offset + 45; + query->c_offset = query->b_offset + 8; + query->perfcnt_offset = query->c_offset + 8; """)) else: c(textwrap.dedent("""\ query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; /* Accumulation buffer offsets... 
*/ query->gpu_time_offset = 0; - query->gpu_clock_offset = 1; - query->a_offset = 2; - query->b_offset = 38; - query->c_offset = 46; + query->gpu_clock_offset = query->gpu_time_offset + 1; + query->a_offset = query->gpu_clock_offset + 1; + query->b_offset = query->a_offset + 36; + query->c_offset = query->b_offset + 8; + query->perfcnt_offset = query->c_offset + 8; """)) diff --git a/src/intel/perf/gen_perf_mdapi.c b/src/intel/perf/gen_perf_mdapi.c index 2452b99f59f..aad5e4e0202 100644 --- a/src/intel/perf/gen_perf_mdapi.c +++ b/src/intel/perf/gen_perf_mdapi.c @@ -54,6 +54,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i]; } + mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; + mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; + mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); @@ -75,6 +78,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } + mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; + mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; + mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = @@ -106,6 +112,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } + mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; + mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; + mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = @@ -354,5 +363,6 @@ gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf, query->a_offset = copy_query->a_offset; 
query->b_offset = copy_query->b_offset; query->c_offset = copy_query->c_offset; + query->perfcnt_offset = copy_query->perfcnt_offset; } } diff --git a/src/intel/perf/gen_perf_mdapi.h b/src/intel/perf/gen_perf_mdapi.h index acf1edd6e79..05717d11811 100644 --- a/src/intel/perf/gen_perf_mdapi.h +++ b/src/intel/perf/gen_perf_mdapi.h @@ -132,41 +132,6 @@ int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_perf_query_info *query, const struct gen_perf_query_result *result); -static inline void gen_perf_query_mdapi_write_perfcntr(void *data, uint32_t data_size, - const struct gen_device_info *devinfo, - const uint64_t *begin_perf_cntrs, - const uint64_t *end_perf_cntrs) -{ - /* Only bits 0:43 of the 64bit registers contains the value. */ - const uint64_t mask = (1ull << 44) - 1; - - switch (devinfo->gen) { - case 8: { - if (data_size < sizeof(struct gen8_mdapi_metrics)) - return; - struct gen8_mdapi_metrics *mdapi_data = data; - mdapi_data->PerfCounter1 = - (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask); - mdapi_data->PerfCounter2 = - (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask); - break; - } - case 9: - case 11: { - if (data_size < sizeof(struct gen9_mdapi_metrics)) - return; - struct gen9_mdapi_metrics *mdapi_data = data; - mdapi_data->PerfCounter1 = - (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask); - mdapi_data->PerfCounter2 = - (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask); - break; - } - default: - break; - } -} - static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size, const struct gen_device_info *devinfo, uint64_t value) diff --git a/src/intel/perf/gen_perf_query.c b/src/intel/perf/gen_perf_query.c index e6d38b6bb72..a1204b830be 100644 --- a/src/intel/perf/gen_perf_query.c +++ b/src/intel/perf/gen_perf_query.c @@ -1423,13 +1423,13 @@ get_oa_counter_data(struct gen_perf_context *perf_ctx, out_uint64 = (uint64_t *)(data + counter->offset); *out_uint64 = 
counter->oa_counter_read_uint64(perf_cfg, queryinfo, - query->oa.result.accumulator); + &query->oa.result); break; case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: out_float = (float *)(data + counter->offset); *out_float = counter->oa_counter_read_float(perf_cfg, queryinfo, - query->oa.result.accumulator); + &query->oa.result); break; default: /* So far we aren't using uint32, double or bool32... */ diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c index 0a323994cb6..35bd2468f67 100644 --- a/src/intel/vulkan/anv_perf.c +++ b/src/intel/vulkan/anv_perf.c @@ -421,13 +421,13 @@ anv_perf_write_pass_results(struct gen_perf_config *perf, results[c].uint64 = counter_pass->counter->oa_counter_read_uint64(perf, counter_pass->query, - accumulated_results->accumulator); + accumulated_results); break; case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: results[c].float32 = counter_pass->counter->oa_counter_read_float(perf, counter_pass->query, - accumulated_results->accumulator); + accumulated_results); break; default: /* So far we aren't using uint32, double or bool32... */ diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 5994488960d..ab3f6d0da77 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -326,7 +326,7 @@ intel_perf_rpstart_offset(bool end) return 16 + (end ? 
sizeof(uint32_t) : 0); } -#if GEN_GEN >= 8 && GEN_GEN <= 11 +#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11) static uint32_t intel_perf_counter(bool end) { @@ -541,14 +541,14 @@ VkResult genX(GetQueryPoolResults)( oa_begin, oa_end); gen_perf_query_result_read_gt_frequency(&result, &device->info, *rpstat_begin, *rpstat_end); - gen_perf_query_result_write_mdapi(pData, stride, - &device->info, - query, &result); -#if GEN_GEN >= 8 && GEN_GEN <= 11 - gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info, +#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11) + gen_perf_query_result_read_perfcnts(&result, query, query_data + intel_perf_counter(false), query_data + intel_perf_counter(true)); #endif + gen_perf_query_result_write_mdapi(pData, stride, + &device->info, + query, &result); const uint64_t *marker = query_data + intel_perf_marker_offset(); gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); break; @@ -913,7 +913,7 @@ void genX(CmdBeginQueryIndexedEXT)( intel_perf_rpstart_offset(false))), gen_mi_reg32(GENX(RPSTAT0_num))); #endif -#if GEN_GEN >= 8 && GEN_GEN <= 11 +#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11) gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(false))), gen_mi_reg64(GENX(PERFCNT1_num))); @@ -1047,7 +1047,7 @@ void genX(CmdEndQueryIndexedEXT)( uint32_t marker_offset = intel_perf_marker_offset(); gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)), gen_mi_imm(cmd_buffer->intel_perf_marker)); -#if GEN_GEN >= 8 && GEN_GEN <= 11 +#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11) gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))), gen_mi_reg64(GENX(PERFCNT1_num))); gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),