diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c
index 4530bb02e33..85cc4ec1338 100644
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -423,6 +423,7 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev
    perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
    perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
    perf->sys_vars.revision = devinfo->revision;
+   perf->sys_vars.query_mode = true;
    compute_topology_builtins(perf, devinfo);
 
    return true;
@@ -1117,6 +1118,24 @@ gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result,
    result->gt_frequency[1] *= 1000000ULL;
 }
 
+void
+gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
+                                    const struct gen_perf_query_info *query,
+                                    const uint64_t *start,
+                                    const uint64_t *end)
+{
+   for (uint32_t i = 0; i < 2; i++) {
+      /* Mask the difference rather than each snapshot, so that a counter
+       * that wrapped between start and end still gives the elapsed count
+       * instead of a 64-bit underflow. Assumes PERF_CNT_VALUE_MASK is a
+       * contiguous low-bit mask, like the (1ull << 44) - 1 previously used
+       * for these registers.
+       */
+      result->accumulator[query->perfcnt_offset + i] =
+         (end[i] - start[i]) & PERF_CNT_VALUE_MASK;
+   }
+}
+
 void
 gen_perf_query_result_clear(struct gen_perf_query_result *result)
 {
diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h
index 4348c731109..bbc87495821 100644
--- a/src/intel/perf/gen_perf.h
+++ b/src/intel/perf/gen_perf.h
@@ -108,8 +108,10 @@ struct gen_pipeline_stat {
  * 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
  * For Gen8+
  * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
+ *
+ * Plus 2 PERF_CNT registers.
  */
-#define MAX_OA_REPORT_COUNTERS 62
+#define MAX_OA_REPORT_COUNTERS (62 + 2)
 
 /*
  * When currently allocate only one page for pipeline statistics queries. Here
@@ -180,10 +182,10 @@ struct gen_perf_query_counter {
    union {
       uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf,
                                          const struct gen_perf_query_info *query,
-                                         const uint64_t *accumulator);
+                                         const struct gen_perf_query_result *results);
       float (*oa_counter_read_float)(struct gen_perf_config *perf,
                                      const struct gen_perf_query_info *query,
-                                     const uint64_t *accumulator);
+                                     const struct gen_perf_query_result *results);
       struct gen_pipeline_stat pipeline_stat;
    };
 };
@@ -231,6 +233,7 @@ struct gen_perf_query_info {
    int a_offset;
    int b_offset;
    int c_offset;
+   int perfcnt_offset;
 
    struct gen_perf_registers config;
 };
@@ -282,6 +285,7 @@ struct gen_perf_config {
       uint64_t gt_min_freq; /** $GpuMinFrequency */
       uint64_t gt_max_freq; /** $GpuMaxFrequency */
       uint64_t revision; /** $SkuRevisionId */
+      bool query_mode; /** $QueryMode */
    } sys_vars;
 
    /* OA metric sets, indexed by GUID, as know by Mesa at build time, to
@@ -370,6 +374,14 @@ void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *resul
                                              const uint32_t start,
                                              const uint32_t end);
 
+/** Store the deltas (end - start) of the 2 PERF_CNT registers into result, at
+ * the query's perfcnt_offset in the accumulator.
+ */
+void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
+                                         const struct gen_perf_query_info *query,
+                                         const uint64_t *start,
+                                         const uint64_t *end);
+
 /** Accumulate the delta between 2 OA reports into result for a given query.
  */
 void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
diff --git a/src/intel/perf/gen_perf.py b/src/intel/perf/gen_perf.py
index c1e2b2a54b3..b35b6a0482c 100644
--- a/src/intel/perf/gen_perf.py
+++ b/src/intel/perf/gen_perf.py
@@ -94,7 +94,15 @@ def emit_fsub(tmp_id, args):
 
 def emit_read(tmp_id, args):
     type = args[1].lower()
-    c("uint64_t tmp{0} = accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
+    c("uint64_t tmp{0} = results->accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
+    return tmp_id + 1
+
+def emit_read_reg(tmp_id, args):
+    offsets = {
+        'PERFCNT1': 0,
+        'PERFCNT2': 1,
+    }
+    c("uint64_t tmp{0} = results->accumulator[query->perfcnt_offset + {1}];".format(tmp_id, offsets[args[0]]))
     return tmp_id + 1
 
 def emit_uadd(tmp_id, args):
@@ -144,6 +152,7 @@ ops["FMAX"] = (2, emit_fmax)
 ops["FMUL"] = (2, emit_fmul)
 ops["FSUB"] = (2, emit_fsub)
 ops["READ"] = (2, emit_read)
+ops["READ_REG"] = (1, emit_read_reg)
 ops["UADD"] = (2, emit_uadd)
 ops["UDIV"] = (2, emit_udiv)
 ops["UMUL"] = (2, emit_umul)
@@ -193,6 +202,7 @@ hw_vars["$GpuTimestampFrequency"] = "perf->sys_vars.timestamp_frequency"
 hw_vars["$GpuMinFrequency"] = "perf->sys_vars.gt_min_freq"
 hw_vars["$GpuMaxFrequency"] = "perf->sys_vars.gt_max_freq"
 hw_vars["$SkuRevisionId"] = "perf->sys_vars.revision"
+hw_vars["$QueryMode"] = "perf->sys_vars.query_mode"
 
 def output_rpn_equation_code(set, counter, equation):
     c("/* RPN equation: " + equation + " */")
@@ -214,7 +224,7 @@ def output_rpn_equation_code(set, counter, equation):
                         operand = hw_vars[operand]
                     elif operand in set.counter_vars:
                         reference = set.counter_vars[operand]
-                        operand = set.read_funcs[operand[1:]] + "(perf, query, accumulator)"
+                        operand = set.read_funcs[operand[1:]] + "(perf, query, results)"
                     else:
                         raise Exception("Failed to resolve variable " + operand + " in equation " + equation + " for " + set.name + " :: " + counter.get('name'));
                 args.append(operand)
@@ -234,7 +244,7 @@ def output_rpn_equation_code(set, counter, equation):
         if value in hw_vars:
             value = hw_vars[value]
         if value in set.counter_vars:
-            value = set.read_funcs[value[1:]] + "(perf, query, accumulator)"
+            value = set.read_funcs[value[1:]] + "(perf, query, results)"
 
     c("\nreturn " + value + ";")
 
@@ -288,7 +298,7 @@ def output_counter_read(gen, set, counter):
     c(counter.read_sym + "(UNUSED struct gen_perf_config *perf,\n")
     c_indent(len(counter.read_sym) + 1)
     c("const struct gen_perf_query_info *query,\n")
-    c("const uint64_t *accumulator)\n")
+    c("const struct gen_perf_query_result *results)\n")
     c_outdent(len(counter.read_sym) + 1)
 
     c("{")
@@ -729,19 +739,21 @@ def main():
                     query->oa_format = I915_OA_FORMAT_A45_B8_C8;
                     /* Accumulation buffer offsets... */
                     query->gpu_time_offset = 0;
-                    query->a_offset = 1;
-                    query->b_offset = 46;
-                    query->c_offset = 54;
+                    query->a_offset = query->gpu_time_offset + 1;
+                    query->b_offset = query->a_offset + 45;
+                    query->c_offset = query->b_offset + 8;
+                    query->perfcnt_offset = query->c_offset + 8;
                 """))
             else:
                 c(textwrap.dedent("""\
                     query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
                     /* Accumulation buffer offsets... */
                     query->gpu_time_offset = 0;
-                    query->gpu_clock_offset = 1;
-                    query->a_offset = 2;
-                    query->b_offset = 38;
-                    query->c_offset = 46;
+                    query->gpu_clock_offset = query->gpu_time_offset + 1;
+                    query->a_offset = query->gpu_clock_offset + 1;
+                    query->b_offset = query->a_offset + 36;
+                    query->c_offset = query->b_offset + 8;
+                    query->perfcnt_offset = query->c_offset + 8;
                 """))
 
 
diff --git a/src/intel/perf/gen_perf_mdapi.c b/src/intel/perf/gen_perf_mdapi.c
index 2452b99f59f..aad5e4e0202 100644
--- a/src/intel/perf/gen_perf_mdapi.c
+++ b/src/intel/perf/gen_perf_mdapi.c
@@ -54,6 +54,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
          gen_device_info_timebase_scale(devinfo, result->accumulator[0]);
@@ -75,6 +78,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportId = result->hw_id;
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
@@ -106,6 +112,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportId = result->hw_id;
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
@@ -354,5 +363,6 @@ gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf,
       query->a_offset = copy_query->a_offset;
       query->b_offset = copy_query->b_offset;
       query->c_offset = copy_query->c_offset;
+      query->perfcnt_offset = copy_query->perfcnt_offset;
    }
 }
diff --git a/src/intel/perf/gen_perf_mdapi.h b/src/intel/perf/gen_perf_mdapi.h
index acf1edd6e79..05717d11811 100644
--- a/src/intel/perf/gen_perf_mdapi.h
+++ b/src/intel/perf/gen_perf_mdapi.h
@@ -132,41 +132,6 @@ int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
                                       const struct gen_perf_query_info *query,
                                       const struct gen_perf_query_result *result);
 
-static inline void gen_perf_query_mdapi_write_perfcntr(void *data, uint32_t data_size,
-                                                       const struct gen_device_info *devinfo,
-                                                       const uint64_t *begin_perf_cntrs,
-                                                       const uint64_t *end_perf_cntrs)
-{
-   /* Only bits 0:43 of the 64bit registers contains the value. */
-   const uint64_t mask = (1ull << 44) - 1;
-
-   switch (devinfo->gen) {
-   case 8: {
-      if (data_size < sizeof(struct gen8_mdapi_metrics))
-         return;
-      struct gen8_mdapi_metrics *mdapi_data = data;
-      mdapi_data->PerfCounter1 =
-         (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask);
-      mdapi_data->PerfCounter2 =
-         (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask);
-      break;
-   }
-   case 9:
-   case 11: {
-      if (data_size < sizeof(struct gen9_mdapi_metrics))
-         return;
-      struct gen9_mdapi_metrics *mdapi_data = data;
-      mdapi_data->PerfCounter1 =
-         (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask);
-      mdapi_data->PerfCounter2 =
-         (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask);
-      break;
-   }
-   default:
-      break;
-   }
-}
-
 static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size,
                                                      const struct gen_device_info *devinfo,
                                                      uint64_t value)
diff --git a/src/intel/perf/gen_perf_query.c b/src/intel/perf/gen_perf_query.c
index e6d38b6bb72..a1204b830be 100644
--- a/src/intel/perf/gen_perf_query.c
+++ b/src/intel/perf/gen_perf_query.c
@@ -1423,13 +1423,13 @@ get_oa_counter_data(struct gen_perf_context *perf_ctx,
             out_uint64 = (uint64_t *)(data + counter->offset);
             *out_uint64 =
                counter->oa_counter_read_uint64(perf_cfg, queryinfo,
-                                               query->oa.result.accumulator);
+                                               &query->oa.result);
             break;
          case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
             out_float = (float *)(data + counter->offset);
             *out_float =
                counter->oa_counter_read_float(perf_cfg, queryinfo,
-                                              query->oa.result.accumulator);
+                                              &query->oa.result);
             break;
          default:
             /* So far we aren't using uint32, double or bool32... */
diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c
index 0a323994cb6..35bd2468f67 100644
--- a/src/intel/vulkan/anv_perf.c
+++ b/src/intel/vulkan/anv_perf.c
@@ -421,13 +421,13 @@ anv_perf_write_pass_results(struct gen_perf_config *perf,
          results[c].uint64 =
             counter_pass->counter->oa_counter_read_uint64(perf,
                                                           counter_pass->query,
-                                                          accumulated_results->accumulator);
+                                                          accumulated_results);
          break;
       case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
          results[c].float32 =
             counter_pass->counter->oa_counter_read_float(perf,
                                                          counter_pass->query,
-                                                         accumulated_results->accumulator);
+                                                         accumulated_results);
          break;
       default:
          /* So far we aren't using uint32, double or bool32... */
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 5994488960d..ab3f6d0da77 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -326,7 +326,7 @@ intel_perf_rpstart_offset(bool end)
    return 16 + (end ? sizeof(uint32_t) : 0);
 }
 
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
 static uint32_t
 intel_perf_counter(bool end)
 {
@@ -541,14 +541,14 @@ VkResult genX(GetQueryPoolResults)(
                                           oa_begin, oa_end);
          gen_perf_query_result_read_gt_frequency(&result, &device->info,
                                                  *rpstat_begin, *rpstat_end);
-         gen_perf_query_result_write_mdapi(pData, stride,
-                                           &device->info,
-                                           query, &result);
-#if GEN_GEN >= 8 && GEN_GEN <= 11
-         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
+         gen_perf_query_result_read_perfcnts(&result, query,
                                              query_data + intel_perf_counter(false),
                                              query_data + intel_perf_counter(true));
 #endif
+         gen_perf_query_result_write_mdapi(pData, stride,
+                                           &device->info,
+                                           query, &result);
          const uint64_t *marker = query_data + intel_perf_marker_offset();
          gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
          break;
@@ -913,7 +913,7 @@ void genX(CmdBeginQueryIndexedEXT)(
                                                     intel_perf_rpstart_offset(false))),
                    gen_mi_reg32(GENX(RPSTAT0_num)));
 #endif
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                     intel_perf_counter(false))),
                    gen_mi_reg64(GENX(PERFCNT1_num)));
@@ -1047,7 +1047,7 @@ void genX(CmdEndQueryIndexedEXT)(
       uint32_t marker_offset = intel_perf_marker_offset();
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                    gen_mi_imm(cmd_buffer->intel_perf_marker));
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                    gen_mi_reg64(GENX(PERFCNT1_num)));
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),