anv: implement VK_INTEL_performance_query
v2: Introduce the appropriate pipe controls Properly deal with changes in metric sets (using execbuf parameter) Record marker at query end v3: Fill out PerfCntr1&2 v4: Introduce vkUninitializePerformanceApiINTEL v5: Use new execbuf extension mechanism v6: Fix comments in genX_query.c (Rafael) Use PIPE_CONTROL workarounds (Rafael) Refactor on the last kernel series update (Lionel) v7: Only I915_PERF_IOCTL_CONFIG when perf stream is already opened (Lionel) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com>
This commit is contained in:
@@ -37,6 +37,10 @@
|
||||
#define __gen_get_batch_dwords anv_batch_emit_dwords
|
||||
#define __gen_address_offset anv_address_add
|
||||
#include "common/gen_mi_builder.h"
|
||||
#include "perf/gen_perf.h"
|
||||
#include "perf/gen_perf_mdapi.h"
|
||||
|
||||
#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
|
||||
|
||||
VkResult genX(CreateQueryPool)(
|
||||
VkDevice _device,
|
||||
@@ -52,9 +56,14 @@ VkResult genX(CreateQueryPool)(
|
||||
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
|
||||
|
||||
/* Query pool slots are made up of some number of 64-bit values packed
|
||||
* tightly together. The first 64-bit value is always the "available" bit
|
||||
* which is 0 when the query is unavailable and 1 when it is available.
|
||||
* The 64-bit values that follow are determined by the type of query.
|
||||
* tightly together. For most query types have the first 64-bit value is
|
||||
* the "available" bit which is 0 when the query is unavailable and 1 when
|
||||
* it is available. The 64-bit values that follow are determined by the
|
||||
* type of query.
|
||||
*
|
||||
* For performance queries, we have a requirement to align OA reports at
|
||||
* 64bytes so we put those first and have the "available" bit behind
|
||||
* together with some other counters.
|
||||
*/
|
||||
uint32_t uint64s_per_slot = 1;
|
||||
|
||||
@@ -84,6 +93,15 @@ VkResult genX(CreateQueryPool)(
|
||||
*/
|
||||
uint64s_per_slot += 4;
|
||||
break;
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
|
||||
uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
|
||||
uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
|
||||
uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
|
||||
uint64s_per_slot++; /* 64bit marker */
|
||||
uint64s_per_slot++; /* availability */
|
||||
uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert(!"Invalid query type");
|
||||
}
|
||||
@@ -160,6 +178,57 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* VK_INTEL_performance_query layout:
|
||||
*
|
||||
* ------------------------------
|
||||
* | end MI_RPC (256b) |
|
||||
* |----------------------------|
|
||||
* | begin MI_RPC (256b) |
|
||||
* |----------------------------|
|
||||
* | begin perfcntr 1 & 2 (16b) |
|
||||
* |----------------------------|
|
||||
* | end perfcntr 1 & 2 (16b) |
|
||||
* |----------------------------|
|
||||
* | begin RPSTAT register (4b) |
|
||||
* |----------------------------|
|
||||
* | end RPSTAT register (4b) |
|
||||
* |----------------------------|
|
||||
* | marker (8b) |
|
||||
* |----------------------------|
|
||||
* | availability (8b) |
|
||||
* ------------------------------
|
||||
*/
|
||||
|
||||
static uint32_t
|
||||
intel_perf_mi_rpc_offset(bool end)
|
||||
{
|
||||
return end ? 0 : 256;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
intel_perf_counter(bool end)
|
||||
{
|
||||
uint32_t offset = 512;
|
||||
offset += end ? 2 * sizeof(uint64_t) : 0;
|
||||
return offset;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
intel_perf_rpstart_offset(bool end)
|
||||
{
|
||||
uint32_t offset = intel_perf_counter(false) +
|
||||
4 * sizeof(uint64_t);
|
||||
offset += end ? sizeof(uint32_t) : 0;
|
||||
return offset;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
intel_perf_marker_offset(void)
|
||||
{
|
||||
return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
|
||||
}
|
||||
|
||||
static void
|
||||
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
|
||||
uint32_t value_index, uint64_t result)
|
||||
@@ -173,18 +242,28 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
query_is_available(uint64_t *slot)
|
||||
static void *
|
||||
query_slot(struct anv_query_pool *pool, uint32_t query)
|
||||
{
|
||||
return *(volatile uint64_t *)slot;
|
||||
return pool->bo.map + query * pool->stride;
|
||||
}
|
||||
|
||||
static bool
|
||||
query_is_available(struct anv_query_pool *pool, uint32_t query)
|
||||
{
|
||||
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
|
||||
return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
|
||||
pool->stride - 8);
|
||||
} else
|
||||
return *(volatile uint64_t *)query_slot(pool, query);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
wait_for_available(struct anv_device *device,
|
||||
struct anv_query_pool *pool, uint64_t *slot)
|
||||
struct anv_query_pool *pool, uint32_t query)
|
||||
{
|
||||
while (true) {
|
||||
if (query_is_available(slot))
|
||||
if (query_is_available(pool, query))
|
||||
return VK_SUCCESS;
|
||||
|
||||
int ret = anv_gem_busy(device, pool->bo.gem_handle);
|
||||
@@ -197,7 +276,7 @@ wait_for_available(struct anv_device *device,
|
||||
} else {
|
||||
assert(ret == 0);
|
||||
/* The BO is no longer busy. */
|
||||
if (query_is_available(slot)) {
|
||||
if (query_is_available(pool, query)) {
|
||||
return VK_SUCCESS;
|
||||
} else {
|
||||
VkResult status = anv_device_query_status(device);
|
||||
@@ -233,7 +312,8 @@ VkResult genX(GetQueryPoolResults)(
|
||||
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
|
||||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
|
||||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
|
||||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
|
||||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
|
||||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
|
||||
|
||||
if (anv_device_is_lost(device))
|
||||
return VK_ERROR_DEVICE_LOST;
|
||||
@@ -245,13 +325,10 @@ VkResult genX(GetQueryPoolResults)(
|
||||
|
||||
VkResult status = VK_SUCCESS;
|
||||
for (uint32_t i = 0; i < queryCount; i++) {
|
||||
uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
|
||||
|
||||
/* Availability is always at the start of the slot */
|
||||
bool available = slot[0];
|
||||
bool available = query_is_available(pool, firstQuery + i);
|
||||
|
||||
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
|
||||
status = wait_for_available(device, pool, slot);
|
||||
status = wait_for_available(device, pool, firstQuery + i);
|
||||
if (status != VK_SUCCESS)
|
||||
return status;
|
||||
|
||||
@@ -271,13 +348,16 @@ VkResult genX(GetQueryPoolResults)(
|
||||
|
||||
uint32_t idx = 0;
|
||||
switch (pool->type) {
|
||||
case VK_QUERY_TYPE_OCCLUSION:
|
||||
case VK_QUERY_TYPE_OCCLUSION: {
|
||||
uint64_t *slot = query_slot(pool, firstQuery + i);
|
||||
if (write_results)
|
||||
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
|
||||
idx++;
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
|
||||
uint64_t *slot = query_slot(pool, firstQuery + i);
|
||||
uint32_t statistics = pool->pipeline_statistics;
|
||||
while (statistics) {
|
||||
uint32_t stat = u_bit_scan(&statistics);
|
||||
@@ -297,7 +377,8 @@ VkResult genX(GetQueryPoolResults)(
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
|
||||
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
|
||||
uint64_t *slot = query_slot(pool, firstQuery + i);
|
||||
if (write_results)
|
||||
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
|
||||
idx++;
|
||||
@@ -305,12 +386,54 @@ VkResult genX(GetQueryPoolResults)(
|
||||
cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
|
||||
idx++;
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_QUERY_TYPE_TIMESTAMP:
|
||||
case VK_QUERY_TYPE_TIMESTAMP: {
|
||||
uint64_t *slot = query_slot(pool, firstQuery + i);
|
||||
if (write_results)
|
||||
cpu_write_query_result(pData, flags, idx, slot[1]);
|
||||
idx++;
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
|
||||
if (!write_results)
|
||||
break;
|
||||
const void *query_data = query_slot(pool, firstQuery + i);
|
||||
const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
|
||||
const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
|
||||
const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
|
||||
const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true);
|
||||
struct gen_perf_query_result result;
|
||||
struct gen_perf_query_info metric = {
|
||||
.oa_format = (GEN_GEN >= 8 ?
|
||||
I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
|
||||
I915_OA_FORMAT_A45_B8_C8),
|
||||
};
|
||||
uint32_t core_freq[2];
|
||||
#if GEN_GEN < 9
|
||||
core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
|
||||
core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
|
||||
#else
|
||||
core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
|
||||
core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
|
||||
#endif
|
||||
gen_perf_query_result_clear(&result);
|
||||
gen_perf_query_result_accumulate(&result, &metric,
|
||||
oa_begin, oa_end);
|
||||
gen_perf_query_result_read_frequencies(&result, &device->info,
|
||||
oa_begin, oa_end);
|
||||
gen_perf_query_result_write_mdapi(pData, stride,
|
||||
&device->info,
|
||||
&result,
|
||||
core_freq[0], core_freq[1]);
|
||||
gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
|
||||
query_data + intel_perf_counter(false),
|
||||
query_data + intel_perf_counter(true));
|
||||
const uint64_t *marker = query_data + intel_perf_marker_offset();
|
||||
gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("invalid pool type");
|
||||
@@ -406,6 +529,16 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
|
||||
}
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
|
||||
for (uint32_t i = 0; i < num_queries; i++) {
|
||||
struct anv_address slot_addr =
|
||||
anv_query_address(pool, first_index + i);
|
||||
gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
|
||||
emit_query_mi_availability(b, anv_address_add(slot_addr,
|
||||
pool->stride - 8), true);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Unsupported query type");
|
||||
}
|
||||
@@ -440,6 +573,21 @@ void genX(CmdResetQueryPool)(
|
||||
break;
|
||||
}
|
||||
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
|
||||
struct gen_mi_builder b;
|
||||
gen_mi_builder_init(&b, &cmd_buffer->batch);
|
||||
|
||||
for (uint32_t i = 0; i < queryCount; i++) {
|
||||
emit_query_mi_availability(
|
||||
&b,
|
||||
anv_address_add(
|
||||
anv_query_address(pool, firstQuery + i),
|
||||
pool->stride - 8),
|
||||
false);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("Unsupported query type");
|
||||
}
|
||||
@@ -550,6 +698,37 @@ void genX(CmdBeginQueryIndexedEXT)(
|
||||
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.CommandStreamerStallEnable = true;
|
||||
pc.StallAtPixelScoreboard = true;
|
||||
}
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
|
||||
rpc.MemoryAddress =
|
||||
anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
|
||||
}
|
||||
#if GEN_GEN < 9
|
||||
gen_mi_store(&b,
|
||||
gen_mi_mem32(anv_address_add(query_addr,
|
||||
intel_perf_rpstart_offset(false))),
|
||||
gen_mi_reg32(GENX(RPSTAT1_num)));
|
||||
#else
|
||||
gen_mi_store(&b,
|
||||
gen_mi_mem32(anv_address_add(query_addr,
|
||||
intel_perf_rpstart_offset(false))),
|
||||
gen_mi_reg32(GENX(RPSTAT0_num)));
|
||||
#endif
|
||||
#if GEN_GEN >= 8 && GEN_GEN <= 11
|
||||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
|
||||
intel_perf_counter(false))),
|
||||
gen_mi_reg64(GENX(PERFCNT1_num)));
|
||||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
|
||||
intel_perf_counter(false) + 8)),
|
||||
gen_mi_reg64(GENX(PERFCNT2_num)));
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("");
|
||||
}
|
||||
@@ -611,6 +790,45 @@ void genX(CmdEndQueryIndexedEXT)(
|
||||
emit_query_mi_availability(&b, query_addr, true);
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.CommandStreamerStallEnable = true;
|
||||
pc.StallAtPixelScoreboard = true;
|
||||
}
|
||||
uint32_t marker_offset = intel_perf_marker_offset();
|
||||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
|
||||
gen_mi_imm(cmd_buffer->intel_perf_marker));
|
||||
#if GEN_GEN >= 8 && GEN_GEN <= 11
|
||||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
|
||||
gen_mi_reg64(GENX(PERFCNT1_num)));
|
||||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
|
||||
gen_mi_reg64(GENX(PERFCNT2_num)));
|
||||
#endif
|
||||
#if GEN_GEN < 9
|
||||
gen_mi_store(&b,
|
||||
gen_mi_mem32(anv_address_add(query_addr,
|
||||
intel_perf_rpstart_offset(true))),
|
||||
gen_mi_reg32(GENX(RPSTAT1_num)));
|
||||
#else
|
||||
gen_mi_store(&b,
|
||||
gen_mi_mem32(anv_address_add(query_addr,
|
||||
intel_perf_rpstart_offset(true))),
|
||||
gen_mi_reg32(GENX(RPSTAT0_num)));
|
||||
#endif
|
||||
/* Position the last OA snapshot at the beginning of the query so that
|
||||
* we can tell whether it's ready.
|
||||
*/
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
|
||||
rpc.MemoryAddress = anv_address_add(query_addr,
|
||||
intel_perf_mi_rpc_offset(true));
|
||||
rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
|
||||
}
|
||||
emit_query_mi_availability(&b,
|
||||
anv_address_add(query_addr, pool->stride - 8),
|
||||
true);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("");
|
||||
}
|
||||
|
Reference in New Issue
Block a user