diff --git a/docs/envvars.rst b/docs/envvars.rst index 1068f6dd79d..f397c41dcd7 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -723,6 +723,9 @@ RADV driver environment variables set the SQTT/RGP buffer size in bytes (default value is 32MiB, the buffer is automatically resized if too small) +:envvar:`RADV_THREAD_TRACE_CACHE_COUNTERS` + enable/disable SQTT/RGP cache counters on GFX10+ (disabled by default) + :envvar:`RADV_THREAD_TRACE_INSTRUCTION_TIMING` enable/disable SQTT/RGP instruction timing (enabled by default) diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c index 2213d9adc96..6cfa42ee192 100644 --- a/src/amd/vulkan/layers/radv_sqtt_layer.c +++ b/src/amd/vulkan/layers/radv_sqtt_layer.c @@ -363,7 +363,12 @@ radv_handle_thread_trace(VkQueue _queue) radv_QueueWaitIdle(_queue); if (radv_get_thread_trace(queue, &thread_trace)) { - ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace, NULL); + struct ac_spm_trace_data *spm_trace = NULL; + + if (queue->device->spm_trace.bo) + spm_trace = &queue->device->spm_trace; + + ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace, spm_trace); } else { /* Trigger a new capture if the driver failed to get * the trace because the buffer was too small. 
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 04659a0098c..2e365b7a192 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -81,6 +81,7 @@ libradv_files = files( 'radv_shader_args.c', 'radv_shader_args.h', 'radv_shader_info.c', + 'radv_spm.c', 'radv_sqtt.c', 'radv_query.c', 'radv_util.c', diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index aac5f842b76..c2dc58532c7 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -347,6 +347,13 @@ radv_thread_trace_enabled() getenv("RADV_THREAD_TRACE_TRIGGER"); } +static bool +radv_spm_trace_enabled() +{ + return radv_thread_trace_enabled() && + debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false); /* SPM only counts when thread trace is on too; documented as disabled by default */ +} + #if defined(VK_USE_PLATFORM_WAYLAND_KHR) || defined(VK_USE_PLATFORM_XCB_KHR) || \ defined(VK_USE_PLATFORM_XLIB_KHR) || defined(VK_USE_PLATFORM_DISPLAY_KHR) #define RADV_USE_WSI_PLATFORM @@ -3149,9 +3156,20 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr goto fail; fprintf(stderr, "radv: Thread trace support is enabled (initial buffer size: %u MiB, " - "instruction timing: %s).\n", + "instruction timing: %s, cache counters: %s).\n", device->thread_trace.buffer_size / (1024 * 1024), - radv_is_instruction_timing_enabled() ? "enabled" : "disabled"); + radv_is_instruction_timing_enabled() ? "enabled" : "disabled", + radv_spm_trace_enabled() ? 
"enabled" : "disabled"); + + if (radv_spm_trace_enabled()) { + if (device->physical_device->rad_info.chip_class < GFX10) { + fprintf(stderr, "SPM isn't supported for this GPU!\n"); + abort(); + } + + if (!radv_spm_init(device)) + goto fail; + } } if (getenv("RADV_TRAP_HANDLER")) { @@ -3273,6 +3291,8 @@ fail_meta: fail: radv_thread_trace_finish(device); + radv_spm_finish(device); + radv_trap_handler_finish(device); radv_finish_trace(device); @@ -3342,6 +3362,8 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) radv_thread_trace_finish(device); + radv_spm_finish(device); + vk_device_finish(&device->vk); vk_free(&device->vk.alloc, device); } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index cce48e21424..31ac95628d3 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -70,6 +70,7 @@ #include "ac_binary.h" #include "ac_gpu_info.h" #include "ac_shader_util.h" +#include "ac_spm.h" #include "ac_sqtt.h" #include "ac_surface.h" #include "radv_constants.h" @@ -834,6 +835,12 @@ struct radv_device { /* Thread trace. */ struct ac_thread_trace_data thread_trace; + /* SPM. */ + struct ac_spm_trace_data spm_trace; + + /* Performance counters. */ + struct ac_perfcounters perfcounters; + /* Trap handler. 
*/ struct radv_shader *trap_handler_shader; struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */ @@ -2928,6 +2935,11 @@ void radv_perfcounter_emit_reset(struct radeon_cmdbuf *cs); void radv_perfcounter_emit_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family); void radv_perfcounter_emit_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family); +/* radv_spm.c */ +bool radv_spm_init(struct radv_device *device); +void radv_spm_finish(struct radv_device *device); +void radv_emit_spm_setup(struct radv_device *device, struct radeon_cmdbuf *cs); + #define RADV_FROM_HANDLE(__radv_type, __name, __handle) \ VK_FROM_HANDLE(__radv_type, __name, __handle) diff --git a/src/amd/vulkan/radv_spm.c b/src/amd/vulkan/radv_spm.c new file mode 100644 index 00000000000..f8669fee040 --- /dev/null +++ b/src/amd/vulkan/radv_spm.c @@ -0,0 +1,230 @@ +/* + * Copyright © 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "radv_cs.h" +#include "radv_private.h" +#include "sid.h" + +#define SPM_RING_BASE_ALIGN 32 + +static bool +radv_spm_init_bo(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + uint64_t size = 32 * 1024 * 1024; /* Default to 32 MiB. */ + uint16_t sample_interval = 4096; /* Default to 4096 clk. */ + VkResult result; + + device->spm_trace.buffer_size = size; + device->spm_trace.sample_interval = sample_interval; + + struct radeon_winsys_bo *bo = NULL; + result = ws->buffer_create( + ws, size, 4096, RADEON_DOMAIN_VRAM, + RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM, + RADV_BO_PRIORITY_SCRATCH, 0, &bo); + device->spm_trace.bo = bo; /* Stored before the checks so radv_spm_finish() can destroy it if a later step fails. */ + if (result != VK_SUCCESS) + return false; + + result = ws->buffer_make_resident(ws, device->spm_trace.bo, true); + if (result != VK_SUCCESS) + return false; + + device->spm_trace.ptr = ws->buffer_map(device->spm_trace.bo); + if (!device->spm_trace.ptr) + return false; + + return true; +} + +static void +radv_emit_spm_counters(struct radv_device *device, struct radeon_cmdbuf *cs) +{ + struct ac_spm_trace_data *spm_trace = &device->spm_trace; + + for (uint32_t b = 0; b < spm_trace->num_used_sq_block_sel; b++) { + struct ac_spm_block_select *sq_block_sel = &spm_trace->sq_block_sel[b]; + const struct ac_spm_counter_select *cntr_sel = &sq_block_sel->counters[0]; + uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT; + + radeon_set_uconfig_reg_seq(cs, reg_base + b * 4, 1); + radeon_emit(cs, cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */ + } + + for (uint32_t b = 0; b < spm_trace->num_block_sel; b++) { + struct ac_spm_block_select *block_sel = 
&spm_trace->block_sel[b]; + struct ac_pc_block_base *regs = block_sel->b->b->b; + + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, block_sel->grbm_gfx_index); + + for (unsigned c = 0; c < block_sel->num_counters; c++) { + const struct ac_spm_counter_select *cntr_sel = &block_sel->counters[c]; + + if (!cntr_sel->active) + continue; + + radeon_set_uconfig_reg_seq(cs, regs->select0[c], 1); + radeon_emit(cs, cntr_sel->sel0); + + radeon_set_uconfig_reg_seq(cs, regs->select1[c], 1); + radeon_emit(cs, cntr_sel->sel1); + } + } + + /* Restore global broadcasting. */ + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); +} + +void +radv_emit_spm_setup(struct radv_device *device, struct radeon_cmdbuf *cs) +{ + struct ac_spm_trace_data *spm_trace = &device->spm_trace; + uint64_t va = radv_buffer_get_va(spm_trace->bo); + uint64_t ring_size = spm_trace->buffer_size; + + /* The ring VA and size must be multiples of SPM_RING_BASE_ALIGN, and the sample interval at least 32. */ + assert(!(va & (SPM_RING_BASE_ALIGN - 1))); + assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1))); + assert(spm_trace->sample_interval >= 32); + + /* Configure the SPM ring buffer. */ + radeon_set_uconfig_reg(cs, R_037200_RLC_SPM_PERFMON_CNTL, + S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */ + S_037200_PERFMON_SAMPLE_INTERVAL(spm_trace->sample_interval)); /* in sclk */ + radeon_set_uconfig_reg(cs, R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va); + radeon_set_uconfig_reg(cs, R_037208_RLC_SPM_PERFMON_RING_BASE_HI, + S_037208_RING_BASE_HI(va >> 32)); + radeon_set_uconfig_reg(cs, R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size); + + /* Configure the muxsel. 
*/ + uint32_t total_muxsel_lines = 0; + for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { + total_muxsel_lines += spm_trace->num_muxsel_lines[s]; + } + + radeon_set_uconfig_reg(cs, R_03726C_RLC_SPM_ACCUM_MODE, 0); + radeon_set_uconfig_reg(cs, R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0); + radeon_set_uconfig_reg(cs, R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE, + S_03727C_SE0_NUM_LINE(spm_trace->num_muxsel_lines[0]) | + S_03727C_SE1_NUM_LINE(spm_trace->num_muxsel_lines[1]) | + S_03727C_SE2_NUM_LINE(spm_trace->num_muxsel_lines[2]) | + S_03727C_SE3_NUM_LINE(spm_trace->num_muxsel_lines[3])); + radeon_set_uconfig_reg(cs, R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE, + S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) | + S_037280_GLOBAL_NUM_LINE(spm_trace->num_muxsel_lines[4])); + + /* Upload each muxsel ram to the RLC. */ + for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) { + unsigned rlc_muxsel_addr, rlc_muxsel_data; + unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1); + + if (!spm_trace->num_muxsel_lines[s]) + continue; + + if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) { + grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1); + + rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR; + rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA; + } else { + grbm_gfx_index |= S_030800_SE_INDEX(s); + + rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR; + rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA; + } + + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, grbm_gfx_index); + + for (unsigned l = 0; l < spm_trace->num_muxsel_lines[s]; l++) { + uint32_t *data = (uint32_t *)spm_trace->muxsel_lines[s][l].muxsel; + + /* Select MUXSEL_ADDR to point to the next muxsel. */ + radeon_set_uconfig_reg(cs, rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE); + + /* Write the muxsel line configuration with MUXSEL_DATA. 
*/ + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_ME) | + S_370_WR_ONE_ADDR(1)); + radeon_emit(cs, rlc_muxsel_data >> 2); + radeon_emit(cs, 0); + radeon_emit_array(cs, data, AC_SPM_MUXSEL_LINE_SIZE); + } + } + + /* Select SPM counters. */ + radv_emit_spm_counters(device, cs); +} + +bool +radv_spm_init(struct radv_device *device) +{ + const struct radeon_info *info = &device->physical_device->rad_info; + struct ac_perfcounters *pc = &device->perfcounters; + struct ac_spm_counter_create_info spm_counters[] = { + {TCP, 0, 0x9}, /* Number of L2 requests. */ + {TCP, 0, 0x12}, /* Number of L2 misses. */ + {SQ, 0, 0x14f}, /* Number of SCACHE hits. */ + {SQ, 0, 0x150}, /* Number of SCACHE misses. */ + {SQ, 0, 0x151}, /* Number of SCACHE misses duplicate. */ + {SQ, 0, 0x12c}, /* Number of ICACHE hits. */ + {SQ, 0, 0x12d}, /* Number of ICACHE misses. */ + {SQ, 0, 0x12e}, /* Number of ICACHE misses duplicate. */ + {GL1C, 0, 0xe}, /* Number of GL1C requests. */ + {GL1C, 0, 0x12}, /* Number of GL1C misses. */ + {GL2C, 0, 0x3}, /* Number of GL2C requests. */ + {GL2C, 0, info->chip_class >= GFX10_3 ? 0x2b : 0x23}, /* Number of GL2C misses. 
*/ + }; + + if (!ac_init_perfcounters(info, false, false, pc)) + return false; + + if (!ac_init_spm(info, pc, ARRAY_SIZE(spm_counters), spm_counters, &device->spm_trace)) + return false; + + if (!radv_spm_init_bo(device)) + return false; + + return true; +} + +void +radv_spm_finish(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + if (device->spm_trace.bo) { + ws->buffer_make_resident(ws, device->spm_trace.bo, false); /* Pairs with buffer_make_resident(..., true) in radv_spm_init_bo(). */ + ws->buffer_destroy(ws, device->spm_trace.bo); + } + + ac_destroy_spm(&device->spm_trace); + ac_destroy_perfcounters(&device->perfcounters); +} diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c index f7a996f500c..4180fba5950 100644 --- a/src/amd/vulkan/radv_sqtt.c +++ b/src/amd/vulkan/radv_sqtt.c @@ -543,9 +543,21 @@ radv_begin_thread_trace(struct radv_queue *queue) /* Enable SQG events that collects thread trace data. */ radv_emit_spi_config_cntl(device, cs, true); + radv_perfcounter_emit_reset(cs); + + if (device->spm_trace.bo) { + /* Enable all shader stages by default. */ + radv_perfcounter_emit_shaders(cs, 0x7f); + + radv_emit_spm_setup(device, cs); + } + /* Start SQTT. */ radv_emit_thread_trace_start(device, cs, family); + if (device->spm_trace.bo) + radv_perfcounter_emit_start(device, cs, family); + result = ws->cs_finalize(cs); if (result != VK_SUCCESS) { ws->cs_destroy(cs); @@ -591,9 +603,14 @@ radv_end_thread_trace(struct radv_queue *queue) /* Make sure to wait-for-idle before stopping SQTT. */ radv_emit_wait_for_idle(device, cs, family); + if (device->spm_trace.bo) + radv_perfcounter_emit_stop(device, cs, family); + /* Stop SQTT. */ radv_emit_thread_trace_stop(device, cs, family); + radv_perfcounter_emit_reset(cs); + /* Restore previous state by disabling SQG events. */ radv_emit_spi_config_cntl(device, cs, false);