ac,radeonsi: import PM4 state from RadeonSI

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29452>
This commit is contained in:
Samuel Pitoiset
2024-05-28 15:56:52 +02:00
committed by Marge Bot
parent 62c52fb59d
commit 428601095c
10 changed files with 829 additions and 710 deletions

371
src/amd/common/ac_pm4.c Normal file
View File

@@ -0,0 +1,371 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "ac_debug.h"
#include "ac_gpu_info.h"
#include "ac_pm4.h"
#include "sid.h"

#include <stdio.h>  /* fprintf() in ac_pm4_set_reg; don't rely on a transitive include */
#include <stdlib.h>
#include <string.h>
static bool
opcode_is_pairs(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
opcode == PKT3_SET_SH_REG_PAIRS ||
opcode == PKT3_SET_UCONFIG_REG_PAIRS;
}
static bool
opcode_is_pairs_packed(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}
/* Map a packed SET_*_REG_PAIRS_PACKED opcode back to the plain SET_*_REG opcode. */
static unsigned
pairs_packed_opcode_to_regular(unsigned opcode)
{
   if (opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED)
      return PKT3_SET_CONTEXT_REG;
   if (opcode == PKT3_SET_SH_REG_PAIRS_PACKED)
      return PKT3_SET_SH_REG;

   unreachable("invalid packed opcode");
}
/* Pick the fastest SET_*_REG packet variant the chip supports for this opcode:
 * packed pairs when available, then unpacked pairs, else the plain opcode.
 */
static unsigned
regular_opcode_to_pairs(struct ac_pm4_state *state, unsigned opcode)
{
   const struct radeon_info *info = state->info;

   switch (opcode) {
   case PKT3_SET_CONTEXT_REG:
      if (info->has_set_context_pairs_packed)
         return PKT3_SET_CONTEXT_REG_PAIRS_PACKED;
      if (info->has_set_context_pairs)
         return PKT3_SET_CONTEXT_REG_PAIRS;
      break;
   case PKT3_SET_SH_REG:
      if (info->has_set_sh_pairs_packed)
         return PKT3_SET_SH_REG_PAIRS_PACKED;
      if (info->has_set_sh_pairs)
         return PKT3_SET_SH_REG_PAIRS;
      break;
   case PKT3_SET_UCONFIG_REG:
      if (info->has_set_uconfig_pairs)
         return PKT3_SET_UCONFIG_REG_PAIRS;
      break;
   }

   return opcode;
}
/* True if the next dword appended to the open packed packet is the dword that
 * holds a pair of register offsets (packed packets repeat 3-dword groups).
 */
static bool
packed_next_is_reg_offset_pair(struct ac_pm4_state *state)
{
   const unsigned body_dw = state->ndw - state->last_pm4;
   return body_dw % 3 == 2;
}
/* True if the next dword appended to the open packed packet is the second
 * register value of the current 3-dword group.
 */
static bool
packed_next_is_reg_value1(struct ac_pm4_state *state)
{
   const unsigned body_dw = state->ndw - state->last_pm4;
   return body_dw % 3 == 1;
}
static bool
packed_prev_is_reg_value0(struct ac_pm4_state *state)
{
return packed_next_is_reg_value1(state);
}
/* Return the dword register offset of the index-th register of the open packed
 * packet. Offsets are stored two per dword (low/high 16 bits), one offsets
 * dword per 3-dword group, after the 2-dword packet header.
 */
static unsigned
get_packed_reg_dw_offsetN(struct ac_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
assert(i < state->ndw);
return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}
/* Return the pm4[] index holding the value of the index-th register of the
 * open packed packet: each 3-dword group is { offsets, value0, value1 }.
 */
static unsigned
get_packed_reg_valueN_idx(struct ac_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
assert(i < state->ndw);
return i;
}
/* Return the value of the index-th register of the open packed packet. */
static unsigned
get_packed_reg_valueN(struct ac_pm4_state *state, unsigned index)
{
   const unsigned dw_idx = get_packed_reg_valueN_idx(state, index);

   return state->pm4[dw_idx];
}
/* Return the number of register slots in the open packed packet: 2 per
 * 3-dword group, including a possible padding slot (see packed_is_padded).
 */
static unsigned
get_packed_reg_count(struct ac_pm4_state *state)
{
int body_size = state->ndw - state->last_pm4 - 2;
assert(body_size > 0 && body_size % 3 == 0);
return (body_size / 3) * 2;
}
/* Post-process the last packet recorded in the state.
 *
 * For packed SET_*_REG_PAIRS_PACKED packets this either rewrites the packet
 * as a shorter regular SET_*_REG packet (when all registers turn out to be
 * consecutive), or switches SET_SH_REG_PAIRS_PACKED to the *_N variant when
 * the register count is small enough. When debug_sqtt is set, it also records
 * spi_shader_pgm_lo_reg, the offset of the last SPI_SHADER_PGM_LO_* write.
 */
void
ac_pm4_finalize(struct ac_pm4_state *state)
{
if (opcode_is_pairs_packed(state->last_opcode)) {
unsigned reg_count = get_packed_reg_count(state);
unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
/* A padded packet repeats the first register at the end; ignore that copy. */
if (state->packed_is_padded)
reg_count--;
bool all_consecutive = true;
/* If the whole packed SET packet only sets consecutive registers, rewrite the packet
 * to be unpacked to make it shorter.
 *
 * This also eliminates the invalid scenario when the packed SET packet sets only
 * 2 registers and the register offsets are equal due to padding.
 */
for (unsigned i = 1; i < reg_count; i++) {
if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
all_consecutive = false;
break;
}
}
if (all_consecutive) {
assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
reg_count, 0);
state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
for (unsigned i = 0; i < reg_count; i++)
state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
state->ndw = state->last_pm4 + 2 + reg_count;
/* NOTE(review): always PKT3_SET_SH_REG even when the rewritten packet is a
 * SET_CONTEXT_REG — presumably only used as a "no longer packed" marker for
 * the SQTT check below; confirm.
 */
state->last_opcode = PKT3_SET_SH_REG;
} else {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
if (state->debug_sqtt &&
(state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
if (state->packed_is_padded)
reg_count++; /* Add this back because we only need to record the last write. */
/* Scan backwards so the last SPI_SHADER_PGM_LO_* write wins. */
for (int i = reg_count - 1; i >= 0; i--) {
unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;
if (strstr(ac_get_register_name(state->info->gfx_level,
state->info->family, reg_offset),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_offset;
break;
}
}
}
/* If it's a packed SET_SH packet, use the *_N variant when possible. */
if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
}
}
}
if (state->debug_sqtt && state->last_opcode == PKT3_SET_SH_REG) {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;
for (unsigned i = 0; i < reg_count; i++) {
if (strstr(ac_get_register_name(state->info->gfx_level,
state->info->family, reg_base_offset + i * 4),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
break;
}
}
}
}
/* Start a new PKT3 packet: finalize the previous one, then reserve one dword
 * for the packet header, which ac_pm4_cmd_end fills in with the final count.
 */
void
ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode)
{
ac_pm4_finalize(state);
assert(state->max_dw);
assert(state->ndw < state->max_dw);
assert(opcode <= 254); /* 255 is reserved as the "invalid opcode" marker */
state->last_opcode = opcode;
state->last_pm4 = state->ndw++;
state->packed_is_padded = false;
}
/* Append one raw dword to the command buffer. Invalidates last_opcode so the
 * next register write starts a fresh packet instead of extending this data.
 */
void
ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw)
{
assert(state->max_dw);
assert(state->ndw < state->max_dw);
state->pm4[state->ndw++] = dw;
state->last_opcode = 255; /* invalid opcode */
}
/* Close the current packet: write the PKT3 header with the final dword count,
 * and, for packed register packets, pad to an even register count and store
 * the register count in the dword after the header.
 */
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate)
{
unsigned count;
count = state->ndw - state->last_pm4 - 2;
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
bool reset_filter_cam = !state->is_compute_queue &&
(opcode_is_pairs(state->last_opcode) ||
opcode_is_pairs_packed(state->last_opcode));
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
if (opcode_is_pairs_packed(state->last_opcode)) {
if (packed_prev_is_reg_value0(state)) {
/* Duplicate the first register at the end to make the number of registers aligned to 2. */
/* Note: this recurses back through ac_pm4_set_reg_custom -> ac_pm4_cmd_end once. */
ac_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
get_packed_reg_valueN(state, 0),
state->last_opcode, 0);
state->packed_is_padded = true;
}
state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
}
}
/* Emit a register write with an explicit opcode and index field.
 *
 * "reg" is a byte offset relative to the opcode's register range (already
 * rebased by the caller). Consecutive writes with the same opcode are merged
 * into the currently open packet; otherwise a new packet is begun.
 */
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx)
{
bool is_packed = opcode_is_pairs_packed(opcode);
reg >>= 2; /* byte offset -> dword offset */
assert(state->max_dw);
assert(state->ndw + 2 <= state->max_dw);
if (is_packed) {
assert(idx == 0);
if (opcode != state->last_opcode) {
ac_pm4_cmd_begin(state, opcode); /* reserve space for the header */
state->ndw++; /* reserve space for the register count, it will be set at the end */
}
} else if (opcode_is_pairs(opcode)) {
assert(idx == 0);
if (opcode != state->last_opcode)
ac_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg;
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
idx != state->last_idx) {
/* Regular SET packet: start a new one unless this write extends a run of
 * consecutive registers with the same opcode and index. */
ac_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg | (idx << 28);
}
assert(reg <= UINT16_MAX);
state->last_reg = reg;
state->last_idx = idx;
if (is_packed) {
if (state->packed_is_padded) {
/* The packet is padded, which means the first register is written redundantly again
 * at the end. Remove it, so that we can replace it with this register.
 */
state->packed_is_padded = false;
state->ndw--;
}
if (packed_next_is_reg_offset_pair(state)) {
state->pm4[state->ndw++] = reg;
} else if (packed_next_is_reg_value1(state)) {
/* Set the second register offset in the high 16 bits. */
state->pm4[state->ndw - 2] &= 0x0000ffff;
state->pm4[state->ndw - 2] |= reg << 16;
}
}
state->pm4[state->ndw++] = val;
ac_pm4_cmd_end(state, false);
}
/* Set a register by absolute byte offset. The SET_* opcode is chosen from the
 * register range (config/SH/context/uconfig), then upgraded to the fastest
 * pairs/packed variant the chip supports. Offsets outside all known ranges
 * are reported to stderr and the write is dropped.
 */
void ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
unsigned opcode;
if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
opcode = PKT3_SET_CONFIG_REG;
reg -= SI_CONFIG_REG_OFFSET;
} else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
opcode = PKT3_SET_SH_REG;
reg -= SI_SH_REG_OFFSET;
} else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
opcode = PKT3_SET_CONTEXT_REG;
reg -= SI_CONTEXT_REG_OFFSET;
} else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
opcode = PKT3_SET_UCONFIG_REG;
reg -= CIK_UCONFIG_REG_OFFSET;
} else {
fprintf(stderr, "mesa: Invalid register offset %08x!\n", reg);
return;
}
opcode = regular_opcode_to_pairs(state, opcode);
ac_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
/* Set an SH register with index field 3. When the kernel manages the CU mask
 * (uses_kernel_cu_mask, GFX10+), this must go through SET_SH_REG_INDEX;
 * otherwise a regular register write is emitted.
 */
void
ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
if (state->info->uses_kernel_cu_mask) {
assert(state->info->gfx_level >= GFX10);
ac_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
} else {
ac_pm4_set_reg(state, reg, val);
}
}
/* (Re)initialize a PM4 state for recording. A caller-provided max_dw (states
 * allocated with a larger trailing array, see ac_pm4_create_sized) is kept;
 * otherwise the capacity defaults to the inline pm4[] array size.
 */
void
ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
bool debug_sqtt, bool is_compute_queue)
{
state->info = info;
state->debug_sqtt = debug_sqtt;
state->ndw = 0;
state->is_compute_queue = is_compute_queue;
if (!state->max_dw)
state->max_dw = ARRAY_SIZE(state->pm4);
}
/* Allocate a zero-initialized PM4 state whose pm4[] array holds max_dw dwords;
 * the array extends past the end of the struct. Returns NULL on allocation
 * failure. Free with ac_pm4_free_state.
 *
 * NOTE(review): assumes max_dw >= ARRAY_SIZE(pm4->pm4); a smaller value makes
 * the unsigned size computation underflow — confirm all callers guarantee this.
 */
struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
unsigned max_dw, bool is_compute_queue)
{
struct ac_pm4_state *pm4;
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
pm4 = (struct ac_pm4_state *)calloc(1, size);
if (pm4) {
pm4->max_dw = max_dw;
ac_pm4_clear_state(pm4, info, debug_sqtt, is_compute_queue);
}
return pm4;
}
/* Free a state allocated with ac_pm4_create_sized. Accepts NULL. */
void
ac_pm4_free_state(struct ac_pm4_state *state)
{
   /* free(NULL) is a no-op, so no guard is needed. */
   free(state);
}

76
src/amd/common/ac_pm4.h Normal file
View File

@@ -0,0 +1,76 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#ifndef AC_PM4_H
#define AC_PM4_H
#include "ac_gpu_info.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Recording state for building PM4 command packets. The trailing pm4[] array
 * may extend past the struct when allocated via ac_pm4_create_sized.
 */
struct ac_pm4_state {
const struct radeon_info *info;
/* PKT3_SET_*_REG handling */
uint16_t last_reg; /* register offset in dwords */
uint16_t last_pm4; /* index of the last packet header in pm4[] */
uint16_t ndw; /* number of dwords in pm4 */
uint8_t last_opcode; /* 255 = invalid */
uint8_t last_idx; /* index field of the last register write */
bool is_compute_queue;
bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */
/* commands for the DE */
uint16_t max_dw; /* capacity of pm4[] in dwords */
/* Used by SQTT to override the shader address */
bool debug_sqtt;
uint32_t spi_shader_pgm_lo_reg;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
};
void
ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val);
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx);
void
ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val);
void
ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
bool debug_sqtt, bool is_compute_queue);
void
ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode);
void
ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw);
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate);
void
ac_pm4_finalize(struct ac_pm4_state *state);
struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
unsigned max_dw, bool is_compute_queue);
void
ac_pm4_free_state(struct ac_pm4_state *state);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -115,6 +115,8 @@ amd_common_files = files(
'ac_parse_ib.c',
'ac_perfcounter.c',
'ac_perfcounter.h',
'ac_pm4.c',
'ac_pm4.h',
'ac_vcn_av1_default.h',
)

View File

@@ -65,7 +65,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
struct si_pm4_state *shadowing_preamble = si_pm4_create_sized(sctx->screen, 256, false);
ac_create_shadowing_ib_preamble(&sctx->screen->info,
(pm4_cmd_add_fn)si_pm4_cmd_add, shadowing_preamble,
(pm4_cmd_add_fn)ac_pm4_cmd_add, shadowing_preamble,
sctx->shadowing.registers->gpu_address, sctx->screen->dpbb_allowed);
/* Initialize shadowed registers as follows. */
@@ -95,8 +95,8 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
/* Setup preemption. The shadowing preamble will be executed as a preamble IB,
* which will load register values from memory on a context switch.
*/
sctx->ws->cs_setup_preemption(&sctx->gfx_cs, shadowing_preamble->pm4,
shadowing_preamble->ndw);
sctx->ws->cs_setup_preemption(&sctx->gfx_cs, shadowing_preamble->base.pm4,
shadowing_preamble->base.ndw);
si_pm4_free_state(sctx, shadowing_preamble, ~0);
}
}

View File

@@ -511,7 +511,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
ctx->cs_preamble_state;
radeon_begin(&ctx->gfx_cs);
radeon_emit_array(preamble->pm4, preamble->ndw);
radeon_emit_array(preamble->base.pm4, preamble->base.ndw);
radeon_end();
}

View File

@@ -11,321 +11,12 @@
#include "util/u_memory.h"
#include "ac_debug.h"
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx);
static bool opcode_is_pairs(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
opcode == PKT3_SET_SH_REG_PAIRS ||
opcode == PKT3_SET_UCONFIG_REG_PAIRS;
}
static bool opcode_is_pairs_packed(unsigned opcode)
{
return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}
static unsigned pairs_packed_opcode_to_regular(unsigned opcode)
{
switch (opcode) {
case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
return PKT3_SET_CONTEXT_REG;
case PKT3_SET_SH_REG_PAIRS_PACKED:
return PKT3_SET_SH_REG;
default:
unreachable("invalid packed opcode");
}
}
static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opcode)
{
const struct radeon_info *info = &state->screen->info;
switch (opcode) {
case PKT3_SET_CONTEXT_REG:
return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED :
info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode;
case PKT3_SET_SH_REG:
return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED :
info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode;
case PKT3_SET_UCONFIG_REG:
return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode;
}
return opcode;
}
static bool packed_next_is_reg_offset_pair(struct si_pm4_state *state)
{
return (state->ndw - state->last_pm4) % 3 == 2;
}
static bool packed_next_is_reg_value1(struct si_pm4_state *state)
{
return (state->ndw - state->last_pm4) % 3 == 1;
}
static bool packed_prev_is_reg_value0(struct si_pm4_state *state)
{
return packed_next_is_reg_value1(state);
}
static unsigned get_packed_reg_dw_offsetN(struct si_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
assert(i < state->ndw);
return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}
static unsigned get_packed_reg_valueN_idx(struct si_pm4_state *state, unsigned index)
{
unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
assert(i < state->ndw);
return i;
}
static unsigned get_packed_reg_valueN(struct si_pm4_state *state, unsigned index)
{
return state->pm4[get_packed_reg_valueN_idx(state, index)];
}
static unsigned get_packed_reg_count(struct si_pm4_state *state)
{
int body_size = state->ndw - state->last_pm4 - 2;
assert(body_size > 0 && body_size % 3 == 0);
return (body_size / 3) * 2;
}
void si_pm4_finalize(struct si_pm4_state *state)
{
if (opcode_is_pairs_packed(state->last_opcode)) {
unsigned reg_count = get_packed_reg_count(state);
unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
if (state->packed_is_padded)
reg_count--;
bool all_consecutive = true;
/* If the whole packed SET packet only sets consecutive registers, rewrite the packet
* to be unpacked to make it shorter.
*
* This also eliminates the invalid scenario when the packed SET packet sets only
* 2 registers and the register offsets are equal due to padding.
*/
for (unsigned i = 1; i < reg_count; i++) {
if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
all_consecutive = false;
break;
}
}
if (all_consecutive) {
assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
reg_count, 0);
state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
for (unsigned i = 0; i < reg_count; i++)
state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
state->ndw = state->last_pm4 + 2 + reg_count;
state->last_opcode = PKT3_SET_SH_REG;
} else {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
if (state->screen->debug_flags & DBG(SQTT) &&
(state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
if (state->packed_is_padded)
reg_count++; /* Add this back because we only need to record the last write. */
for (int i = reg_count - 1; i >= 0; i--) {
unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;
if (strstr(ac_get_register_name(state->screen->info.gfx_level,
state->screen->info.family, reg_offset),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_offset;
break;
}
}
}
/* If it's a packed SET_SH packet, use the *_N variant when possible. */
if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
}
}
}
if (state->screen->debug_flags & DBG(SQTT) && state->last_opcode == PKT3_SET_SH_REG) {
/* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;
for (unsigned i = 0; i < reg_count; i++) {
if (strstr(ac_get_register_name(state->screen->info.gfx_level,
state->screen->info.family, reg_base_offset + i * 4),
"SPI_SHADER_PGM_LO_")) {
state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
break;
}
}
}
}
static void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
si_pm4_finalize(state);
assert(state->max_dw);
assert(state->ndw < state->max_dw);
assert(opcode <= 254);
state->last_opcode = opcode;
state->last_pm4 = state->ndw++;
state->packed_is_padded = false;
}
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
assert(state->max_dw);
assert(state->ndw < state->max_dw);
state->pm4[state->ndw++] = dw;
state->last_opcode = 255; /* invalid opcode */
}
static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
unsigned count;
count = state->ndw - state->last_pm4 - 2;
/* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
bool reset_filter_cam = !state->is_compute_queue &&
(opcode_is_pairs(state->last_opcode) ||
opcode_is_pairs_packed(state->last_opcode));
state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
PKT3_RESET_FILTER_CAM_S(reset_filter_cam);
if (opcode_is_pairs_packed(state->last_opcode)) {
if (packed_prev_is_reg_value0(state)) {
/* Duplicate the first register at the end to make the number of registers aligned to 2. */
si_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
get_packed_reg_valueN(state, 0),
state->last_opcode, 0);
state->packed_is_padded = true;
}
state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
}
}
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
unsigned opcode, unsigned idx)
{
bool is_packed = opcode_is_pairs_packed(opcode);
reg >>= 2;
assert(state->max_dw);
assert(state->ndw + 2 <= state->max_dw);
if (is_packed) {
assert(idx == 0);
if (opcode != state->last_opcode) {
si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
state->ndw++; /* reserve space for the register count, it will be set at the end */
}
} else if (opcode_is_pairs(opcode)) {
assert(idx == 0);
if (opcode != state->last_opcode)
si_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg;
} else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
idx != state->last_idx) {
si_pm4_cmd_begin(state, opcode);
state->pm4[state->ndw++] = reg | (idx << 28);
}
assert(reg <= UINT16_MAX);
state->last_reg = reg;
state->last_idx = idx;
if (is_packed) {
if (state->packed_is_padded) {
/* The packet is padded, which means the first register is written redundantly again
* at the end. Remove it, so that we can replace it with this register.
*/
state->packed_is_padded = false;
state->ndw--;
}
if (packed_next_is_reg_offset_pair(state)) {
state->pm4[state->ndw++] = reg;
} else if (packed_next_is_reg_value1(state)) {
/* Set the second register offset in the high 16 bits. */
state->pm4[state->ndw - 2] &= 0x0000ffff;
state->pm4[state->ndw - 2] |= reg << 16;
}
}
state->pm4[state->ndw++] = val;
si_pm4_cmd_end(state, false);
}
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
unsigned opcode;
if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
opcode = PKT3_SET_CONFIG_REG;
reg -= SI_CONFIG_REG_OFFSET;
} else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
opcode = PKT3_SET_SH_REG;
reg -= SI_SH_REG_OFFSET;
} else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
opcode = PKT3_SET_CONTEXT_REG;
reg -= SI_CONTEXT_REG_OFFSET;
} else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
opcode = PKT3_SET_UCONFIG_REG;
reg -= CIK_UCONFIG_REG_OFFSET;
} else {
PRINT_ERR("Invalid register offset %08x!\n", reg);
return;
}
opcode = regular_opcode_to_pairs(state, opcode);
si_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
if (state->screen->info.uses_kernel_cu_mask) {
assert(state->screen->info.gfx_level >= GFX10);
si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
} else {
si_pm4_set_reg(state, reg, val);
}
}
void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
bool is_compute_queue)
{
state->screen = sscreen;
state->ndw = 0;
state->is_compute_queue = is_compute_queue;
const bool debug_sqtt = !!(sscreen->debug_flags & DBG(SQTT));
if (!state->max_dw)
state->max_dw = ARRAY_SIZE(state->pm4);
ac_pm4_clear_state(&state->base, &sscreen->info, debug_sqtt, is_compute_queue);
}
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
@@ -351,7 +42,7 @@ void si_pm4_emit_commands(struct si_context *sctx, struct si_pm4_state *state)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
radeon_emit_array(state->pm4, state->ndw);
radeon_emit_array(state->base.pm4, state->base.ndw);
radeon_end();
}
@@ -364,7 +55,7 @@ void si_pm4_emit_state(struct si_context *sctx, unsigned index)
assert(state && state != sctx->emitted.array[index]);
radeon_begin(cs);
radeon_emit_array(state->pm4, state->ndw);
radeon_emit_array(state->base.pm4, state->base.ndw);
radeon_end();
sctx->emitted.array[index] = state;
@@ -396,21 +87,21 @@ struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max
bool is_compute_queue)
{
struct si_pm4_state *pm4;
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->base.pm4));
pm4 = (struct si_pm4_state *)calloc(1, size);
if (pm4) {
pm4->max_dw = max_dw;
pm4->base.max_dw = max_dw;
si_pm4_clear_state(pm4, sscreen, is_compute_queue);
}
return pm4;
}
struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig)
struct si_pm4_state *si_pm4_clone(struct si_screen *sscreen, struct si_pm4_state *orig)
{
struct si_pm4_state *pm4 = si_pm4_create_sized(orig->screen, orig->max_dw,
orig->is_compute_queue);
struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, orig->base.max_dw,
orig->base.is_compute_queue);
if (pm4)
memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->max_dw - ARRAY_SIZE(pm4->pm4)));
memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->base.max_dw - ARRAY_SIZE(pm4->base.pm4)));
return pm4;
}

View File

@@ -10,6 +10,8 @@
#include <stdint.h>
#include <stdbool.h>
#include "ac_pm4.h"
#ifdef __cplusplus
extern "C" {
#endif
@@ -27,35 +29,12 @@ struct si_atom {
};
struct si_pm4_state {
struct si_screen *screen;
/* PKT3_SET_*_REG handling */
uint16_t last_reg; /* register offset in dwords */
uint16_t last_pm4;
uint16_t ndw; /* number of dwords in pm4 */
uint8_t last_opcode;
uint8_t last_idx;
bool is_compute_queue;
bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */
/* For shader states only */
struct si_atom atom;
/* commands for the DE */
uint16_t max_dw;
/* Used by SQTT to override the shader address */
uint32_t spi_shader_pgm_lo_reg;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
struct ac_pm4_state base;
};
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_finalize(struct si_pm4_state *state);
void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
bool is_compute_queue);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);
@@ -66,7 +45,7 @@ void si_pm4_emit_shader(struct si_context *sctx, unsigned index);
void si_pm4_reset_emitted(struct si_context *sctx);
struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max_dw,
bool is_compute_queue);
struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig);
struct si_pm4_state *si_pm4_clone(struct si_screen *sscreen, struct si_pm4_state *orig);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load Diff

View File

@@ -376,11 +376,11 @@ static bool si_update_shaders(struct si_context *sctx)
struct si_pm4_state *pm4 = &shader->pm4;
uint64_t va_low = shader->gpu_address >> 8;
uint32_t reg = pm4->spi_shader_pgm_lo_reg;
si_pm4_set_reg(&pipeline->pm4, reg, va_low);
uint32_t reg = pm4->base.spi_shader_pgm_lo_reg;
ac_pm4_set_reg(&pipeline->pm4.base, reg, va_low);
}
}
si_pm4_finalize(&pipeline->pm4);
ac_pm4_finalize(&pipeline->pm4.base);
sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
_mesa_hash_table_u64_insert(sctx->sqtt->pipeline_bos,

View File

@@ -685,7 +685,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
return;
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
shader->config.rsrc1 = S_00B528_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B528_SGPRS(si_shader_encode_sgprs(shader)) |
@@ -694,7 +694,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
S_00B528_FLOAT_MODE(shader->config.float_mode);
shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
@@ -709,30 +709,30 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
GFX6_TCS_NUM_USER_SGPR;
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
ac_pm4_set_reg(&pm4->base, R_00B420_SPI_SHADER_PGM_RSRC4_HS,
S_00B420_WAVE_LIMIT(0x3ff) |
S_00B420_GLG_FORCE_DISABLE(1) |
S_00B420_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B424_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX11) {
si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
S_00B404_CU_EN(0xffff),
C_00B404_CU_EN, 16, &sscreen->info));
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX10) {
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
} else if (sscreen->info.gfx_level >= GFX9) {
si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
ac_pm4_set_reg(&pm4->base, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B424_SPI_SHADER_PGM_HI_HS,
S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
}
si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
ac_pm4_set_reg(&pm4->base, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
S_00B428_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B428_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B428_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
@@ -752,9 +752,9 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
shader->config.rsrc2 |= S_00B42C_OC_LDS_EN(1);
if (sscreen->info.gfx_level <= GFX8)
si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_emit_shader_es(struct si_context *sctx, unsigned index)
@@ -804,16 +804,16 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B324_SPI_SHADER_PGM_HI_ES,
S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
ac_pm4_set_reg(&pm4->base, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
S_00B328_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B328_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B328_DX10_CLAMP(1) |
S_00B328_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
ac_pm4_set_reg(&pm4->base, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
@@ -821,7 +821,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
si_set_tesseval_regs(sscreen, shader->selector, shader);
polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
@@ -1094,9 +1094,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
if (sscreen->info.gfx_level >= GFX10) {
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
}
uint32_t rsrc1 = S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
@@ -1117,8 +1117,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
}
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
shader->gs.spi_shader_pgm_rsrc3_gs =
ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
@@ -1147,20 +1147,20 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
S_00B21C_WAVE_LIMIT(0x3F),
C_00B21C_CU_EN, 0, &sscreen->info);
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
ac_pm4_set_reg(&pm4->base, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B224_SPI_SHADER_PGM_HI_GS,
S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B228_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B228_DX10_CLAMP(1) |
S_00B228_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
bool gfx10_is_ngg_passthrough(struct si_shader *shader)
@@ -1488,18 +1488,18 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
}
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B224_SPI_SHADER_PGM_LO_ES, va >> 8);
} else {
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
}
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
ac_pm4_set_reg(&pm4->base, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B228_FLOAT_MODE(shader->config.float_mode) |
S_00B228_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
ac_pm4_set_reg(&pm4->base, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
S_00B22C_USER_SGPR(num_user_sgprs) |
S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
@@ -1672,7 +1672,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_028B54_MAX_PRIMGRP_IN_WAVE(2);
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_emit_shader_vs(struct si_context *sctx, unsigned index)
@@ -1829,15 +1829,15 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
if (sscreen->info.gfx_level >= GFX7) {
si_pm4_set_reg_idx3(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
S_00B118_WAVE_LIMIT(0x3F),
C_00B118_CU_EN, 0, &sscreen->info));
si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
ac_pm4_set_reg(&pm4->base, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
}
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
ac_pm4_set_reg(&pm4->base, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B124_SPI_SHADER_PGM_HI_VS,
S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
uint32_t rsrc1 =
@@ -1863,8 +1863,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
S_00B12C_SO_EN(1);
}
si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
ac_pm4_set_reg(&pm4->base, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
ac_pm4_set_reg(&pm4->base, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
if (window_space)
shader->vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
@@ -1878,7 +1878,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
si_set_tesseval_regs(sscreen, shader->selector, shader);
polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
@@ -2173,40 +2173,40 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->dpbb_allowed &&
(sscreen->pbb_context_states_per_bin > 1 ||
sscreen->pbb_persistent_states_per_bin > 1)) {
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}
if (sscreen->info.gfx_level >= GFX12) {
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC4_PS,
ac_pm4_set_reg(&pm4->base, R_00B01C_SPI_SHADER_PGM_RSRC4_PS,
S_00B01C_WAVE_LIMIT_GFX12(0x3FF) |
S_00B01C_LDS_GROUP_SIZE_GFX12(1) |
S_00B01C_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
} else if (sscreen->info.gfx_level >= GFX11) {
unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_pm4_set_reg_idx3(&pm4->base, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) |
S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
C_00B004_CU_EN, 16, &sscreen->info));
}
uint64_t va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
ac_pm4_set_reg(&pm4->base, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
ac_pm4_set_reg(&pm4->base, R_00B024_SPI_SHADER_PGM_HI_PS,
S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
ac_pm4_set_reg(&pm4->base, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
S_00B028_VGPRS(si_shader_encode_vgprs(shader)) |
S_00B028_SGPRS(si_shader_encode_sgprs(shader)) |
S_00B028_DX10_CLAMP(sscreen->info.gfx_level < GFX12) |
S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
S_00B028_FLOAT_MODE(shader->config.float_mode));
si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
ac_pm4_set_reg(&pm4->base, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
}
static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
@@ -2251,7 +2251,7 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
assert(0);
}
assert(!(sscreen->debug_flags & DBG(SQTT)) || shader->pm4.spi_shader_pgm_lo_reg != 0);
assert(!(sscreen->debug_flags & DBG(SQTT)) || shader->pm4.base.spi_shader_pgm_lo_reg != 0);
}
static void si_clear_vs_key_inputs(union si_shader_key *key)
@@ -4052,13 +4052,13 @@ static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz)
return;
/* Done by Vulkan before VGT_FLUSH. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
si_pm4_finalize(pm4);
ac_pm4_cmd_add(&pm4->base, PKT3(PKT3_EVENT_WRITE, 0, 0));
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
ac_pm4_finalize(&pm4->base);
*has_vgt_flush = true;
}
@@ -4199,32 +4199,32 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
if (!*gs_ring_state_dw_offset) {
/* We are here for the first time. The packets will be added. */
*gs_ring_state_dw_offset = pm4->ndw;
*gs_ring_state_dw_offset = pm4->base.ndw;
} else {
/* We have been here before. Overwrite the previous packets. */
old_ndw = pm4->ndw;
pm4->ndw = *gs_ring_state_dw_offset;
old_ndw = pm4->base.ndw;
pm4->base.ndw = *gs_ring_state_dw_offset;
}
/* Unallocated rings are written to reserve the space in the pm4
* (to be able to overwrite them later). */
if (sctx->gfx_level >= GFX7) {
if (sctx->gfx_level <= GFX8)
si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_030900_VGT_ESGS_RING_SIZE,
sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_030904_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
} else {
si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_0088C8_VGT_ESGS_RING_SIZE,
sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
ac_pm4_set_reg(&pm4->base, R_0088CC_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
}
si_pm4_finalize(pm4);
ac_pm4_finalize(&pm4->base);
if (old_ndw) {
pm4->ndw = old_ndw;
pm4->last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
pm4->base.ndw = old_ndw;
pm4->base.last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
}
}