freedreno/a6xx: Initial a7xx support

Passing all of deqp-gles*

LRZ is still causing some artifacts in games, so it is disabled for now.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30304>
Author:    Rob Clark <robdclark@chromium.org>
Date:      2024-07-19 13:23:19 -07:00
Committed: Marge Bot
Parent:    e6be78c703
Commit:    ad90bf0500
23 changed files with 720 additions and 212 deletions


@@ -1684,12 +1684,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords)
</reg32>
<stripe varset="event_write_dst" variants="EV_DST_RAM">
-<reg32 offset="1" name="1">
-<bitfield name="ADDR_0_LO" low="0" high="31"/>
-</reg32>
-<reg32 offset="2" name="2">
-<bitfield name="ADDR_0_HI" low="0" high="31"/>
-</reg32>
+<reg64 offset="1" name="1" type="waddress"/>
<reg32 offset="3" name="3">
<bitfield name="PAYLOAD_0" low="0" high="31"/>
</reg32>


@@ -58,6 +58,7 @@ blend_func(unsigned func)
}
}
template <chip CHIP>
struct fd6_blend_variant *
__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend,
unsigned sample_mask)
@@ -118,18 +119,21 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend,
}
}
-OUT_REG(
-ring,
+/* sRGB + dither on a7xx goes badly: */
+bool dither = (CHIP < A7XX) ? cso->dither : false;
+OUT_REG(ring,
A6XX_RB_DITHER_CNTL(
-.dither_mode_mrt0 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt1 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt2 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt3 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt4 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt5 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt6 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE,
-.dither_mode_mrt7 =
-cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, ));
+.dither_mode_mrt0 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt1 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt2 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt3 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt4 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt5 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt6 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+.dither_mode_mrt7 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
+)
+);
OUT_REG(ring,
A6XX_SP_BLEND_CNTL(
@@ -157,6 +161,7 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend,
return so;
}
FD_GENX(__fd6_setup_blend_variant);
void *
fd6_blend_state_create(struct pipe_context *pctx,

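Note: the recurring pattern in this MR is to template each state builder on the GPU generation and instantiate it per chip via FD_GENX(). A minimal sketch of the idea (the exact FD_GENX macro body is an assumption, not shown in this diff):

/* header: declaration templated on the generation */
template <chip CHIP>
struct fd6_blend_variant *
__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, unsigned sample_mask);

/* .cc file: FD_GENX(__fd6_setup_blend_variant) is assumed to expand to
 * explicit instantiations for each supported generation, so callers can
 * use __fd6_setup_blend_variant<CHIP>(...) without the definition being
 * visible in the header:
 */
template struct fd6_blend_variant *
__fd6_setup_blend_variant<A6XX>(struct fd6_blend_stateobj *, unsigned);
template struct fd6_blend_variant *
__fd6_setup_blend_variant<A7XX>(struct fd6_blend_stateobj *, unsigned);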

@@ -34,8 +34,6 @@
#include "freedreno_context.h"
#include "freedreno_util.h"
-BEGINC;
/**
* Since the sample-mask is part of the hw blend state, we need to have state
* variants per sample-mask value. But we don't expect the sample-mask state
@@ -63,10 +61,12 @@ fd6_blend_stateobj(struct pipe_blend_state *blend)
return (struct fd6_blend_stateobj *)blend;
}
template <chip CHIP>
struct fd6_blend_variant *
__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend,
unsigned sample_mask);
template <chip CHIP>
static inline struct fd6_blend_variant *
fd6_blend_variant(struct pipe_blend_state *cso, unsigned nr_samples,
unsigned sample_mask)
@@ -85,13 +85,11 @@ fd6_blend_variant(struct pipe_blend_state *cso, unsigned nr_samples,
}
}
-return __fd6_setup_blend_variant(blend, sample_mask);
+return __fd6_setup_blend_variant<CHIP>(blend, sample_mask);
}
void *fd6_blend_state_create(struct pipe_context *pctx,
const struct pipe_blend_state *cso);
void fd6_blend_state_delete(struct pipe_context *, void *hwcso);
-ENDC;
#endif /* FD6_BLEND_H_ */


@@ -305,6 +305,11 @@ emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt,
OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
OUT_RING(ring, blit_cntl);
if (CHIP >= A7XX) {
OUT_PKT4(ring, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1);
OUT_RING(ring, 0x20000000);
}
if (fmt == FMT6_10_10_10_2_UNORM_DEST)
fmt = FMT6_16_16_16_16_FLOAT;


@@ -48,9 +48,6 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
struct ir3_shader_variant *v)
assert_dt
{
-const struct ir3_info *i = &v->info;
-enum a6xx_threadsize thrsz_cs = i->double_threadsize ? THREAD128 : THREAD64;
OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true,
.ds_state = true, .gs_state = true,
.fs_state = true, .cs_state = true,
@@ -77,30 +74,86 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
-enum a6xx_threadsize thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? thrsz_cs : THREAD128;
-OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);
-OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
-A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
-A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
-A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
-OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
-A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
-if (!ctx->screen->info->a6xx.supports_double_threadsize) {
-OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1);
-OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz_cs));
+/*
+ * Devices that do not support double threadsize take the threadsize from
+ * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
+ * which is always set to THREAD128.
+ */
+enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
+enum a6xx_threadsize thrsz_cs = ctx->screen->info->a6xx
+.supports_double_threadsize ? thrsz : THREAD128;
+if (CHIP == A6XX) {
+OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);
+OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
+A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
+A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
+A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
+if (!ctx->screen->info->a6xx.supports_double_threadsize) {
+OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1);
+OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
+}
+if (ctx->screen->info->a6xx.has_lpac) {
+OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
+OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
+A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
+A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
+A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
+}
+} else {
+enum a7xx_cs_yalign yalign = (v->local_size[1] % 8 == 0) ? CS_YALIGN_8
+: (v->local_size[1] % 4 == 0) ? CS_YALIGN_4
+: (v->local_size[1] % 2 == 0) ? CS_YALIGN_2
+: CS_YALIGN_1;
+OUT_REG(ring,
+HLSQ_CS_CNTL_1(
+CHIP,
+.linearlocalidregid = regid(63, 0),
+.threadsize = thrsz_cs,
+/* A7XX TODO: blob either sets all of these unknowns
+ * together or doesn't set them at all.
+ */
+.unk11 = true,
+.unk22 = true,
+.yalign = yalign,
+)
+);
+OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
+OUT_REG(ring,
+A6XX_SP_CS_CNTL_0(
+.wgidconstid = work_group_id,
+.wgsizeconstid = INVALID_REG,
+.wgoffsetconstid = INVALID_REG,
+.localidregid = local_invocation_id,
+)
+);
+OUT_REG(ring,
+SP_CS_CNTL_1(
+CHIP,
+.linearlocalidregid = INVALID_REG,
+.threadsize = thrsz_cs,
+/* A7XX TODO: enable UNK15 when we don't use subgroup ops. */
+.unk15 = false,
+)
+);
+OUT_REG(ring,
+A7XX_HLSQ_CS_LOCAL_SIZE(
+.localsizex = v->local_size[0] - 1,
+.localsizey = v->local_size[1] - 1,
+.localsizez = v->local_size[2] - 1,
+)
+);
+OUT_REG(ring, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000
+}
-if (ctx->screen->info->a6xx.has_lpac) {
-OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
-OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
-A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
-A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
-A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
-OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
-A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
-}
-fd6_emit_shader(ctx, ring, v);
+fd6_emit_shader<CHIP>(ctx, ring, v);
}
+template <chip CHIP>

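For reference, a worked example of the CS_YALIGN selection in the a7xx branch above (hypothetical local sizes; the first divisibility test that passes wins):

/* local_size_y = 16 -> 16 % 8 == 0              -> CS_YALIGN_8
 * local_size_y = 12 -> 12 % 8 != 0, 12 % 4 == 0 -> CS_YALIGN_4
 * local_size_y =  6 ->  6 % 4 != 0,  6 % 2 == 0 -> CS_YALIGN_2
 * local_size_y =  5 -> odd                      -> CS_YALIGN_1
 */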

@@ -267,7 +267,7 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv,
pctx->destroy = fd6_context_destroy;
pctx->create_blend_state = fd6_blend_state_create;
pctx->create_rasterizer_state = fd6_rasterizer_state_create;
-pctx->create_depth_stencil_alpha_state = fd6_zsa_state_create;
+pctx->create_depth_stencil_alpha_state = fd6_zsa_state_create<CHIP>;
pctx->create_vertex_elements_state = fd6_vertex_state_create;
fd6_draw_init<CHIP>(pctx);


@@ -231,6 +231,7 @@ compute_lrz_state(struct fd6_emit *emit) assert_dt
return lrz;
}
template <chip CHIP>
static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit) assert_dt
{
@@ -244,14 +245,39 @@ build_lrz(struct fd6_emit *emit) assert_dt
fd6_ctx->last.lrz = lrz;
+unsigned ndwords = (CHIP >= A7XX) ? 10 : 8;
struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
-ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING);
+ctx->batch->submit, ndwords * 4, FD_RINGBUFFER_STREAMING);
-OUT_REG(ring,
-A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write,
-.greater = lrz.direction == FD_LRZ_GREATER,
-.z_test_enable = lrz.test,
-.z_bounds_enable = lrz.z_bounds_enable, ));
+if (CHIP >= A7XX) {
+OUT_REG(ring,
+A6XX_GRAS_LRZ_CNTL(
+.enable = lrz.enable,
+.lrz_write = lrz.write,
+.greater = lrz.direction == FD_LRZ_GREATER,
+.z_test_enable = lrz.test,
+.z_bounds_enable = lrz.z_bounds_enable,
+)
+);
+OUT_REG(ring,
+A7XX_GRAS_LRZ_CNTL2(
+.disable_on_wrong_dir = false,
+.fc_enable = false,
+)
+);
+} else {
+OUT_REG(ring,
+A6XX_GRAS_LRZ_CNTL(
+.enable = lrz.enable,
+.lrz_write = lrz.write,
+.greater = lrz.direction == FD_LRZ_GREATER,
+.fc_enable = false,
+.z_test_enable = lrz.test,
+.z_bounds_enable = lrz.z_bounds_enable,
+.disable_on_wrong_dir = false,
+)
+);
+}
OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));
OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
@@ -393,6 +419,7 @@ build_sample_locations(struct fd6_emit *emit)
return ring;
}
template <chip CHIP>
static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
@@ -433,7 +460,8 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
} else {
OUT_PKT7(ring, CP_MEM_TO_REG, 3);
OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
-CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
+COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
+CP_MEM_TO_REG_0_UNK31 |
CP_MEM_TO_REG_0_CNT(0));
OUT_RELOC(ring, offset_bo, 0, 0, 0);
}
@@ -606,7 +634,7 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
fd6_state_add_group(&emit->state, state, FD6_GROUP_ZSA);
break;
case FD6_GROUP_LRZ:
-state = build_lrz(emit);
+state = build_lrz<CHIP>(emit);
if (state)
fd6_state_take_group(&emit->state, state, FD6_GROUP_LRZ);
break;
@@ -636,7 +664,7 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
fd6_state_take_group(&emit->state, state, FD6_GROUP_PROG_FB_RAST);
break;
case FD6_GROUP_BLEND:
-state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)
+state = fd6_blend_variant<CHIP>(ctx->blend, pfb->samples, ctx->sample_mask)
->stateobj;
fd6_state_add_group(&emit->state, state, FD6_GROUP_BLEND);
break;
@@ -703,7 +731,7 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_TEX);
break;
case FD6_GROUP_SO:
-fd6_emit_streamout(ring, emit);
+fd6_emit_streamout<CHIP>(ring, emit);
break;
case FD6_GROUP_PRIM_MODE_SYSMEM:
state = build_prim_mode(emit, ctx, false);
@@ -784,7 +812,7 @@ void
fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem)
{
const struct fd6_gmem_config *cfg = gmem ? &screen->config_gmem : &screen->config_sysmem;
-enum a6xx_ccu_cache_size color_cache_size =
+enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL :
(enum a6xx_ccu_cache_size)(screen->info->a6xx.gmem_ccu_color_cache_fraction);
uint32_t color_offset = cfg->color_ccu_offset & 0x1fffff;
uint32_t color_offset_hi = cfg->color_ccu_offset >> 21;
@@ -815,7 +843,8 @@ fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gme
}
} else {
OUT_REG(ring,
-A6XX_RB_CCU_CNTL(
+RB_CCU_CNTL(
+CHIP,
.gmem_fast_clear_disable =
!screen->info->a6xx.has_gmem_fast_clear,
.concurrent_resolve =
@@ -850,7 +879,8 @@ template <chip CHIP>
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
-struct fd_screen *screen = batch->ctx->screen;
+struct fd_context *ctx = batch->ctx;
+struct fd_screen *screen = ctx->screen;
if (!batch->nondraw) {
trace_start_state_restore(&batch->trace, ring);
@@ -864,39 +894,107 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
OUT_PKT7(ring, CP_SET_MODE, 1);
OUT_RING(ring, 0);
-fd6_cache_inv<CHIP>(batch->ctx, ring);
+if (CHIP == A6XX) {
+fd6_cache_inv<CHIP>(ctx, ring);
+} else {
+OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
+OUT_RING(ring, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
+CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
+fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_COLOR);
+fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_DEPTH);
+OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+OUT_RING(ring, UNK_40);
+fd6_event_write<CHIP>(ctx, ring, FD_CACHE_INVALIDATE);
+OUT_WFI5(ring);
+}
OUT_REG(ring,
-HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true,
-.ds_state = true, .gs_state = true,
-.fs_state = true, .cs_state = true,
-.cs_ibo = true, .gfx_ibo = true,
-.cs_shared_const = true,
-.gfx_shared_const = true,
-.cs_bindless = 0x1f, .gfx_bindless = 0x1f));
+HLSQ_INVALIDATE_CMD(CHIP,
+.vs_state = true, .hs_state = true,
+.ds_state = true, .gs_state = true,
+.fs_state = true, .cs_state = true,
+.cs_ibo = true, .gfx_ibo = true,
+.cs_shared_const = true,
+.gfx_shared_const = true,
+.cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
+)
+);
OUT_WFI5(ring);
+if (CHIP >= A7XX) {
+/* On A7XX, RB_CCU_CNTL was broken into two registers, RB_CCU_CNTL which has
+ * static properties that can be set once, this requires a WFI to take effect.
+ * While the newly introduced register RB_CCU_CNTL2 has properties that may
+ * change per-RP and don't require a WFI to take effect, only CCU inval/flush
+ * events are required.
+ */
+OUT_REG(ring,
+RB_CCU_CNTL(
+CHIP,
+.gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear,
+.concurrent_resolve = screen->info->a6xx.concurrent_resolve,
+)
+);
+OUT_WFI5(ring);
+}
fd6_emit_ccu_cntl<CHIP>(ring, screen, false);
+for (size_t i = 0; i < ARRAY_SIZE(screen->info->a6xx.magic_raw); i++) {
+auto magic_reg = screen->info->a6xx.magic_raw[i];
+if (!magic_reg.reg)
+break;
+uint32_t value = magic_reg.value;
+switch(magic_reg.reg) {
+case REG_A6XX_TPL1_DBG_ECO_CNTL1:
+value = (value & ~A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT) |
+(screen->info->a7xx.enable_tp_ubwc_flag_hint
+? A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT
+: 0);
+break;
+}
+WRITE(magic_reg.reg, value);
+}
WRITE(REG_A6XX_RB_DBG_ECO_CNTL, screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
WRITE(REG_A6XX_SP_DBG_ECO_CNTL, screen->info->a6xx.magic.SP_DBG_ECO_CNTL);
WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
-WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
+if (CHIP == A6XX)
+WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
-WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
-WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
+if (CHIP == A6XX) {
+WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
+WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
+}
WRITE(REG_A6XX_VPC_DBG_ECO_CNTL, screen->info->a6xx.magic.VPC_DBG_ECO_CNTL);
WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
-WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
+if (CHIP == A6XX)
+WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
WRITE(REG_A6XX_SP_CHICKEN_BITS, screen->info->a6xx.magic.SP_CHICKEN_BITS);
WRITE(REG_A6XX_SP_IBO_COUNT, 0);
WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
-WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
+if (CHIP == A6XX)
+WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12);
WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF);
WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01);
WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0);
+OUT_REG(ring,
+A6XX_SP_MODE_CONTROL(
+.constant_demotion_enable = true,
+.isammode = ISAMMODE_GL,
+.shared_consts_enable = false,
+)
+);
-WRITE(REG_A6XX_SP_MODE_CONTROL,
-A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
@@ -909,12 +1007,16 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);
WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
-WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
+if (CHIP == A6XX) {
+WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
+WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
+WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
+WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
+WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
+WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
+}
WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);
WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
@@ -932,8 +1034,10 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
-WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
-WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
+if (CHIP == A6XX) {
+WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
+WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
+}
WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
/* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
@@ -955,9 +1059,6 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);
-OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
-OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */
/* Clear any potential pending state groups to be safe: */
OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
@@ -969,6 +1070,17 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */
+if (CHIP >= A7XX) {
+OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
+OUT_REG(ring, A7XX_GRAS_LRZ_CNTL2());
+} else {
+OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
+}
+OUT_REG(ring, A6XX_RB_LRZ_CNTL());
+OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL());
+OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
-OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
-OUT_RING(ring, 0x00000000);
@@ -990,13 +1102,12 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
if (batch->tessellation) {
assert(screen->tess_bo);
fd_ringbuffer_attach_bo(ring, screen->tess_bo);
-OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2);
-OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
+OUT_REG(ring, PC_TESSFACTOR_ADDR(CHIP, screen->tess_bo));
/* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
OUT_WFI5(ring);
}
-struct fd6_context *fd6_ctx = fd6_context(batch->ctx);
+struct fd6_context *fd6_ctx = fd6_context(ctx);
struct fd_bo *bcolor_mem = fd6_ctx->bcolor_mem;
OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
@@ -1005,6 +1116,27 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
OUT_PKT4(ring, REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, 2);
OUT_RELOC(ring, bcolor_mem, 0, 0, 0);
/* These regs are blocked (CP_PROTECT) on a6xx: */
if (CHIP >= A7XX) {
OUT_REG(ring,
TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0),
TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4),
TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee),
TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed),
TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0),
);
}
if (CHIP >= A7XX) {
/* Blob sets these two per draw. */
OUT_REG(ring, A7XX_PC_TESS_PARAM_SIZE(FD6_TESS_PARAM_SIZE));
/* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
* but the meaning of this additional space is not known,
* so we play safe and don't add it.
*/
OUT_REG(ring, A7XX_PC_TESS_FACTOR_SIZE(FD6_TESS_FACTOR_SIZE));
}
/* There is an optimization to skip executing draw states for draws with no
* instances. Instead of simply skipping the draw, internally the firmware
* sets a bit in PC_DRAW_INITIATOR that seemingly skips the draw. However


@@ -237,7 +237,7 @@ __event_write(struct fd_ringbuffer *ring, enum fd_gpu_event event,
OUT_RING(ring, CP_EVENT_WRITE7_0_EVENT(info.raw_event) |
CP_EVENT_WRITE7_0_WRITE_SRC(esrc) |
CP_EVENT_WRITE7_0_WRITE_DST(edst) |
-CP_EVENT_WRITE7_0_WRITE_ENABLED);
+COND(info.needs_seqno, CP_EVENT_WRITE7_0_WRITE_ENABLED));
}
if (info.needs_seqno) {

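For reference, COND() above is freedreno's conditional-bitfield helper (assumed definition from freedreno_util.h), so WRITE_ENABLED is now emitted only when the event actually needs a seqno written to the destination:

/* Contribute `val` to the packed dword only when `bool` holds: */
#define COND(bool, val) ((bool) ? (val) : 0)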

@@ -86,7 +86,7 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb,
* the effects of the fragment on the framebuffer contents are undefined."
*/
unsigned max_layer_index = 0;
-enum a6xx_format mrt0_format = (enum a6xx_format)0;
+enum a6xx_format mrt0_format = FMT6_NONE;
for (i = 0; i < pfb->nr_cbufs; i++) {
enum a3xx_color_swap swap = WZYX;
@@ -129,10 +129,13 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb,
/* Batch with no draws? */
fd_ringbuffer_attach_bo(ring, rsc->bo);
-OUT_REG(
-ring,
-RB_MRT_BUF_INFO(CHIP, i, .color_format = format,
-.color_tile_mode = tile_mode, .color_swap = swap),
+OUT_REG(ring,
+RB_MRT_BUF_INFO(CHIP, i,
+.color_format = format,
+.color_tile_mode = tile_mode,
+.color_swap = swap,
+.losslesscompen = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level),
+),
A6XX_RB_MRT_PITCH(i, stride),
A6XX_RB_MRT_ARRAY_PITCH(i, array_stride),
A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset),
@@ -183,8 +186,12 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
/* S8 is implemented as Z32_S8 minus the Z32 plane: */
enum a6xx_depth_format fmt = DEPTH6_32;
-OUT_REG(
-ring, RB_DEPTH_BUFFER_INFO(CHIP, .depth_format = fmt),
+OUT_REG(ring,
+RB_DEPTH_BUFFER_INFO(CHIP,
+.depth_format = fmt,
+.tilemode = TILE6_3,
+.losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
+),
A6XX_RB_DEPTH_BUFFER_PITCH(0),
A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
A6XX_RB_DEPTH_BUFFER_BASE(.qword = 0),
@@ -196,8 +203,12 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
} else {
enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format);
-OUT_REG(
-ring, RB_DEPTH_BUFFER_INFO(CHIP, .depth_format = fmt),
+OUT_REG(ring,
+RB_DEPTH_BUFFER_INFO(CHIP,
+.depth_format = fmt,
+.tilemode = TILE6_3,
+.losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
+),
A6XX_RB_DEPTH_BUFFER_PITCH(stride),
A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(array_stride),
A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset),
@@ -208,11 +219,6 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
fd6_emit_flag_reference(ring, rsc, zsbuf->u.tex.level,
zsbuf->u.tex.first_layer);
-/* NOTE: blob emits GRAS_LRZ_CNTL plus GRAZ_LRZ_BUFFER_BASE
-* plus this CP_EVENT_WRITE at the end in it's own IB..
-*/
-fd6_event_write<CHIP>(ctx, ring, FD_LRZ_CLEAR);
}
if (stencil) {
@@ -224,11 +230,17 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
fd_ringbuffer_attach_bo(ring, stencil->bo);
-OUT_REG(ring, RB_STENCIL_INFO(CHIP, .separate_stencil = true),
-A6XX_RB_STENCIL_BUFFER_PITCH(stride),
-A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride),
-A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset),
-A6XX_RB_STENCIL_BUFFER_BASE_GMEM(base));
+OUT_REG(ring,
+RB_STENCIL_INFO(
+CHIP,
+.separate_stencil = true,
+.tilemode = TILE6_3,
+),
+A6XX_RB_STENCIL_BUFFER_PITCH(stride),
+A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride),
+A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset),
+A6XX_RB_STENCIL_BUFFER_BASE_GMEM(base)
+);
} else {
OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
}
@@ -247,13 +259,6 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_REG(ring,
A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
-OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE, 5);
-OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_BASE_LO */
-OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_BASE_HI */
-OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */
-OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */
-OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */
OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
}
}
@@ -269,6 +274,8 @@ emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass)
OUT_REG(ring, A6XX_GRAS_LRZ_BUFFER_BASE(),
A6XX_GRAS_LRZ_BUFFER_PITCH(),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());
if (CHIP >= A7XX)
OUT_REG(ring, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
return;
}
@@ -290,6 +297,14 @@ emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass)
),
);
fd_ringbuffer_attach_bo(ring, subpass->lrz);
if (CHIP >= A7XX) {
OUT_REG(ring,
A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
.depth_format = fd6_pipe2depth(pfb->zsbuf->format),
)
);
}
}
/* Emit any needed lrz clears to the prologue cmds
@@ -437,6 +452,7 @@ patch_fb_read_gmem(struct fd_batch *batch)
util_dynarray_clear(&batch->fb_read_patches);
}
template <chip CHIP>
static void
patch_fb_read_sysmem(struct fd_batch *batch)
{
@@ -462,7 +478,7 @@ patch_fb_read_sysmem(struct fd_batch *batch)
fdl6_get_ubwc_blockwidth(&rsc->layout, &block_width, &block_height);
struct fdl_view_args args = {
-.chip = A6XX,
+.chip = CHIP,
.iova = fd_bo_get_iova(rsc->bo),
@@ -496,6 +512,24 @@ update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb,
bool binning)
{
struct fd_ringbuffer *ring = batch->gmem;
if (CHIP >= A7XX) {
OUT_REG(ring,
RB_RENDER_CNTL(
CHIP,
.binning = binning,
.raster_mode = TYPE_TILED,
.raster_direction = LR_TB
)
);
OUT_REG(ring,
A7XX_GRAS_SU_RENDER_CNTL(
.binning = binning,
)
);
return;
}
struct fd_screen *screen = batch->ctx->screen;
bool depth_ubwc_enable = false;
uint32_t mrts_ubwc_enable = 0;
@@ -732,6 +766,7 @@ template <chip CHIP>
static void
emit_common_init(struct fd_batch *batch)
{
struct fd_context *ctx = batch->ctx;
struct fd_ringbuffer *ring = batch->gmem;
struct fd_autotune *at = &batch->ctx->autotune;
struct fd_batch_result *result = batch->autotune_result;
@@ -744,16 +779,34 @@ emit_common_init(struct fd_batch *batch)
OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);
-OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
-OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));
+if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
+OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
+OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));
-fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
+fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);
+/* Copied from blob's cmdstream, not sure why it is done. */
+if (CHIP == A7XX) {
+fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
+}
+} else {
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+),
+EV_DST_RAM_CP_EVENT_WRITE7_1(
+results_ptr(at, result[result->idx].samples_start)
+),
+);
+}
}
template <chip CHIP>
static void
emit_common_fini(struct fd_batch *batch)
{
+struct fd_context *ctx = batch->ctx;
struct fd_ringbuffer *ring = batch->gmem;
struct fd_autotune *at = &batch->ctx->autotune;
struct fd_batch_result *result = batch->autotune_result;
@@ -763,16 +816,30 @@ emit_common_fini(struct fd_batch *batch)
if (!result)
return;
// TODO attach directly to submit:
fd_ringbuffer_attach_bo(ring, at->results_mem);
OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);
-OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
-OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end));
+if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
+OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
+OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end));
+fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
+} else {
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+.sample_count_end_offset = true,
+.write_accum_sample_count_diff = true,
+),
+EV_DST_RAM_CP_EVENT_WRITE7_1(
+results_ptr(at, result[result->idx].samples_start)
+),
+);
+}
-fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
fd6_fence_write<CHIP>(ring, result->fence, results_ptr(at, fence));
}
@@ -852,13 +919,22 @@ set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem,
unsigned w = gmem ? gmem->bin_w : 0;
unsigned h = gmem ? gmem->bin_h : 0;
-OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
-.binw = w, .binh = h,
-.render_mode = p.render_mode,
-.force_lrz_write_dis = p.force_lrz_write_dis,
-.buffers_location = p.buffers_location,
-.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
-));
+if (CHIP == A6XX) {
+OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
+.binw = w, .binh = h,
+.render_mode = p.render_mode,
+.force_lrz_write_dis = p.force_lrz_write_dis,
+.buffers_location = p.buffers_location,
+.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
+));
+} else {
+OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
+.binw = w, .binh = h,
+.render_mode = p.render_mode,
+.force_lrz_write_dis = p.force_lrz_write_dis,
+.lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
+));
+}
OUT_REG(ring, RB_BIN_CONTROL(
CHIP,
.binw = w, .binh = h,
@@ -1036,6 +1112,14 @@ fd6_emit_tile_init(struct fd_batch *batch) assert_dt
emit_msaa(ring, pfb->samples);
patch_fb_read_gmem(batch);
if (CHIP >= A7XX) {
OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x0));
OUT_REG(ring, A7XX_RB_UNKNOWN_8E06(0x0));
OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
}
if (use_hw_binning(batch)) {
/* enable stream-out during binning pass: */
OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));
@@ -1257,6 +1341,9 @@ emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base,
psurf->u.tex.first_layer);
}
if (CHIP >= A7XX)
OUT_REG(ring, A7XX_RB_UNKNOWN_88E4(.unk0 = 1));
fd6_emit_blit<CHIP>(batch->ctx, ring);
}
@@ -1357,6 +1444,9 @@ emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass)
OUT_RING(ring, uc.ui[2]);
OUT_RING(ring, uc.ui[3]);
if (CHIP >= A7XX)
OUT_REG(ring, A7XX_RB_UNKNOWN_88E4(.unk0 = 1));
fd6_emit_blit<CHIP>(batch->ctx, ring);
}
}
@@ -1851,6 +1941,14 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
.buffers_location = BUFFERS_IN_SYSMEM,
});
if (CHIP >= A7XX) {
OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem
OUT_REG(ring, A7XX_RB_UNKNOWN_8E06(batch->ctx->screen->info->a6xx.magic.RB_UNKNOWN_8E06));
OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
}
emit_marker6(ring, 7);
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
@@ -1872,7 +1970,7 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, NULL);
emit_mrt<CHIP>(ring, pfb, NULL);
emit_msaa(ring, pfb->samples);
-patch_fb_read_sysmem(batch);
+patch_fb_read_sysmem<CHIP>(batch);
emit_common_init<CHIP>(batch);
}


@@ -73,7 +73,7 @@ fd6_image_descriptor(struct fd_context *ctx, const struct pipe_image_view *buf,
size);
} else {
struct fdl_view_args args = {
-.chip = A6XX,
+.chip = ctx->screen->gen,
.iova = rsc_iova(buf->resource, 0),
@@ -259,7 +259,12 @@ fd6_build_bindless_state(struct fd_context *ctx, enum pipe_shader_type shader,
fd_ringbuffer_attach_bo(ring, set->bo);
if (shader == PIPE_SHADER_COMPUTE) {
-OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .cs_bindless = 0x1f));
+OUT_REG(ring,
+HLSQ_INVALIDATE_CMD(
+CHIP,
+.cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+)
+);
OUT_REG(ring, SP_CS_BINDLESS_BASE_DESCRIPTOR(CHIP,
idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo,
));
@@ -301,13 +306,20 @@ fd6_build_bindless_state(struct fd_context *ctx, enum pipe_shader_type shader,
);
}
} else {
-OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .gfx_bindless = 0x1f));
+OUT_REG(ring,
+HLSQ_INVALIDATE_CMD(
+CHIP,
+.gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
+)
+);
OUT_REG(ring, SP_BINDLESS_BASE_DESCRIPTOR(CHIP,
idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo,
));
-OUT_REG(ring, A6XX_HLSQ_BINDLESS_BASE_DESCRIPTOR(
-idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo,
-));
+if (CHIP == A6XX) {
+OUT_REG(ring, A6XX_HLSQ_BINDLESS_BASE_DESCRIPTOR(
+idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo,
+));
+}
if (bufso->enabled_mask) {
OUT_PKT(ring, CP_LOAD_STATE6,

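A note on the 0x1f -> 0xff widening above: the invalidate masks appear to carry one bit per bindless descriptor base, which would read as follows (an inference from this diff, not confirmed by register docs here):

/* cs_bindless / gfx_bindless invalidate masks, assumed one bit per base:
 *   a6xx: 0x1f = 0b00011111 -> 5 bindless descriptor bases
 *   a7xx: 0xff = 0b11111111 -> 8 bindless descriptor bases
 */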

@@ -27,6 +27,8 @@
#define FD_BO_NO_HARDPIN 1
#include <initializer_list>
#include "pipe/p_state.h"
#include "util/bitset.h"
#include "util/format/u_format.h"
@@ -58,50 +60,62 @@ struct program_builder {
bool binning_pass;
};
-static const struct xs_config {
+template <chip CHIP>
+struct xs_config {
uint16_t reg_sp_xs_instrlen;
uint16_t reg_hlsq_xs_ctrl;
uint16_t reg_sp_xs_first_exec_offset;
uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
-} xs_config[] = {
+uint16_t reg_sp_xs_vgpr_config;
+};
+template <chip CHIP>
+static const struct xs_config<CHIP> xs_configs[] = {
[MESA_SHADER_VERTEX] = {
REG_A6XX_SP_VS_INSTRLEN,
-REG_A6XX_HLSQ_VS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_VS_VGPR_CONFIG,
},
[MESA_SHADER_TESS_CTRL] = {
REG_A6XX_SP_HS_INSTRLEN,
-REG_A6XX_HLSQ_HS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_HS_VGPR_CONFIG,
},
[MESA_SHADER_TESS_EVAL] = {
REG_A6XX_SP_DS_INSTRLEN,
-REG_A6XX_HLSQ_DS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_DS_VGPR_CONFIG,
},
[MESA_SHADER_GEOMETRY] = {
REG_A6XX_SP_GS_INSTRLEN,
-REG_A6XX_HLSQ_GS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_GS_VGPR_CONFIG,
},
[MESA_SHADER_FRAGMENT] = {
REG_A6XX_SP_FS_INSTRLEN,
-REG_A6XX_HLSQ_FS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_FS_VGPR_CONFIG,
},
[MESA_SHADER_COMPUTE] = {
REG_A6XX_SP_CS_INSTRLEN,
-REG_A6XX_HLSQ_CS_CNTL,
+CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
+REG_A7XX_SP_CS_VGPR_CONFIG,
},
};
+template <chip CHIP>
void
fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
const struct ir3_shader_variant *so)
@@ -189,7 +203,7 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
unreachable("bad shader stage");
}
-const struct xs_config *cfg = &xs_config[type];
+const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[type];
OUT_PKT4(ring, cfg->reg_sp_xs_instrlen, 1);
OUT_RING(ring, so->instrlen);
@@ -221,20 +235,28 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_PKT4(ring, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size));
-uint32_t shader_preload_size =
-MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);
+if (CHIP >= A7XX) {
+OUT_PKT4(ring, cfg->reg_sp_xs_vgpr_config, 1);
+OUT_RING(ring, 0);
+}
-enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
-OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
-OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
-CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
-CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
-CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
-OUT_RELOC(ring, so->bo, 0, 0, 0);
+if (CHIP == A6XX) {
+uint32_t shader_preload_size =
+MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);
+enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
+OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
+OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
+CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
+CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
+CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
+OUT_RELOC(ring, so->bo, 0, 0, 0);
+}
fd6_emit_immediates(so, ring);
}
+FD_GENX(fd6_emit_shader);
/**
* Build a pre-baked state-obj to disable SO, so that we aren't dynamically
@@ -577,6 +599,7 @@ emit_vs_system_values(struct fd_ringbuffer *ring,
OUT_RING(ring, COND(b->fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */
}
template <chip CHIP>
static void
emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
{
@@ -824,6 +847,11 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid));
if (CHIP >= A7XX) {
OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
OUT_REG(ring, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
}
OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(b->fs->total_in) |
COND(b->fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
@@ -848,7 +876,7 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
} else {
fd6_emit_link_map(b->vs, b->gs, ring);
}
-vertices_out = b->gs->gs.vertices_out - 1;
+vertices_out = MAX2(1, b->gs->gs.vertices_out) - 1;
enum a6xx_tess_output output =
primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive);
invocations = b->gs->gs.invocations - 1;
@@ -862,8 +890,18 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
-OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1);
-OUT_RING(ring, 0xff);
+if (CHIP >= A7XX) {
+OUT_REG(ring,
+A7XX_VPC_PRIMITIVE_CNTL_5(
+.gs_vertices_out = vertices_out,
+.gs_invocations = invocations,
+.gs_output = output,
+)
+);
+} else {
+OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1);
+OUT_RING(ring, 0xff);
+}
OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
@@ -918,6 +956,8 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
COND(fs->prefetch_end_of_quad,
@@ -927,8 +967,12 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
OUT_RING(ring, SP_FS_PREFETCH_CMD(
CHIP, i,
.src = prefetch->src,
-.samp_id = prefetch->samp_id,
-.tex_id = prefetch->tex_id,
+/* For a7xx, samp_id/tex_id is always in SP_FS_BINDLESS_PREFETCH_CMD[n]
+ * even in the non-bindless case (which probably makes the reg name
+ * wrong)
+ */
+.samp_id = (CHIP == A6XX) ? prefetch->samp_id : 0,
+.tex_id = (CHIP == A6XX) ? prefetch->tex_id : 0,
.dst = prefetch->dst,
.wrmask = prefetch->wrmask,
.half = prefetch->half_precision,
@@ -938,6 +982,18 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
);
}
if (CHIP == A7XX) {
for (int i = 0; i < fs->num_sampler_prefetch; i++) {
const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
OUT_REG(ring,
A6XX_SP_FS_BINDLESS_PREFETCH_CMD(i,
.samp_id = prefetch->samp_id,
.tex_id = prefetch->tex_id,
)
);
}
}
OUT_REG(ring,
HLSQ_CONTROL_1_REG(CHIP,
b->ctx->screen->info->a6xx.prim_alloc_threshold),
@@ -969,6 +1025,36 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
),
);
if (CHIP >= A7XX) {
uint32_t sysval_regs = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
if (VALIDREG(ij_regid[i])) {
if (i == IJ_PERSP_CENTER_RHW)
sysval_regs += 1;
else
sysval_regs += 2;
}
}
for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
if (VALIDREG(sysval))
sysval_regs += 1;
}
for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
if (VALIDREG(sysval))
sysval_regs += 2;
}
OUT_REG(ring,
A7XX_HLSQ_UNKNOWN_A9AE(
.sysval_regs_count = sysval_regs,
.unk8 = 1,
.unk9 = 1,
)
);
}
enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
OUT_REG(ring,
HLSQ_FS_CNTL_0(
@@ -1084,19 +1170,19 @@ static void
setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b)
assert_dt
{
-fd6_emit_shader(b->ctx, ring, b->vs);
-fd6_emit_shader(b->ctx, ring, b->hs);
-fd6_emit_shader(b->ctx, ring, b->ds);
-fd6_emit_shader(b->ctx, ring, b->gs);
+fd6_emit_shader<CHIP>(b->ctx, ring, b->vs);
+fd6_emit_shader<CHIP>(b->ctx, ring, b->hs);
+fd6_emit_shader<CHIP>(b->ctx, ring, b->ds);
+fd6_emit_shader<CHIP>(b->ctx, ring, b->gs);
if (!b->binning_pass)
-fd6_emit_shader(b->ctx, ring, b->fs);
+fd6_emit_shader<CHIP>(b->ctx, ring, b->fs);
OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
OUT_RING(ring, 0);
emit_vfd_dest(ring, b->vs);
-emit_vpc(ring, b);
+emit_vpc<CHIP>(ring, b);
emit_fs_inputs<CHIP>(ring, b);
emit_fs_outputs(ring, b);

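A worked example of the sysval_regs count computed above for HLSQ_UNKNOWN_A9AE (a hypothetical fragment shader; the values follow directly from the loop logic):

/* ij_regid[IJ_PERSP_PIXEL] valid      -> +2 (i,j pair)
 * ij_regid[IJ_PERSP_CENTER_RHW] valid -> +1 (single-reg special case)
 * face_regid valid                    -> +1
 * coord_regid valid                   -> +2
 * zwcoord_regid valid                 -> +2
 * => sysval_regs = 8
 */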

@@ -99,6 +99,7 @@ fd6_last_shader(const struct fd6_program_state *state)
return state->vs;
}
template <chip CHIP>
void fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
const struct ir3_shader_variant *so) assert_dt;


@@ -36,6 +36,8 @@
#include "fd6_emit.h"
#include "fd6_query.h"
#include "fd6_pack.h"
/* g++ is picky about offsets that cannot be resolved at compile time, so
* roll our own __offsetof()
*/
@@ -75,6 +77,7 @@ template <chip CHIP>
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
struct fd_context *ctx = batch->ctx;
struct fd_ringbuffer *ring = batch->draw;
ASSERT_ALIGNED(struct fd6_query_sample, start, 16);
@@ -82,55 +85,109 @@ occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);
-OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
-OUT_RELOC(ring, query_sample(aq, start));
+if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
+OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
+OUT_RELOC(ring, query_sample(aq, start));
+fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);
+/* Copied from blob's cmdstream, not sure why it is done. */
+if (CHIP == A7XX) {
+fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
+}
+} else {
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+),
+EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
+);
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+.sample_count_end_offset = true,
+.write_accum_sample_count_diff = true,
+),
+EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
+);
+}
-fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
}
template <chip CHIP>
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
+struct fd_context *ctx = batch->ctx;
struct fd_ringbuffer *ring = batch->draw;
-OUT_PKT7(ring, CP_MEM_WRITE, 4);
-OUT_RELOC(ring, query_sample(aq, stop));
-OUT_RING(ring, 0xffffffff);
-OUT_RING(ring, 0xffffffff);
+if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
+OUT_PKT7(ring, CP_MEM_WRITE, 4);
+OUT_RELOC(ring, query_sample(aq, stop));
+OUT_RING(ring, 0xffffffff);
+OUT_RING(ring, 0xffffffff);
-OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
+OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
+}
OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);
ASSERT_ALIGNED(struct fd6_query_sample, stop, 16);
-OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
-OUT_RELOC(ring, query_sample(aq, stop));
+if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
+OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
+OUT_RELOC(ring, query_sample(aq, stop));
-fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
+fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
-/* To avoid stalling in the draw buffer, emit code the code to compute the
-* counter delta in the epilogue ring.
-*/
-struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);
+/* To avoid stalling in the draw buffer, emit the code to compute the
+* counter delta in the epilogue ring.
+*/
+struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);
-OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
-OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
-CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
-OUT_RELOC(epilogue, query_sample(aq, stop));
-OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
-OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
-OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
+OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
+OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
+CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
+OUT_RELOC(epilogue, query_sample(aq, stop));
+OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
+OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
+OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
-/* result += stop - start: */
-OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
-OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
-OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
-OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
-OUT_RELOC(epilogue, query_sample(aq, stop)); /* srcB */
-OUT_RELOC(epilogue, query_sample(aq, start)); /* srcC */
+/* result += stop - start: */
+OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
+OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
+OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
+OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
+OUT_RELOC(epilogue, query_sample(aq, stop)); /* srcB */
+OUT_RELOC(epilogue, query_sample(aq, start)); /* srcC */
+} else {
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+),
+EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, stop)),
+);
+OUT_PKT(ring, CP_EVENT_WRITE7,
+CP_EVENT_WRITE7_0(
+.event = ZPASS_DONE,
+.write_sample_count = true,
+.sample_count_end_offset = true,
+.write_accum_sample_count_diff = true,
+),
+/* Note: SQE is adding offsets to the iova, SAMPLE_COUNT_END_OFFSET causes
+ * the result to be written to iova+16, and WRITE_ACCUM_SAMP_COUNT_DIFF
+ * does *(iova + 8) += *(iova + 16) - *iova
+ *
+ * It just so happens this is the layout we already to for start/result/stop
+ * So we just give the start address in all cases.
+ */
+EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
+);
+}
}
static void

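To make the accumulate note above concrete, a sketch of the sample layout it relies on (field order taken from the query_sample(aq, start/result/stop) usage in this file):

struct fd6_query_sample {
   uint64_t start;   /* iova + 0:  begin count written by ZPASS_DONE */
   uint64_t result;  /* iova + 8:  WRITE_ACCUM_SAMP_COUNT_DIFF does
                      *            *(iova + 8) += *(iova + 16) - *(iova + 0) */
   uint64_t stop;    /* iova + 16: written when SAMPLE_COUNT_END_OFFSET is set */
};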

@@ -31,6 +31,8 @@
#include "util/u_memory.h"
#include "util/u_string.h"
#include "freedreno_state.h"
#include "fd6_context.h"
#include "fd6_pack.h"
#include "fd6_rasterizer.h"
@@ -41,7 +43,8 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx,
const struct pipe_rasterizer_state *cso,
bool primitive_restart)
{
-struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 26 * 4);
+unsigned ndwords = (CHIP >= A7XX) ? 66 : 26;
+struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, ndwords * 4);
float psize_min, psize_max;
if (cso->point_size_per_vertex) {
@@ -57,7 +60,7 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx,
A6XX_GRAS_CL_CNTL(
.znear_clip_disable = !cso->depth_clip_near,
.zfar_clip_disable = !cso->depth_clip_far,
-.z_clamp_enable = cso->depth_clamp,
+.z_clamp_enable = cso->depth_clamp || CHIP >= A7XX,
.zero_gb_scale_z = cso->clip_halfz,
.vp_clip_code_ignore = 1,
),
@@ -89,6 +92,15 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx,
),
);
if (CHIP >= A7XX) {
OUT_REG(ring,
A7XX_VPC_PRIMITIVE_CNTL_0(
.primitive_restart = primitive_restart,
.provoking_vtx_last = !cso->flatshade_first,
),
);
}
enum a6xx_polygon_mode mode = POLYMODE6_TRIANGLES;
switch (cso->fill_front) {
case PIPE_POLYGON_MODE_POINT:
@@ -105,7 +117,34 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx,
OUT_REG(ring, A6XX_VPC_POLYGON_MODE(mode));
OUT_REG(ring, PC_POLYGON_MODE(CHIP, mode));
-if (ctx->screen->info->a6xx.has_shading_rate) {
+if (CHIP == A7XX) {
+OUT_REG(ring, A7XX_VPC_POLYGON_MODE2(mode));
+}
+/* With a7xx the hw doesn't do the clamping for us. When depth clamp
+ * is enabled, this gets emitted in fd6_emit_non_ring() due to
+ * dependency on viewport state. But when it is disabled there is
+ * no dependency on external state (other than to know the max
+ * number of viewports, here we just assume the max) so we can emit
+ * this state here:
+ */
+if (CHIP >= A7XX && !fd_rast_depth_clamp_enabled(cso)) {
+/* We must assume the max: */
+const unsigned num_viewports = 16;
+OUT_PKT4(ring, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewports * 2);
+for (unsigned i = 0; i < num_viewports; i++) {
+OUT_RING(ring, fui(0.0f));
+OUT_RING(ring, fui(1.0f));
+}
+OUT_REG(ring,
+A6XX_RB_Z_CLAMP_MIN(0.0f),
+A6XX_RB_Z_CLAMP_MAX(1.0),
+);
+}
+if (CHIP == A6XX && ctx->screen->info->a6xx.has_shading_rate) {
OUT_REG(ring, A6XX_RB_UNKNOWN_8A00());
OUT_REG(ring, A6XX_RB_UNKNOWN_8A10());
OUT_REG(ring, A6XX_RB_UNKNOWN_8A20());


@@ -70,6 +70,14 @@ ok_ubwc_format(struct pipe_screen *pscreen, enum pipe_format pfmt)
break;
}
/* In copy_format, we treat snorm as unorm to avoid clamping. But snorm
* and unorm are UBWC incompatible for special values such as all 0's or
* all 1's prior to a740. Disable UBWC for snorm.
*/
if (util_format_is_snorm(pfmt) &&
!info->a7xx.ubwc_unorm_snorm_int_compatible)
return false;
/* A690 seem to have broken UBWC for depth/stencil, it requires
* depth flushing where we cannot realistically place it, like between
* ordinary draw calls writing read/depth. WSL blob seem to use ubwc


@@ -436,7 +436,7 @@ fd6_sampler_view_update(struct fd_context *ctx,
fdl6_buffer_view_init(so->descriptor, cso->format, swiz, iova, size);
} else {
struct fdl_view_args args = {
-.chip = A6XX,
+.chip = ctx->screen->gen,
/* Using relocs for addresses still */
.iova = 0,


@@ -90,6 +90,7 @@ update_lrz_stencil(struct fd6_zsa_stateobj *so, enum pipe_compare_func func,
}
}
template <chip CHIP>
void *
fd6_zsa_state_create(struct pipe_context *pctx,
const struct pipe_depth_stencil_alpha_state *cso)
@@ -238,6 +239,7 @@ fd6_zsa_state_create(struct pipe_context *pctx,
/* Build the four state permutations (with/without alpha/depth-clamp)*/
for (int i = 0; i < 4; i++) {
struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 12 * 4);
bool depth_clamp_enable = (i & FD6_ZSA_DEPTH_CLAMP);
OUT_PKT4(ring, REG_A6XX_RB_ALPHA_CONTROL, 1);
OUT_RING(ring,
@@ -250,21 +252,31 @@ fd6_zsa_state_create(struct pipe_context *pctx,
OUT_PKT4(ring, REG_A6XX_RB_DEPTH_CNTL, 1);
OUT_RING(ring,
-so->rb_depth_cntl | COND(i & FD6_ZSA_DEPTH_CLAMP,
+so->rb_depth_cntl | COND(depth_clamp_enable || CHIP >= A7XX,
A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE));
OUT_PKT4(ring, REG_A6XX_RB_STENCILMASK, 2);
OUT_RING(ring, so->rb_stencilmask);
OUT_RING(ring, so->rb_stencilwrmask);
-OUT_REG(ring, A6XX_RB_Z_BOUNDS_MIN(cso->depth_bounds_min),
-A6XX_RB_Z_BOUNDS_MAX(cso->depth_bounds_max));
+if (CHIP >= A7XX && !depth_clamp_enable) {
+OUT_REG(ring,
+A6XX_RB_Z_BOUNDS_MIN(0.0f),
+A6XX_RB_Z_BOUNDS_MAX(1.0f),
+);
+} else {
+OUT_REG(ring,
+A6XX_RB_Z_BOUNDS_MIN(cso->depth_bounds_min),
+A6XX_RB_Z_BOUNDS_MAX(cso->depth_bounds_max),
+);
+}
so->stateobj[i] = ring;
}
return so;
}
+FD_GENX(fd6_zsa_state_create);
void
fd6_zsa_state_delete(struct pipe_context *pctx, void *hwcso)


@@ -35,8 +35,6 @@
#include "fd6_context.h"
-BEGINC;
#define FD6_ZSA_NO_ALPHA (1 << 0)
#define FD6_ZSA_DEPTH_CLAMP (1 << 1)
@@ -82,11 +80,10 @@ fd6_zsa_state(struct fd_context *ctx, bool no_alpha, bool depth_clamp) assert_dt
return fd6_zsa_stateobj(ctx->zsa)->stateobj[variant];
}
+template <chip CHIP>
void *fd6_zsa_state_create(struct pipe_context *pctx,
const struct pipe_depth_stencil_alpha_state *cso);
void fd6_zsa_state_delete(struct pipe_context *pctx, void *hwcso);
-ENDC;
#endif /* FD6_ZSA_H_ */


@@ -126,7 +126,7 @@ struct fd_autotune_results {
*/
struct {
uint64_t samples_start;
-uint64_t __pad0;
+uint64_t samples_result;
uint64_t samples_end;
uint64_t __pad1;
} result[127];


@@ -134,7 +134,7 @@ fd_acc_end_query(struct fd_context *ctx, struct fd_query *q) assert_dt
/* mark the result available: */
struct fd_batch *batch = fd_context_batch(ctx);
-struct fd_ringbuffer *ring = batch->draw;
+struct fd_ringbuffer *ring = fd_batch_get_tile_epilogue(batch);
struct fd_resource *rsc = fd_resource(aq->prsc);
if (ctx->screen->gen < 5) {


@@ -1201,6 +1201,10 @@ fd_screen_create(int fd,
screen->dev_info = info;
screen->info = &screen->dev_info;
/* HACK: disable lrz for now on a7xx: */
if (screen->gen == 7)
fd_mesa_debug |= FD_DBG_NOLRZ;
/* explicitly checking for GPU revisions that are known to work. This
* may be overly conservative for a3xx, where spoofing the gpu_id with
* the blob driver seems to generate identical cmdstream dumps. But
@@ -1226,6 +1230,7 @@ fd_screen_create(int fd,
fd5_screen_init(pscreen);
break;
case 6:
case 7:
fd6_screen_init(pscreen);
break;
default:


@@ -274,7 +274,7 @@ is_a5xx(struct fd_screen *screen)
static inline bool
is_a6xx(struct fd_screen *screen)
{
-return screen->gen == 6;
+return screen->gen >= 6;
}
/* is it using the ir3 compiler (shader isa introduced with a3xx)? */


@@ -56,11 +56,16 @@ fd_blend_enabled(struct fd_context *ctx, unsigned n) assert_dt
return ctx->blend && ctx->blend->rt[n].blend_enable;
}
+static inline bool
+fd_rast_depth_clamp_enabled(const struct pipe_rasterizer_state *cso)
+{
+return !(cso->depth_clip_near && cso->depth_clip_far);
+}
static inline bool
fd_depth_clamp_enabled(struct fd_context *ctx) assert_dt
{
-return !(ctx->rasterizer->depth_clip_near &&
-ctx->rasterizer->depth_clip_far);
+return fd_rast_depth_clamp_enabled(ctx->rasterizer);
}
void fd_set_shader_buffers(struct pipe_context *pctx,