i965: Port gen7+ 3DSTATE_SOL to genxml.

Emit 3DSTATE_SOL on Gen7+ using the brw_batch_emit helper, which uses
pack structs from genxml.
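
For reference, the call pattern this port adopts looks roughly like the
sketch below; the field assignments are abridged from the Gen8
3DSTATE_SO_BUFFER hunk in this commit, and render_bo() is one of the
brw_address helpers added in v2 (this is an illustrative sketch, not the
complete buffer setup):

   /* Minimal sketch of the genxml-based emission pattern (abridged from
    * the 3DSTATE_SO_BUFFER hunk below). */
   brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
      sob.SOBufferIndex = i;                         /* one of the 4 SO buffers */
      sob.SurfaceBaseAddress = render_bo(bo, start); /* brw_address helper (v2) */
   #if GEN_GEN >= 8
      sob.SOBufferMOCS = mocs_wb;                    /* renamed from MOCS (v3) */
   #endif
   }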

v2:
   - Add helpers to assign struct brw_address (Kristian)
v3:
   - Rename MOCS -> SOBufferMOCS
   - Do not re-declare MOCS macros (Ken).
   - Style and code reorganization (Ken).

Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Author: Rafael Antognolli
Date: 2017-03-21 07:30:03 -07:00
Committed by: Kenneth Graunke
Parent: c5d6ee6ccb
Commit: ddc6f4d069
5 changed files with 338 additions and 412 deletions


@@ -111,7 +111,6 @@ i965_FILES = \
gen8_hs_state.c \
gen8_multisample_state.c \
gen8_ps_state.c \
gen8_sol_state.c \
gen8_surface_state.c \
gen8_viewport_state.c \
gen8_vs_state.c \


@@ -135,7 +135,6 @@ extern const struct brw_tracked_state gen7_l3_state;
extern const struct brw_tracked_state gen7_ps_state;
extern const struct brw_tracked_state gen7_push_constant_space;
extern const struct brw_tracked_state gen7_sf_clip_viewport;
extern const struct brw_tracked_state gen7_sol_state;
extern const struct brw_tracked_state gen7_te_state;
extern const struct brw_tracked_state gen7_tes_push_constants;
extern const struct brw_tracked_state gen7_urb;
@@ -299,11 +298,6 @@ void gen8_upload_ps_state(struct brw_context *brw,
void gen8_upload_ps_extra(struct brw_context *brw,
const struct brw_wm_prog_data *prog_data);
/* gen7_sol_state.c */
void gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
const struct brw_vue_map *vue_map);
void gen8_upload_3dstate_so_buffers(struct brw_context *brw);
/* gen8_surface_state.c */
void gen8_init_vtable_surface_functions(struct brw_context *brw);


@@ -35,313 +35,6 @@
#include "intel_buffer_objects.h"
#include "main/transformfeedback.h"
static void
upload_3dstate_so_buffers(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
int i;
/* Set up the up to 4 output buffers. These are the ranges defined in the
* gl_transform_feedback_object.
*/
for (i = 0; i < 4; i++) {
struct intel_buffer_object *bufferobj =
intel_buffer_object(xfb_obj->Buffers[i]);
struct brw_bo *bo;
uint32_t start, end;
uint32_t stride;
if (!xfb_obj->Buffers[i]) {
/* The pitch of 0 in this command indicates that the buffer is
* unbound and won't be written to.
*/
BEGIN_BATCH(4);
OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
OUT_BATCH(0);
OUT_BATCH(0);
ADVANCE_BATCH();
continue;
}
stride = linked_xfb_info->Buffers[i].Stride * 4;
start = xfb_obj->Offset[i];
assert(start % 4 == 0);
end = ALIGN(start + xfb_obj->Size[i], 4);
bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start);
assert(end <= bo->size);
BEGIN_BATCH(4);
OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride);
OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end);
ADVANCE_BATCH();
}
}
/**
* Outputs the 3DSTATE_SO_DECL_LIST command.
*
* The data output is a series of 64-bit entries containing a SO_DECL per
* stream. We only have one stream of rendering coming out of the GS unit, so
* we only emit stream 0 (low 16 bits) SO_DECLs.
*/
void
gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
const struct brw_vue_map *vue_map)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
uint16_t so_decl[MAX_VERTEX_STREAMS][128];
int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int next_offset[BRW_MAX_SOL_BUFFERS] = {0, 0, 0, 0};
int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int max_decls = 0;
STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
memset(so_decl, 0, sizeof(so_decl));
/* Construct the list of SO_DECLs to be emitted. The formatting of the
* command feels strange -- each dword pair contains a SO_DECL per stream.
*/
for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
uint16_t decl = 0;
int varying = linked_xfb_info->Outputs[i].OutputRegister;
const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
unsigned component_mask = (1 << components) - 1;
unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
assert(stream_id < MAX_VERTEX_STREAMS);
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
* gl_Layer is stored in VARYING_SLOT_PSIZ.y
* gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
*/
if (varying == VARYING_SLOT_PSIZ) {
assert(components == 1);
component_mask <<= 3;
} else if (varying == VARYING_SLOT_LAYER) {
assert(components == 1);
component_mask <<= 1;
} else if (varying == VARYING_SLOT_VIEWPORT) {
assert(components == 1);
component_mask <<= 2;
} else {
component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
}
buffer_mask[stream_id] |= 1 << buffer;
decl |= decl_buffer_slot;
if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] <<
SO_DECL_REGISTER_INDEX_SHIFT;
} else {
assert(vue_map->varying_to_slot[varying] >= 0);
decl |= vue_map->varying_to_slot[varying] <<
SO_DECL_REGISTER_INDEX_SHIFT;
}
decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT;
/* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
* array. Instead, it simply increments DstOffset for the following
* input by the number of components that should be skipped.
*
* Our hardware is unusual in that it requires us to program SO_DECLs
* for fake "hole" components, rather than simply taking the offset
* for each real varying. Each hole can have size 1, 2, 3, or 4; we
* program as many size = 4 holes as we can, then a final hole to
* accommodate the final 1, 2, or 3 remaining.
*/
int skip_components =
linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
next_offset[buffer] += skip_components;
while (skip_components >= 4) {
so_decl[stream_id][decls[stream_id]++] =
SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
skip_components -= 4;
}
if (skip_components > 0)
so_decl[stream_id][decls[stream_id]++] =
SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
decl_buffer_slot;
assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
next_offset[buffer] += components;
so_decl[stream_id][decls[stream_id]++] = decl;
if (decls[stream_id] > max_decls)
max_decls = decls[stream_id];
}
BEGIN_BATCH(max_decls * 2 + 3);
OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1));
OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |
(buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |
(buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |
(buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));
OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) |
(decls[1] << SO_NUM_ENTRIES_1_SHIFT) |
(decls[2] << SO_NUM_ENTRIES_2_SHIFT) |
(decls[3] << SO_NUM_ENTRIES_3_SHIFT));
for (int i = 0; i < max_decls; i++) {
/* Stream 1 | Stream 0 */
OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]);
/* Stream 3 | Stream 2 */
OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]);
}
ADVANCE_BATCH();
}
static bool
query_active(struct gl_query_object *q)
{
return q && q->Active;
}
static void
upload_3dstate_streamout(struct brw_context *brw, bool active,
const struct brw_vue_map *vue_map)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0;
int i;
if (active) {
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
int urb_entry_read_offset = 0;
int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
urb_entry_read_offset;
dw1 |= SO_FUNCTION_ENABLE;
dw1 |= SO_STATISTICS_ENABLE;
/* BRW_NEW_RASTERIZER_DISCARD */
if (ctx->RasterDiscard) {
if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
dw1 |= SO_RENDERING_DISABLE;
} else {
perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
"query active relies on the clipper.");
}
}
/* _NEW_LIGHT */
if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
dw1 |= SO_REORDER_TRAILING;
if (brw->gen < 8) {
for (i = 0; i < 4; i++) {
if (xfb_obj->Buffers[i]) {
dw1 |= SO_BUFFER_ENABLE(i);
}
}
}
/* We always read the whole vertex. This could be reduced at some
* point by reading less and offsetting the register index in the
* SO_DECLs.
*/
dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET);
dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH);
dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET);
dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH);
dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET);
dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH);
dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET);
dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH);
if (brw->gen >= 8) {
/* Set buffer pitches; 0 means unbound. */
if (xfb_obj->Buffers[0])
dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
if (xfb_obj->Buffers[1])
dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
if (xfb_obj->Buffers[2])
dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
if (xfb_obj->Buffers[3])
dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
}
}
const int dwords = brw->gen >= 8 ? 5 : 3;
BEGIN_BATCH(dwords);
OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2));
OUT_BATCH(dw1);
OUT_BATCH(dw2);
if (dwords > 3) {
OUT_BATCH(dw3);
OUT_BATCH(dw4);
}
ADVANCE_BATCH();
}
static void
upload_sol_state(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
bool active = _mesa_is_xfb_active_and_unpaused(ctx);
if (active) {
if (brw->gen >= 8)
gen8_upload_3dstate_so_buffers(brw);
else
upload_3dstate_so_buffers(brw);
/* BRW_NEW_VUE_MAP_GEOM_OUT */
gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out);
}
/* Finally, set up the SOL stage. This command must always follow updates to
* the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
* MMIO register updates (currently performed by the kernel at each batch
* emit).
*/
upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out);
}
const struct brw_tracked_state gen7_sol_state = {
.dirty = {
.mesa = _NEW_LIGHT,
.brw = BRW_NEW_BATCH |
BRW_NEW_BLORP |
BRW_NEW_RASTERIZER_DISCARD |
BRW_NEW_VUE_MAP_GEOM_OUT |
BRW_NEW_TRANSFORM_FEEDBACK,
},
.emit = upload_sol_state,
};
void
gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
struct gl_transform_feedback_object *obj)


@@ -1,95 +0,0 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file gen8_sol_state.c
*
* Controls the stream output logic (SOL) stage of the gen8 hardware, which is
* used to implement GL_EXT_transform_feedback.
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "main/transformfeedback.h"
void
gen8_upload_3dstate_so_buffers(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
struct brw_transform_feedback_object *brw_obj =
(struct brw_transform_feedback_object *) xfb_obj;
uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
/* Set up the up to 4 output buffers. These are the ranges defined in the
* gl_transform_feedback_object.
*/
for (int i = 0; i < 4; i++) {
struct intel_buffer_object *bufferobj =
intel_buffer_object(xfb_obj->Buffers[i]);
if (!bufferobj) {
BEGIN_BATCH(8);
OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
ADVANCE_BATCH();
continue;
}
uint32_t start = xfb_obj->Offset[i];
assert(start % 4 == 0);
uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
struct brw_bo *bo =
intel_bufferobj_buffer(brw, bufferobj, start, end - start);
assert(end <= bo->size);
BEGIN_BATCH(8);
OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) |
GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE |
GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE |
(mocs_wb << 22));
OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
OUT_BATCH(xfb_obj->Size[i] / 4 - 1);
OUT_RELOC64(brw_obj->offset_bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
i * sizeof(uint32_t));
if (brw_obj->zero_offsets)
OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */
else
OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */
ADVANCE_BATCH();
}
brw_obj->zero_offsets = false;
}


@@ -31,11 +31,13 @@
#include "brw_util.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_fbo.h"
#include "main/fbobject.h"
#include "main/framebuffer.h"
#include "main/stencil.h"
#include "main/transformfeedback.h"
UNUSED static void *
emit_dwords(struct brw_context *brw, unsigned n)
@@ -80,6 +82,28 @@ __gen_combine_address(struct brw_context *brw, void *location,
}
}
static inline struct brw_address
render_bo(struct brw_bo *bo, uint32_t offset)
{
return (struct brw_address) {
.bo = bo,
.offset = offset,
.read_domains = I915_GEM_DOMAIN_RENDER,
.write_domain = I915_GEM_DOMAIN_RENDER,
};
}
static inline struct brw_address
instruction_bo(struct brw_bo *bo, uint32_t offset)
{
return (struct brw_address) {
.bo = bo,
.offset = offset,
.read_domains = I915_GEM_DOMAIN_INSTRUCTION,
.write_domain = I915_GEM_DOMAIN_INSTRUCTION,
};
}
#include "genxml/genX_pack.h"
#define _brw_cmd_length(cmd) cmd ## _length
@@ -94,11 +118,12 @@ __gen_combine_address(struct brw_context *brw, void *location,
_brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
_dst = NULL)
#define brw_batch_emitn(brw, cmd, n) ({ \
#define brw_batch_emitn(brw, cmd, n, ...) ({ \
uint32_t *_dw = emit_dwords(brw, n); \
struct cmd template = { \
_brw_cmd_header(cmd), \
.DWordLength = n - _brw_cmd_length_bias(cmd), \
__VA_ARGS__ \
}; \
_brw_cmd_pack(cmd)(brw, _dw, &template); \
_dw + 1; /* Array starts at dw[1] */ \
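
With the added __VA_ARGS__, callers of brw_batch_emitn can now seed header
fields with designated initializers when emitting variable-length commands.
A rough sketch of the resulting call shape, abridged from the
3DSTATE_SO_DECL_LIST emission later in this commit (arguments trimmed for
illustration):

   /* Sketch of the variadic brw_batch_emitn() call shape (abridged from the
    * 3DSTATE_SO_DECL_LIST hunk below); the remaining per-entry dwords are
    * packed through the returned pointer with GENX(SO_DECL_ENTRY_pack). */
   uint32_t *dw =
      brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
                      .StreamtoBufferSelects0 = buffer_mask[0],
                      .NumEntries0 = decls[0]);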
@@ -860,6 +885,316 @@ static const struct brw_tracked_state genX(sbe_state) = {
},
.emit = genX(upload_sbe),
};
/* ---------------------------------------------------------------------- */
/**
* Outputs the 3DSTATE_SO_DECL_LIST command.
*
* The data output is a series of 64-bit entries containing a SO_DECL per
* stream. We only have one stream of rendering coming out of the GS unit, so
* we only emit stream 0 (low 16 bits) SO_DECLs.
*/
static void
genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
const struct brw_vue_map *vue_map)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int max_decls = 0;
STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
memset(so_decl, 0, sizeof(so_decl));
/* Construct the list of SO_DECLs to be emitted. The formatting of the
* command feels strange -- each dword pair contains a SO_DECL per stream.
*/
for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
struct GENX(SO_DECL) decl = {0};
int varying = linked_xfb_info->Outputs[i].OutputRegister;
const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
unsigned component_mask = (1 << components) - 1;
unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
unsigned decl_buffer_slot = buffer;
assert(stream_id < MAX_VERTEX_STREAMS);
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
* gl_Layer is stored in VARYING_SLOT_PSIZ.y
* gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
*/
if (varying == VARYING_SLOT_PSIZ) {
assert(components == 1);
component_mask <<= 3;
} else if (varying == VARYING_SLOT_LAYER) {
assert(components == 1);
component_mask <<= 1;
} else if (varying == VARYING_SLOT_VIEWPORT) {
assert(components == 1);
component_mask <<= 2;
} else {
component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
}
buffer_mask[stream_id] |= 1 << buffer;
decl.OutputBufferSlot = decl_buffer_slot;
if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
} else {
assert(vue_map->varying_to_slot[varying] >= 0);
decl.RegisterIndex = vue_map->varying_to_slot[varying];
}
decl.ComponentMask = component_mask;
/* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
* array. Instead, it simply increments DstOffset for the following
* input by the number of components that should be skipped.
*
* Our hardware is unusual in that it requires us to program SO_DECLs
* for fake "hole" components, rather than simply taking the offset
* for each real varying. Each hole can have size 1, 2, 3, or 4; we
* program as many size = 4 holes as we can, then a final hole to
* accommodate the final 1, 2, or 3 remaining.
*/
int skip_components =
linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
next_offset[buffer] += skip_components;
while (skip_components >= 4) {
struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
d->HoleFlag = 1;
d->OutputBufferSlot = decl_buffer_slot;
d->ComponentMask = 0xf;
skip_components -= 4;
}
if (skip_components > 0) {
struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
d->HoleFlag = 1;
d->OutputBufferSlot = decl_buffer_slot;
d->ComponentMask = (1 << skip_components) - 1;
}
assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
next_offset[buffer] += components;
so_decl[stream_id][decls[stream_id]++] = decl;
if (decls[stream_id] > max_decls)
max_decls = decls[stream_id];
}
uint32_t *dw;
dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
.StreamtoBufferSelects0 = buffer_mask[0],
.StreamtoBufferSelects1 = buffer_mask[1],
.StreamtoBufferSelects2 = buffer_mask[2],
.StreamtoBufferSelects3 = buffer_mask[3],
.NumEntries0 = decls[0],
.NumEntries1 = decls[1],
.NumEntries2 = decls[2],
.NumEntries3 = decls[3]);
for (int i = 0; i < max_decls; i++) {
GENX(SO_DECL_ENTRY_pack)(
brw, dw + 2 + i * 2,
&(struct GENX(SO_DECL_ENTRY)) {
.Stream0Decl = so_decl[0][i],
.Stream1Decl = so_decl[1][i],
.Stream2Decl = so_decl[2][i],
.Stream3Decl = so_decl[3][i],
});
}
}
static void
genX(upload_3dstate_so_buffers)(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
#if GEN_GEN < 8
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
#else
struct brw_transform_feedback_object *brw_obj =
(struct brw_transform_feedback_object *) xfb_obj;
uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
#endif
/* Set up the up to 4 output buffers. These are the ranges defined in the
* gl_transform_feedback_object.
*/
for (int i = 0; i < 4; i++) {
struct intel_buffer_object *bufferobj =
intel_buffer_object(xfb_obj->Buffers[i]);
if (!bufferobj) {
brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = i;
}
continue;
}
uint32_t start = xfb_obj->Offset[i];
assert(start % 4 == 0);
uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
struct brw_bo *bo =
intel_bufferobj_buffer(brw, bufferobj, start, end - start);
assert(end <= bo->size);
brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = i;
sob.SurfaceBaseAddress = render_bo(bo, start);
#if GEN_GEN < 8
sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
sob.SurfaceEndAddress = render_bo(bo, end);
#else
sob.SOBufferEnable = true;
sob.StreamOffsetWriteEnable = true;
sob.StreamOutputBufferOffsetAddressEnable = true;
sob.SOBufferMOCS = mocs_wb;
sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
sob.StreamOutputBufferOffsetAddress =
instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
if (brw_obj->zero_offsets) {
/* Zero out the offset and write that to offset_bo */
sob.StreamOffset = 0;
} else {
/* Use offset_bo as the "Stream Offset." */
sob.StreamOffset = 0xFFFFFFFF;
}
#endif
}
}
#if GEN_GEN >= 8
brw_obj->zero_offsets = false;
#endif
}
static inline bool
query_active(struct gl_query_object *q)
{
return q && q->Active;
}
static void
genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
const struct brw_vue_map *vue_map)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
struct gl_transform_feedback_object *xfb_obj =
ctx->TransformFeedback.CurrentObject;
brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
if (active) {
int urb_entry_read_offset = 0;
int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
urb_entry_read_offset;
sos.SOFunctionEnable = true;
sos.SOStatisticsEnable = true;
/* BRW_NEW_RASTERIZER_DISCARD */
if (ctx->RasterDiscard) {
if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
sos.RenderingDisable = true;
} else {
perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
"query active relies on the clipper.");
}
}
/* _NEW_LIGHT */
if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
sos.ReorderMode = TRAILING;
#if GEN_GEN < 8
sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
#else
const struct gl_transform_feedback_info *linked_xfb_info =
xfb_obj->program->sh.LinkedTransformFeedback;
/* Set buffer pitches; 0 means unbound. */
if (xfb_obj->Buffers[0])
sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
if (xfb_obj->Buffers[1])
sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
if (xfb_obj->Buffers[2])
sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
if (xfb_obj->Buffers[3])
sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
#endif
/* We always read the whole vertex. This could be reduced at some
* point by reading less and offsetting the register index in the
* SO_DECLs.
*/
sos.Stream0VertexReadOffset = urb_entry_read_offset;
sos.Stream0VertexReadLength = urb_entry_read_length - 1;
sos.Stream1VertexReadOffset = urb_entry_read_offset;
sos.Stream1VertexReadLength = urb_entry_read_length - 1;
sos.Stream2VertexReadOffset = urb_entry_read_offset;
sos.Stream2VertexReadLength = urb_entry_read_length - 1;
sos.Stream3VertexReadOffset = urb_entry_read_offset;
sos.Stream3VertexReadLength = urb_entry_read_length - 1;
}
}
}
static void
genX(upload_sol)(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_TRANSFORM_FEEDBACK */
bool active = _mesa_is_xfb_active_and_unpaused(ctx);
if (active) {
genX(upload_3dstate_so_buffers)(brw);
/* BRW_NEW_VUE_MAP_GEOM_OUT */
genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
}
/* Finally, set up the SOL stage. This command must always follow updates to
* the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
* MMIO register updates (currently performed by the kernel at each batch
* emit).
*/
genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
}
static const struct brw_tracked_state genX(sol_state) = {
.dirty = {
.mesa = _NEW_LIGHT,
.brw = BRW_NEW_BATCH |
BRW_NEW_BLORP |
BRW_NEW_RASTERIZER_DISCARD |
BRW_NEW_VUE_MAP_GEOM_OUT |
BRW_NEW_TRANSFORM_FEEDBACK,
},
.emit = genX(upload_sol),
};
#endif
/* ---------------------------------------------------------------------- */
@@ -1178,7 +1513,7 @@ genX(init_atoms)(struct brw_context *brw)
&gen7_te_state,
&gen7_ds_state,
&gen7_gs_state,
&gen7_sol_state,
&genX(sol_state),
&genX(clip_state),
&genX(sbe_state),
&genX(sf_state),
@@ -1265,7 +1600,7 @@ genX(init_atoms)(struct brw_context *brw)
&gen7_te_state,
&gen8_ds_state,
&gen8_gs_state,
&gen7_sol_state,
&genX(sol_state),
&genX(clip_state),
&genX(raster_state),
&genX(sbe_state),