intel/compiler/mesh: compactify MUE layout

Instead of using 4 dwords for each output slot, use only the amount
of memory actually needed by each variable.

There are some complications from this "obvious" idea:
- flat and non-flat variables can't be merged into the same vec4 slot,
  because the flat-inputs mask operates at a vec4 stride
- multi-slot variables can have different layout:
   float[N] requires N 1-dword slots, but
   i64vec3 requires 1 fully occupied 4-dword slot followed by 2-dword slot
- some output variables occur both in single-channel/component split
  and combined variants
- crossing vec4 boundary requires generating more writes, so avoiding them
  if possible is beneficial

This patch fixes some issues with arrays in per-vertex and per-primitive data
(func.mesh.ext.outputs.*.indirect_array.q0 in crucible)
and, by reducing the size of a single MUE, it allows more threads to be
spawned at the same time.

Note: this patch doesn't improve vk_meshlet_cadscene performance because
default layout is already optimal enough.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
This commit is contained in:
Marcin Ślusarz
2022-12-21 15:40:07 +01:00
committed by Marge Bot
parent fb765a65c8
commit a252123363
8 changed files with 478 additions and 118 deletions

View File

@@ -1022,6 +1022,7 @@ struct brw_wm_prog_data {
* For varying slots that are not used by the FS, the value is -1.
*/
int urb_setup[VARYING_SLOT_MAX];
int urb_setup_channel[VARYING_SLOT_MAX];
/**
* Cache structure into the urb_setup array above that contains the
@@ -1625,6 +1626,7 @@ struct brw_tue_map {
struct brw_mue_map {
int32_t start_dw[VARYING_SLOT_MAX];
uint32_t len_dw[VARYING_SLOT_MAX];
uint32_t per_primitive_indices_dw;
uint32_t size_dw;

View File

@@ -1764,10 +1764,10 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const nir_shader *nir,
const struct brw_mue_map *mue_map)
{
memset(prog_data->urb_setup, -1,
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
int urb_next = 0;
int urb_next = 0; /* in vec4s */
const uint64_t inputs_read =
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
@@ -1782,6 +1782,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
uint64_t per_prim_inputs_read =
nir->info.inputs_read & nir->info.per_primitive_inputs;
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
* are always at the beginning, because they come from MUE
* Primitive Header, not Per-Primitive Attributes.
@@ -1789,8 +1792,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
VARYING_BIT_LAYER |
VARYING_BIT_PRIMITIVE_SHADING_RATE;
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
if (per_prim_inputs_read & primitive_header_bits) {
if (reads_header) {
/* Primitive Shading Rate, Layer and Viewport live in the same
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
* is dword 2).
@@ -1804,23 +1808,30 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
/* 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputRead[Offset|Length]
* are in full GRFs (8 dwords) and MUE Primitive Header is 8 dwords,
* so next per-primitive attribute must be placed in slot 2 (each slot
* is 4 dwords long).
*/
urb_next = 2;
per_prim_inputs_read &= ~primitive_header_bits;
} else {
/* If fs doesn't need primitive header, then it won't be made
* available through SBE_MESH, so we have to skip them when
* calculating offset from start of per-prim data.
*/
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
}
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
u_foreach_bit64(i, per_prim_inputs_read) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_prim_start_dw);
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
/* The actual setup attributes later must be aligned to a full GRF. */
urb_next = ALIGN(urb_next, 2);
urb_next = per_prim_size_dw / 4;
prog_data->num_per_primitive_inputs = urb_next;
}
@@ -1835,21 +1846,43 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
unique_fs_attrs &= ~clip_dist_bits;
}
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
/* Per-Vertex header is never available to fragment shader. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
/* In Mesh, CLIP_DIST slots are always at the beginning, because
* they come from MUE Vertex Header, not Per-Vertex Attributes.
*/
if (inputs_read & clip_dist_bits) {
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
} else if (mue_map->per_vertex_header_size_dw > 8) {
/* Clip distances are in MUE, but we are not reading them in FS. */
per_vertex_start_dw += 8;
per_vertex_size_dw -= 8;
}
/* Per-Vertex attributes are laid out in order. Because we always link
 * Mesh and Fragment shaders, the slots written and read by each of
 * them will match. */
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (unique_fs_attrs & BITFIELD64_BIT(i))
prog_data->urb_setup[i] = urb_next++;
u_foreach_bit64(i, unique_fs_attrs) {
int start = mue_map->start_dw[i];
assert(start >= 0);
assert(mue_map->len_dw[i] > 0);
assert(unsigned(start) >= per_vertex_start_dw);
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
prog_data->urb_setup_channel[i] = pos_dw % 4;
}
urb_next += per_vertex_size_dw / 4;
} else if (devinfo->ver >= 6) {
uint64_t vue_header_bits =
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;

View File

@@ -438,7 +438,7 @@ public:
fs_reg get_timestamp(const brw::fs_builder &bld);
fs_reg interp_reg(int location, int channel);
fs_reg per_primitive_reg(int location);
fs_reg per_primitive_reg(int location, unsigned comp);
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
virtual void dump_instructions_to_file(FILE *file) const;

View File

@@ -3489,7 +3489,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(component(per_primitive_reg(base), comp + i), dest.type));
retype(per_primitive_reg(base, comp + i), dest.type));
}
} else {
for (unsigned int i = 0; i < num_components; i++) {

View File

@@ -126,6 +126,7 @@ fs_visitor::interp_reg(int location, int channel)
assert(prog_data->urb_setup[location] >= 0);
unsigned nr = prog_data->urb_setup[location];
channel += prog_data->urb_setup_channel[location];
/* Adjust so we start counting from the first per_vertex input. */
assert(nr >= prog_data->num_per_primitive_inputs);
@@ -142,19 +143,22 @@ fs_visitor::interp_reg(int location, int channel)
* generate_code() time.
*/
fs_reg
fs_visitor::per_primitive_reg(int location)
fs_visitor::per_primitive_reg(int location, unsigned comp)
{
assert(stage == MESA_SHADER_FRAGMENT);
assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
comp += prog_data->urb_setup_channel[location];
assert(prog_data->urb_setup[location] >= 0);
const unsigned regnr = prog_data->urb_setup[location];
const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
assert(regnr < prog_data->num_per_primitive_inputs);
return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
}
/** Emits the interpolation for the varying inputs. */

View File

@@ -21,6 +21,8 @@
* IN THE SOFTWARE.
*/
#include <list>
#include <vector>
#include "brw_compiler.h"
#include "brw_fs.h"
#include "brw_nir.h"
@@ -414,6 +416,224 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
nir_address_format_32bit_offset);
}
/* Attribute types. Flat attributes have to be a separate class because
* flat and interpolated attributes can't share the same vec4 slot
* (see 3DSTATE_SBE.ConstantInterpolationEnable).
*/
enum {
PRIM, /* per primitive */
VERT, /* per vertex interpolated */
VERT_FLAT, /* per vertex flat */
};
struct attr_desc {
int location;
const struct glsl_type *type;
unsigned dwords;
unsigned slots;
};
struct attr_type_info {
/* order of attributes, negative values are holes */
std::list<struct attr_desc> *order;
/* attributes after which there's hole of size equal to array index */
std::list<int> holes[4];
};
/* Record the MUE position of one attribute: fill in start_dw/len_dw for
 * each varying slot the attribute covers, starting at start_dw.
 *
 * Arrays use equally sized slots (dwords evenly divided across slots);
 * non-array multi-slot variables fill full vec4 (4-dword) slots with a
 * possibly shorter tail slot.
 */
static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   const bool arrayed = glsl_type_is_array(attr->type);
   const int base_loc = attr->location;
   unsigned dwords_left = attr->dwords;
   unsigned offset_dw = start_dw;

   for (unsigned s = 0; s < attr->slots; ++s) {
      unsigned slot_dw;

      if (arrayed) {
         /* Every array slot is the same size. */
         assert(attr->dwords % attr->slots == 0);
         slot_dw = attr->dwords / attr->slots;
      } else {
         /* Full vec4 slots first, then whatever remains. */
         slot_dw = MIN2(dwords_left, 4);
      }

      map->start_dw[base_loc + s] = offset_dw;
      map->len_dw[base_loc + s] = slot_dw;

      offset_dw += slot_dw;
      dwords_left -= slot_dw;
   }
}
/* Return the widest (most dwords) shader variable at the given location,
 * or NULL if none exists. Outputs may appear both as split (per-component)
 * and combined variables at the same location; the widest one describes
 * the full storage the location needs.
 */
static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *widest = NULL;
   unsigned widest_dwords = 0;

   nir_foreach_variable_with_modes(var, shader, mode) {
      if (var->data.location != location)
         continue;

      const unsigned dwords = glsl_count_dword_slots(var->type, false);
      if (dwords > widest_dwords) {
         widest = var;
         widest_dwords = dwords;
      }
   }

   return widest;
}
/* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 *
 * Greedy first-fit packing: attributes whose size is not a multiple of
 * 4 dwords leave a "hole" to the next vec4 boundary; later attributes
 * small enough to fit are placed into the smallest available hole
 * instead of growing the layout. Three independent layouts are built
 * (per-primitive, per-vertex interpolated, per-vertex flat) because
 * those classes can't share vec4 slots.
 */
static void
brw_compute_mue_layout(std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   /* BRW_MESH_COMPACTION=false disables hole reuse: every attribute is
    * padded to a vec4 boundary (the old, sparser layout). */
   bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   u_foreach_bit64(location, outputs_written) {
      /* Multi-slot variables clear the bits of their trailing slots from
       * outputs_written below; skip locations already consumed by an
       * earlier variable. */
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);

      struct attr_desc d;
      d.location = location;
      d.type = brw_nir_get_var_type(nir, var);
      d.dwords = glsl_count_dword_slots(d.type, false);
      d.slots = glsl_count_attribute_slots(d.type, false);

      /* Pick the layout class; flat and interpolated per-vertex
       * attributes must not share a vec4 slot. */
      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      /* Mark all slots of this variable as consumed. */
      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* Attributes ending on a vec4 boundary leave no hole. */
      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

      /* Describe the padding to the next vec4 boundary as a hole
       * (location < 0 marks a hole in the order list). */
      struct attr_desc h;
      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (no_compact) {
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         /* Too big to fit into any hole; append and remember the new
          * trailing hole for later, smaller attributes. */
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz < 4; sz++) {
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(found < 4);
      assert(!holes[found].empty());

      /* holes[found] stores the locations of the attributes the holes
       * follow; take the most recent one. */
      int after_loc = holes[found].back();
      holes[found].pop_back();

      bool inserted_back = false;

      for (auto it = order->begin(); it != order->end(); ++it) {
         if ((*it).location != after_loc)
            continue;

         /* The hole immediately follows the attribute it was recorded
          * against. */
         ++it;
         /* must be a hole */
         assert((*it).location < 0);
         /* and it must be big enough */
         assert(d.dwords <= (*it).dwords);

         if (d.dwords == (*it).dwords) {
            /* exact size, just replace */
            *it = d;
         } else {
            /* inexact size, shrink hole */
            (*it).dwords -= d.dwords;
            /* and insert new attribute before it */
            order->insert(it, d);

            /* Insert shrunk hole in a spot so that the order of attributes
             * is preserved. The hole now follows the newly inserted
             * attribute, so it is keyed by "location".
             */
            std::list<int> &hole_list = holes[(*it).dwords];
            std::list<int>::iterator insert_before = hole_list.end();
            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
               if ((*it2) >= (int)location) {
                  insert_before = it2;
                  break;
               }
            }
            hole_list.insert(insert_before, location);
         }

         inserted_back = true;
         break;
      }

      assert(inserted_back);
   }
}
/* Mesh URB Entry consists of an initial section
*
* - Primitive Count
@@ -443,8 +663,8 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
{
memset(map, 0, sizeof(*map));
for (int i = 0; i < VARYING_SLOT_MAX; i++)
map->start_dw[i] = -1;
memset(&map->start_dw[0], -1, sizeof(map->start_dw));
memset(&map->len_dw[0], 0, sizeof(map->len_dw));
unsigned vertices_per_primitive =
num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
@@ -454,16 +674,6 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
uint64_t outputs_written = nir->info.outputs_written;
/* Assign initial section. */
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
}
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
}
/* One dword for primitives count then K extra dwords for each primitive. */
switch (index_format) {
case BRW_INDEX_FORMAT_U32:
@@ -479,86 +689,157 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
map->max_primitives + 1, 8);
/* TODO(mesh): Multiview. */
map->per_primitive_header_size_dw =
(nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
/* Assign initial section. */
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
}
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
map->per_primitive_indices_dw * map->max_primitives;
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
}
map->per_primitive_data_size_dw = 0;
u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
assert(map->start_dw[location] == -1);
const uint64_t per_primitive_header_bits =
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
BITFIELD64_BIT(VARYING_SLOT_LAYER) |
BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
unsigned start;
switch (location) {
case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
start = map->per_primitive_start_dw + 0;
break;
case VARYING_SLOT_LAYER:
start = map->per_primitive_start_dw + 1; /* RTAIndex */
break;
case VARYING_SLOT_VIEWPORT:
start = map->per_primitive_start_dw + 2;
break;
case VARYING_SLOT_CULL_PRIMITIVE:
start = map->per_primitive_start_dw + 3;
break;
default:
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
location >= VARYING_SLOT_VAR0);
start = map->per_primitive_start_dw +
map->per_primitive_header_size_dw +
map->per_primitive_data_size_dw;
map->per_primitive_data_size_dw += 4;
break;
const uint64_t per_vertex_header_bits =
BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
BITFIELD64_BIT(VARYING_SLOT_POS) |
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
std::list<struct attr_desc> orders[3];
uint64_t regular_outputs = outputs_written &
~(per_primitive_header_bits | per_vertex_header_bits);
brw_compute_mue_layout(orders, regular_outputs, nir);
if (outputs_written & per_primitive_header_bits) {
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
map->per_primitive_start_dw + 0;
map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
}
map->start_dw[location] = start;
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
map->start_dw[VARYING_SLOT_LAYER] =
map->per_primitive_start_dw + 1; /* RTAIndex */
map->len_dw[VARYING_SLOT_LAYER] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
map->start_dw[VARYING_SLOT_VIEWPORT] =
map->per_primitive_start_dw + 2;
map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
map->per_primitive_start_dw + 3;
map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
}
map->per_primitive_header_size_dw = 8;
outputs_written &= ~per_primitive_header_bits;
} else {
map->per_primitive_header_size_dw = 0;
}
map->per_primitive_data_size_dw = 0;
unsigned start_dw = map->per_primitive_start_dw +
map->per_primitive_header_size_dw;
for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
continue;
}
assert(map->start_dw[location] == -1);
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
location >= VARYING_SLOT_VAR0);
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_primitive_data_size_dw += (*it).dwords;
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
map->per_primitive_data_size_dw, 8);
map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
map->per_primitive_pitch_dw * map->max_primitives, 8);
map->per_primitive_pitch_dw *
map->max_primitives, 8);
/* TODO(mesh): Multiview. */
unsigned fixed_header_size = 8;
map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
nir->info.clip_distance_array_size +
nir->info.cull_distance_array_size, 8);
map->per_vertex_data_size_dw = 0;
u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
assert(map->start_dw[location] == -1);
unsigned start;
switch (location) {
case VARYING_SLOT_PSIZ:
start = map->per_vertex_start_dw + 3;
break;
case VARYING_SLOT_POS:
start = map->per_vertex_start_dw + 4;
break;
case VARYING_SLOT_CLIP_DIST0:
start = map->per_vertex_start_dw + fixed_header_size + 0;
break;
case VARYING_SLOT_CLIP_DIST1:
start = map->per_vertex_start_dw + fixed_header_size + 4;
break;
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_CULL_DIST1:
unreachable("cull distances should be lowered earlier");
break;
default:
assert(location >= VARYING_SLOT_VAR0);
start = map->per_vertex_start_dw +
map->per_vertex_header_size_dw +
map->per_vertex_data_size_dw;
map->per_vertex_data_size_dw += 4;
break;
if (outputs_written & per_vertex_header_bits) {
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
map->len_dw[VARYING_SLOT_PSIZ] = 1;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
map->len_dw[VARYING_SLOT_POS] = 4;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
map->start_dw[VARYING_SLOT_CLIP_DIST0] =
map->per_vertex_start_dw + fixed_header_size + 0;
map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
}
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
map->start_dw[VARYING_SLOT_CLIP_DIST1] =
map->per_vertex_start_dw + fixed_header_size + 4;
map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
}
outputs_written &= ~per_vertex_header_bits;
}
/* cull distances should be lowered earlier */
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
map->per_vertex_data_size_dw = 0;
start_dw = map->per_vertex_start_dw +
map->per_vertex_header_size_dw;
for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
int location = (*it).location;
if (location < 0) {
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
continue;
}
assert(map->start_dw[location] == -1);
assert(location >= VARYING_SLOT_VAR0);
brw_mue_assign_position(&*it, map, start_dw);
start_dw += (*it).dwords;
map->per_vertex_data_size_dw += (*it).dwords;
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
}
map->start_dw[location] = start;
}
map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
@@ -571,14 +852,18 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
}
static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
map->size_dw, map->max_primitives, map->max_vertices);
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
map->per_primitive_start_dw,
@@ -589,13 +874,20 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (map->start_dw[i] < 0)
continue;
const unsigned offset = map->start_dw[i];
if (offset >= map->per_primitive_start_dw &&
offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
fprintf(fp, " %4d: %s\n", offset,
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH));
}
const unsigned len = map->len_dw[i];
if (offset < map->per_primitive_start_dw ||
offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
continue;
const char *name =
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH);
fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
name, i);
}
fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
@@ -607,13 +899,24 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (map->start_dw[i] < 0)
continue;
const unsigned offset = map->start_dw[i];
if (offset >= map->per_vertex_start_dw &&
offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
fprintf(fp, " %4d: %s\n", offset,
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH));
}
const unsigned len = map->len_dw[i];
if (offset < map->per_vertex_start_dw ||
offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
continue;
nir_variable *var =
nir_find_variable_with_location(nir, nir_var_shader_out, i);
bool flat = var->data.interpolation == INTERP_MODE_FLAT;
const char *name =
gl_varying_slot_name_for_stage((gl_varying_slot)i,
MESA_SHADER_MESH);
fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
name, i, flat ? " (flat)" : "");
}
fprintf(fp, "\n");
@@ -1070,7 +1373,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
brw_print_tue_map(stderr, params->tue_map);
}
fprintf(stderr, "Mesh Output ");
brw_print_mue_map(stderr, &prog_data->map);
brw_print_mue_map(stderr, &prog_data->map, nir);
}
fs_generator g(compiler, &params->base, &prog_data->base.base,

View File

@@ -2085,6 +2085,21 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
return sysval;
}
/* Return the effective GLSL type of a shader variable: interface_type
 * when set, otherwise the variable's own type with the outer array
 * stripped for arrayed I/O or per-view variables.
 */
const struct glsl_type *
brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
{
   /* If the variable carries an interface type, that takes precedence. */
   if (var->interface_type)
      return var->interface_type;

   const struct glsl_type *t = var->type;

   /* Arrayed I/O and per-view variables wrap the real type in an outer
    * array; peel it off. */
   if (nir_is_arrayed_io(var, nir->info.stage) || var->data.per_view) {
      assert(glsl_type_is_array(t));
      t = glsl_get_array_element(t);
   }

   return t;
}
bool
brw_nir_pulls_at_sample(nir_shader *shader)
{

View File

@@ -283,6 +283,9 @@ nir_ssa_def *brw_nir_load_global_const(nir_builder *b,
nir_ssa_def *base_addr,
unsigned off);
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
nir_variable *var);
#ifdef __cplusplus
}
#endif