intel/compiler/mesh: compactify MUE layout
Instead of using 4 dwords for each output slot, use only the amount of memory actually needed by each variable.

There are some complications from this "obvious" idea:
- flat and non-flat variables can't be merged into the same vec4 slot, because the flat inputs mask has vec4 stride
- multi-slot variables can have different layouts: float[N] requires N 1-dword slots, but i64vec3 requires 1 fully occupied 4-dword slot followed by a 2-dword slot
- some output variables occur both in single-channel/component split and combined variants
- crossing a vec4 boundary requires generating more writes, so avoiding it when possible is beneficial

This patch fixes some issues with arrays in per-vertex and per-primitive data (func.mesh.ext.outputs.*.indirect_array.q0 in crucible) and, by reducing the size of a single MUE, it allows spawning more threads at the same time.

Note: this patch doesn't improve vk_meshlet_cadscene performance because the default layout is already optimal enough.

Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20407>
This commit is contained in:

committed by
Marge Bot

parent
fb765a65c8
commit
a252123363
@@ -1022,6 +1022,7 @@ struct brw_wm_prog_data {
|
||||
* For varying slots that are not used by the FS, the value is -1.
|
||||
*/
|
||||
int urb_setup[VARYING_SLOT_MAX];
|
||||
int urb_setup_channel[VARYING_SLOT_MAX];
|
||||
|
||||
/**
|
||||
* Cache structure into the urb_setup array above that contains the
|
||||
@@ -1625,6 +1626,7 @@ struct brw_tue_map {
|
||||
|
||||
struct brw_mue_map {
|
||||
int32_t start_dw[VARYING_SLOT_MAX];
|
||||
uint32_t len_dw[VARYING_SLOT_MAX];
|
||||
uint32_t per_primitive_indices_dw;
|
||||
|
||||
uint32_t size_dw;
|
||||
|
@@ -1764,10 +1764,10 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
|
||||
const nir_shader *nir,
|
||||
const struct brw_mue_map *mue_map)
|
||||
{
|
||||
memset(prog_data->urb_setup, -1,
|
||||
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
|
||||
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
|
||||
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
|
||||
|
||||
int urb_next = 0;
|
||||
int urb_next = 0; /* in vec4s */
|
||||
|
||||
const uint64_t inputs_read =
|
||||
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
|
||||
@@ -1782,6 +1782,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
|
||||
uint64_t per_prim_inputs_read =
|
||||
nir->info.inputs_read & nir->info.per_primitive_inputs;
|
||||
|
||||
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
|
||||
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
|
||||
|
||||
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
|
||||
* are always at the beginning, because they come from MUE
|
||||
* Primitive Header, not Per-Primitive Attributes.
|
||||
@@ -1789,8 +1792,9 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
|
||||
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
|
||||
VARYING_BIT_LAYER |
|
||||
VARYING_BIT_PRIMITIVE_SHADING_RATE;
|
||||
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
|
||||
|
||||
if (per_prim_inputs_read & primitive_header_bits) {
|
||||
if (reads_header) {
|
||||
/* Primitive Shading Rate, Layer and Viewport live in the same
|
||||
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
|
||||
* is dword 2).
|
||||
@@ -1804,23 +1808,30 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
|
||||
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
|
||||
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
|
||||
|
||||
/* 3DSTATE_SBE_MESH.Per[Primitive|Vertex]URBEntryOutputRead[Offset|Length]
|
||||
* are in full GRFs (8 dwords) and MUE Primitive Header is 8 dwords,
|
||||
* so next per-primitive attribute must be placed in slot 2 (each slot
|
||||
* is 4 dwords long).
|
||||
*/
|
||||
urb_next = 2;
|
||||
per_prim_inputs_read &= ~primitive_header_bits;
|
||||
} else {
|
||||
/* If fs doesn't need primitive header, then it won't be made
|
||||
* available through SBE_MESH, so we have to skip them when
|
||||
* calculating offset from start of per-prim data.
|
||||
*/
|
||||
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
|
||||
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
|
||||
prog_data->urb_setup[i] = urb_next++;
|
||||
}
|
||||
u_foreach_bit64(i, per_prim_inputs_read) {
|
||||
int start = mue_map->start_dw[i];
|
||||
|
||||
assert(start >= 0);
|
||||
assert(mue_map->len_dw[i] > 0);
|
||||
|
||||
assert(unsigned(start) >= per_prim_start_dw);
|
||||
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
|
||||
|
||||
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
||||
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
||||
}
|
||||
|
||||
/* The actual setup attributes later must be aligned to a full GRF. */
|
||||
urb_next = ALIGN(urb_next, 2);
|
||||
urb_next = per_prim_size_dw / 4;
|
||||
|
||||
prog_data->num_per_primitive_inputs = urb_next;
|
||||
}
|
||||
@@ -1835,21 +1846,43 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
|
||||
unique_fs_attrs &= ~clip_dist_bits;
|
||||
}
|
||||
|
||||
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
|
||||
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
|
||||
|
||||
/* Per-Vertex header is never available to fragment shader. */
|
||||
per_vertex_start_dw += 8;
|
||||
per_vertex_size_dw -= 8;
|
||||
|
||||
/* In Mesh, CLIP_DIST slots are always at the beginning, because
|
||||
* they come from MUE Vertex Header, not Per-Vertex Attributes.
|
||||
*/
|
||||
if (inputs_read & clip_dist_bits) {
|
||||
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
|
||||
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
|
||||
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
|
||||
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
|
||||
} else if (mue_map->per_vertex_header_size_dw > 8) {
|
||||
/* Clip distances are in MUE, but we are not reading them in FS. */
|
||||
per_vertex_start_dw += 8;
|
||||
per_vertex_size_dw -= 8;
|
||||
}
|
||||
|
||||
/* Per-Vertex attributes are laid out ordered. Because we always link
|
||||
* Mesh and Fragment shaders, the slots which are written and read by
|
||||
* each of them will match. */
|
||||
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (unique_fs_attrs & BITFIELD64_BIT(i))
|
||||
prog_data->urb_setup[i] = urb_next++;
|
||||
|
||||
u_foreach_bit64(i, unique_fs_attrs) {
|
||||
int start = mue_map->start_dw[i];
|
||||
|
||||
assert(start >= 0);
|
||||
assert(mue_map->len_dw[i] > 0);
|
||||
|
||||
assert(unsigned(start) >= per_vertex_start_dw);
|
||||
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
|
||||
|
||||
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
||||
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
||||
}
|
||||
|
||||
urb_next += per_vertex_size_dw / 4;
|
||||
} else if (devinfo->ver >= 6) {
|
||||
uint64_t vue_header_bits =
|
||||
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
|
||||
|
@@ -438,7 +438,7 @@ public:
|
||||
fs_reg get_timestamp(const brw::fs_builder &bld);
|
||||
|
||||
fs_reg interp_reg(int location, int channel);
|
||||
fs_reg per_primitive_reg(int location);
|
||||
fs_reg per_primitive_reg(int location, unsigned comp);
|
||||
|
||||
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
|
||||
virtual void dump_instructions_to_file(FILE *file) const;
|
||||
|
@@ -3489,7 +3489,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
|
||||
assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
|
||||
for (unsigned int i = 0; i < num_components; i++) {
|
||||
bld.MOV(offset(dest, bld, i),
|
||||
retype(component(per_primitive_reg(base), comp + i), dest.type));
|
||||
retype(per_primitive_reg(base, comp + i), dest.type));
|
||||
}
|
||||
} else {
|
||||
for (unsigned int i = 0; i < num_components; i++) {
|
||||
|
@@ -126,6 +126,7 @@ fs_visitor::interp_reg(int location, int channel)
|
||||
|
||||
assert(prog_data->urb_setup[location] >= 0);
|
||||
unsigned nr = prog_data->urb_setup[location];
|
||||
channel += prog_data->urb_setup_channel[location];
|
||||
|
||||
/* Adjust so we start counting from the first per_vertex input. */
|
||||
assert(nr >= prog_data->num_per_primitive_inputs);
|
||||
@@ -142,19 +143,22 @@ fs_visitor::interp_reg(int location, int channel)
|
||||
* generate_code() time.
|
||||
*/
|
||||
fs_reg
|
||||
fs_visitor::per_primitive_reg(int location)
|
||||
fs_visitor::per_primitive_reg(int location, unsigned comp)
|
||||
{
|
||||
assert(stage == MESA_SHADER_FRAGMENT);
|
||||
assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
|
||||
|
||||
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
|
||||
|
||||
comp += prog_data->urb_setup_channel[location];
|
||||
|
||||
assert(prog_data->urb_setup[location] >= 0);
|
||||
|
||||
const unsigned regnr = prog_data->urb_setup[location];
|
||||
const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
|
||||
|
||||
assert(regnr < prog_data->num_per_primitive_inputs);
|
||||
|
||||
return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
|
||||
return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
|
||||
}
|
||||
|
||||
/** Emits the interpolation for the varying inputs. */
|
||||
|
@@ -21,6 +21,8 @@
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_nir.h"
|
||||
@@ -414,6 +416,224 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
|
||||
nir_address_format_32bit_offset);
|
||||
}
|
||||
|
||||
/* Classes of mesh output attributes; the values index the per-class
 * ordering arrays.  Flat per-vertex attributes are kept in a class of their
 * own because flat and interpolated attributes can't share the same vec4
 * slot (see 3DSTATE_SBE.ConstantInterpolationEnable).
 */
enum {
   PRIM      = 0, /* per primitive */
   VERT      = 1, /* per vertex interpolated */
   VERT_FLAT = 2, /* per vertex flat */
};
|
||||
|
||||
/* One entry of an attribute ordering: either a real output attribute or a
 * padding hole.  Holes are encoded with a negative location, a NULL type and
 * zero slots; only their dwords field (the hole size) is meaningful.
 */
struct attr_desc {
   /* Varying slot of the attribute, or a negative value for a hole. */
   int location;

   /* GLSL type of the complete variable; NULL for a hole. */
   const struct glsl_type *type;

   /* Size in dwords of the attribute (or of the hole). */
   unsigned dwords;

   /* Number of consecutive varying slots occupied; 0 for a hole. */
   unsigned slots;
};
|
||||
|
||||
/* Layout bookkeeping for a single attribute class. */
struct attr_type_info {
   /* Ordered sequence of attributes and holes for this class; entries with
    * a negative location are holes.
    */
   std::list<struct attr_desc> *order;

   /* holes[n] lists the locations of attributes which are immediately
    * followed by a hole of n dwords.
    */
   std::list<int> holes[4];
};
|
||||
|
||||
static void
|
||||
brw_mue_assign_position(const struct attr_desc *attr,
|
||||
struct brw_mue_map *map,
|
||||
unsigned start_dw)
|
||||
{
|
||||
bool is_array = glsl_type_is_array(attr->type);
|
||||
int location = attr->location;
|
||||
unsigned remaining = attr->dwords;
|
||||
|
||||
for (unsigned slot = 0; slot < attr->slots; ++slot) {
|
||||
map->start_dw[location + slot] = start_dw;
|
||||
|
||||
unsigned sz;
|
||||
|
||||
if (is_array) {
|
||||
assert(attr->dwords % attr->slots == 0);
|
||||
sz = attr->dwords / attr->slots;
|
||||
} else {
|
||||
sz = MIN2(remaining, 4);
|
||||
}
|
||||
|
||||
map->len_dw[location + slot] = sz;
|
||||
start_dw += sz;
|
||||
remaining -= sz;
|
||||
}
|
||||
}
|
||||
|
||||
/* Among the variables of the given mode(s) located at location, returns the
 * one whose type occupies the most dwords.  On a tie the first one visited
 * wins.  Returns NULL when no variable with a non-zero size is found there.
 */
static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *widest = NULL;
   unsigned widest_dw = 0;

   nir_foreach_variable_with_modes(candidate, shader, mode) {
      if (candidate->data.location == location) {
         const unsigned dw = glsl_count_dword_slots(candidate->type, false);

         if (dw > widest_dw) {
            widest_dw = dw;
            widest = candidate;
         }
      }
   }

   return widest;
}
|
||||
|
||||
/* Finds order of outputs which require minimum size, without splitting
|
||||
* of URB read/write messages (which operate on vec4-aligned memory).
|
||||
*/
|
||||
static void
|
||||
brw_compute_mue_layout(std::list<struct attr_desc> *orders,
|
||||
uint64_t outputs_written,
|
||||
struct nir_shader *nir)
|
||||
{
|
||||
const struct shader_info *info = &nir->info;
|
||||
|
||||
struct attr_type_info data[3];
|
||||
bool no_compact = !debug_get_bool_option("BRW_MESH_COMPACTION", true);
|
||||
|
||||
for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
|
||||
data[i].order = &orders[i];
|
||||
|
||||
u_foreach_bit64(location, outputs_written) {
|
||||
if ((BITFIELD64_BIT(location) & outputs_written) == 0)
|
||||
continue;
|
||||
|
||||
/* At this point there are both complete and split variables as
|
||||
* outputs. We need the complete variable to compute the required
|
||||
* size.
|
||||
*/
|
||||
nir_variable *var =
|
||||
brw_nir_find_complete_variable_with_location(nir,
|
||||
nir_var_shader_out,
|
||||
location);
|
||||
|
||||
struct attr_desc d;
|
||||
d.location = location;
|
||||
d.type = brw_nir_get_var_type(nir, var);
|
||||
d.dwords = glsl_count_dword_slots(d.type, false);
|
||||
d.slots = glsl_count_attribute_slots(d.type, false);
|
||||
|
||||
struct attr_type_info *type_data;
|
||||
|
||||
if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
|
||||
type_data = &data[PRIM];
|
||||
else if (var->data.interpolation == INTERP_MODE_FLAT)
|
||||
type_data = &data[VERT_FLAT];
|
||||
else
|
||||
type_data = &data[VERT];
|
||||
|
||||
std::list<struct attr_desc> *order = type_data->order;
|
||||
std::list<int> *holes = type_data->holes;
|
||||
|
||||
outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
|
||||
|
||||
int mod = d.dwords % 4;
|
||||
if (mod == 0) {
|
||||
order->push_back(d);
|
||||
continue;
|
||||
}
|
||||
|
||||
struct attr_desc h;
|
||||
h.location = -1;
|
||||
h.type = NULL;
|
||||
h.dwords = 4 - mod;
|
||||
h.slots = 0;
|
||||
|
||||
if (no_compact) {
|
||||
order->push_back(d);
|
||||
order->push_back(h);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (d.dwords > 4) {
|
||||
order->push_back(d);
|
||||
order->push_back(h);
|
||||
holes[h.dwords].push_back(location);
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(d.dwords < 4);
|
||||
|
||||
unsigned found = 0;
|
||||
/* try to find the smallest hole big enough to hold this attribute */
|
||||
for (unsigned sz = d.dwords; sz < 4; sz++){
|
||||
if (!holes[sz].empty()) {
|
||||
found = sz;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* append at the end if not found */
|
||||
if (found == 0) {
|
||||
order->push_back(d);
|
||||
order->push_back(h);
|
||||
holes[h.dwords].push_back(location);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(found < 4);
|
||||
assert(!holes[found].empty());
|
||||
int after_loc = holes[found].back();
|
||||
holes[found].pop_back();
|
||||
|
||||
bool inserted_back = false;
|
||||
|
||||
for (auto it = order->begin(); it != order->end(); ++it) {
|
||||
if ((*it).location != after_loc)
|
||||
continue;
|
||||
|
||||
++it;
|
||||
/* must be a hole */
|
||||
assert((*it).location < 0);
|
||||
/* and it must be big enough */
|
||||
assert(d.dwords <= (*it).dwords);
|
||||
|
||||
if (d.dwords == (*it).dwords) {
|
||||
/* exact size, just replace */
|
||||
*it = d;
|
||||
} else {
|
||||
/* inexact size, shrink hole */
|
||||
(*it).dwords -= d.dwords;
|
||||
/* and insert new attribute before it */
|
||||
order->insert(it, d);
|
||||
|
||||
/* Insert shrunk hole in a spot so that the order of attributes
|
||||
* is preserved.
|
||||
*/
|
||||
std::list<int> &hole_list = holes[(*it).dwords];
|
||||
std::list<int>::iterator insert_before = hole_list.end();
|
||||
|
||||
for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
|
||||
if ((*it2) >= (int)location) {
|
||||
insert_before = it2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
hole_list.insert(insert_before, location);
|
||||
}
|
||||
|
||||
inserted_back = true;
|
||||
break;
|
||||
}
|
||||
|
||||
assert(inserted_back);
|
||||
}
|
||||
}
|
||||
|
||||
/* Mesh URB Entry consists of an initial section
|
||||
*
|
||||
* - Primitive Count
|
||||
@@ -443,8 +663,8 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
|
||||
{
|
||||
memset(map, 0, sizeof(*map));
|
||||
|
||||
for (int i = 0; i < VARYING_SLOT_MAX; i++)
|
||||
map->start_dw[i] = -1;
|
||||
memset(&map->start_dw[0], -1, sizeof(map->start_dw));
|
||||
memset(&map->len_dw[0], 0, sizeof(map->len_dw));
|
||||
|
||||
unsigned vertices_per_primitive =
|
||||
num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
|
||||
@@ -454,16 +674,6 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
|
||||
|
||||
uint64_t outputs_written = nir->info.outputs_written;
|
||||
|
||||
/* Assign initial section. */
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
|
||||
}
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
|
||||
}
|
||||
|
||||
/* One dword for primitives count then K extra dwords for each primitive. */
|
||||
switch (index_format) {
|
||||
case BRW_INDEX_FORMAT_U32:
|
||||
@@ -479,86 +689,157 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
|
||||
map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
|
||||
map->max_primitives + 1, 8);
|
||||
|
||||
/* TODO(mesh): Multiview. */
|
||||
map->per_primitive_header_size_dw =
|
||||
(nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
|
||||
/* Assign initial section. */
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
|
||||
}
|
||||
if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
|
||||
map->per_primitive_indices_dw * map->max_primitives;
|
||||
outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
|
||||
}
|
||||
|
||||
map->per_primitive_data_size_dw = 0;
|
||||
u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
|
||||
assert(map->start_dw[location] == -1);
|
||||
const uint64_t per_primitive_header_bits =
|
||||
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_LAYER) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
|
||||
|
||||
unsigned start;
|
||||
switch (location) {
|
||||
case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
|
||||
start = map->per_primitive_start_dw + 0;
|
||||
break;
|
||||
case VARYING_SLOT_LAYER:
|
||||
start = map->per_primitive_start_dw + 1; /* RTAIndex */
|
||||
break;
|
||||
case VARYING_SLOT_VIEWPORT:
|
||||
start = map->per_primitive_start_dw + 2;
|
||||
break;
|
||||
case VARYING_SLOT_CULL_PRIMITIVE:
|
||||
start = map->per_primitive_start_dw + 3;
|
||||
break;
|
||||
default:
|
||||
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
|
||||
location >= VARYING_SLOT_VAR0);
|
||||
start = map->per_primitive_start_dw +
|
||||
map->per_primitive_header_size_dw +
|
||||
map->per_primitive_data_size_dw;
|
||||
map->per_primitive_data_size_dw += 4;
|
||||
break;
|
||||
const uint64_t per_vertex_header_bits =
|
||||
BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_POS) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
|
||||
BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
|
||||
|
||||
std::list<struct attr_desc> orders[3];
|
||||
uint64_t regular_outputs = outputs_written &
|
||||
~(per_primitive_header_bits | per_vertex_header_bits);
|
||||
brw_compute_mue_layout(orders, regular_outputs, nir);
|
||||
|
||||
if (outputs_written & per_primitive_header_bits) {
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
|
||||
map->per_primitive_start_dw + 0;
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
|
||||
}
|
||||
|
||||
map->start_dw[location] = start;
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
|
||||
map->start_dw[VARYING_SLOT_LAYER] =
|
||||
map->per_primitive_start_dw + 1; /* RTAIndex */
|
||||
map->len_dw[VARYING_SLOT_LAYER] = 1;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
|
||||
map->start_dw[VARYING_SLOT_VIEWPORT] =
|
||||
map->per_primitive_start_dw + 2;
|
||||
map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
|
||||
map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
|
||||
map->per_primitive_start_dw + 3;
|
||||
map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
|
||||
}
|
||||
|
||||
map->per_primitive_header_size_dw = 8;
|
||||
outputs_written &= ~per_primitive_header_bits;
|
||||
} else {
|
||||
map->per_primitive_header_size_dw = 0;
|
||||
}
|
||||
|
||||
map->per_primitive_data_size_dw = 0;
|
||||
|
||||
unsigned start_dw = map->per_primitive_start_dw +
|
||||
map->per_primitive_header_size_dw;
|
||||
for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
|
||||
int location = (*it).location;
|
||||
if (location < 0) {
|
||||
start_dw += (*it).dwords;
|
||||
map->per_primitive_data_size_dw += (*it).dwords;
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
|
||||
assert(location == VARYING_SLOT_PRIMITIVE_ID ||
|
||||
location >= VARYING_SLOT_VAR0);
|
||||
|
||||
brw_mue_assign_position(&*it, map, start_dw);
|
||||
|
||||
start_dw += (*it).dwords;
|
||||
map->per_primitive_data_size_dw += (*it).dwords;
|
||||
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
|
||||
}
|
||||
|
||||
map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
|
||||
map->per_primitive_data_size_dw, 8);
|
||||
|
||||
map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
|
||||
map->per_primitive_pitch_dw * map->max_primitives, 8);
|
||||
map->per_primitive_pitch_dw *
|
||||
map->max_primitives, 8);
|
||||
|
||||
/* TODO(mesh): Multiview. */
|
||||
unsigned fixed_header_size = 8;
|
||||
map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
|
||||
nir->info.clip_distance_array_size +
|
||||
nir->info.cull_distance_array_size, 8);
|
||||
map->per_vertex_data_size_dw = 0;
|
||||
u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
|
||||
assert(map->start_dw[location] == -1);
|
||||
|
||||
unsigned start;
|
||||
switch (location) {
|
||||
case VARYING_SLOT_PSIZ:
|
||||
start = map->per_vertex_start_dw + 3;
|
||||
break;
|
||||
case VARYING_SLOT_POS:
|
||||
start = map->per_vertex_start_dw + 4;
|
||||
break;
|
||||
case VARYING_SLOT_CLIP_DIST0:
|
||||
start = map->per_vertex_start_dw + fixed_header_size + 0;
|
||||
break;
|
||||
case VARYING_SLOT_CLIP_DIST1:
|
||||
start = map->per_vertex_start_dw + fixed_header_size + 4;
|
||||
break;
|
||||
case VARYING_SLOT_CULL_DIST0:
|
||||
case VARYING_SLOT_CULL_DIST1:
|
||||
unreachable("cull distances should be lowered earlier");
|
||||
break;
|
||||
default:
|
||||
assert(location >= VARYING_SLOT_VAR0);
|
||||
start = map->per_vertex_start_dw +
|
||||
map->per_vertex_header_size_dw +
|
||||
map->per_vertex_data_size_dw;
|
||||
map->per_vertex_data_size_dw += 4;
|
||||
break;
|
||||
if (outputs_written & per_vertex_header_bits) {
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
|
||||
map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
|
||||
map->len_dw[VARYING_SLOT_PSIZ] = 1;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
|
||||
map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
|
||||
map->len_dw[VARYING_SLOT_POS] = 4;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
|
||||
map->start_dw[VARYING_SLOT_CLIP_DIST0] =
|
||||
map->per_vertex_start_dw + fixed_header_size + 0;
|
||||
map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
|
||||
}
|
||||
|
||||
if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
|
||||
map->start_dw[VARYING_SLOT_CLIP_DIST1] =
|
||||
map->per_vertex_start_dw + fixed_header_size + 4;
|
||||
map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
|
||||
}
|
||||
|
||||
outputs_written &= ~per_vertex_header_bits;
|
||||
}
|
||||
|
||||
/* cull distances should be lowered earlier */
|
||||
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
|
||||
assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
|
||||
|
||||
map->per_vertex_data_size_dw = 0;
|
||||
|
||||
start_dw = map->per_vertex_start_dw +
|
||||
map->per_vertex_header_size_dw;
|
||||
for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
|
||||
for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
|
||||
int location = (*it).location;
|
||||
if (location < 0) {
|
||||
start_dw += (*it).dwords;
|
||||
map->per_vertex_data_size_dw += (*it).dwords;
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(map->start_dw[location] == -1);
|
||||
|
||||
assert(location >= VARYING_SLOT_VAR0);
|
||||
|
||||
brw_mue_assign_position(&*it, map, start_dw);
|
||||
|
||||
start_dw += (*it).dwords;
|
||||
map->per_vertex_data_size_dw += (*it).dwords;
|
||||
outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
|
||||
}
|
||||
map->start_dw[location] = start;
|
||||
}
|
||||
|
||||
map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
|
||||
@@ -571,14 +852,18 @@ brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map,
|
||||
}
|
||||
|
||||
static void
|
||||
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
|
||||
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
|
||||
{
|
||||
fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
|
||||
map->size_dw, map->max_primitives, map->max_vertices);
|
||||
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
|
||||
fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
|
||||
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
|
||||
fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
|
||||
map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
|
||||
map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
|
||||
|
||||
fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
|
||||
map->per_primitive_start_dw,
|
||||
@@ -589,13 +874,20 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
|
||||
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (map->start_dw[i] < 0)
|
||||
continue;
|
||||
|
||||
const unsigned offset = map->start_dw[i];
|
||||
if (offset >= map->per_primitive_start_dw &&
|
||||
offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
|
||||
fprintf(fp, " %4d: %s\n", offset,
|
||||
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
||||
MESA_SHADER_MESH));
|
||||
}
|
||||
const unsigned len = map->len_dw[i];
|
||||
|
||||
if (offset < map->per_primitive_start_dw ||
|
||||
offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
|
||||
continue;
|
||||
|
||||
const char *name =
|
||||
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
||||
MESA_SHADER_MESH);
|
||||
|
||||
fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
|
||||
name, i);
|
||||
}
|
||||
|
||||
fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
|
||||
@@ -607,13 +899,24 @@ brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
|
||||
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (map->start_dw[i] < 0)
|
||||
continue;
|
||||
|
||||
const unsigned offset = map->start_dw[i];
|
||||
if (offset >= map->per_vertex_start_dw &&
|
||||
offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
|
||||
fprintf(fp, " %4d: %s\n", offset,
|
||||
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
||||
MESA_SHADER_MESH));
|
||||
}
|
||||
const unsigned len = map->len_dw[i];
|
||||
|
||||
if (offset < map->per_vertex_start_dw ||
|
||||
offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
|
||||
continue;
|
||||
|
||||
nir_variable *var =
|
||||
nir_find_variable_with_location(nir, nir_var_shader_out, i);
|
||||
bool flat = var->data.interpolation == INTERP_MODE_FLAT;
|
||||
|
||||
const char *name =
|
||||
gl_varying_slot_name_for_stage((gl_varying_slot)i,
|
||||
MESA_SHADER_MESH);
|
||||
|
||||
fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
|
||||
name, i, flat ? " (flat)" : "");
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
@@ -1070,7 +1373,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
||||
brw_print_tue_map(stderr, params->tue_map);
|
||||
}
|
||||
fprintf(stderr, "Mesh Output ");
|
||||
brw_print_mue_map(stderr, &prog_data->map);
|
||||
brw_print_mue_map(stderr, &prog_data->map, nir);
|
||||
}
|
||||
|
||||
fs_generator g(compiler, ¶ms->base, &prog_data->base.base,
|
||||
|
@@ -2085,6 +2085,21 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
|
||||
return sysval;
|
||||
}
|
||||
|
||||
const struct glsl_type *
|
||||
brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
|
||||
{
|
||||
const struct glsl_type *type = var->interface_type;
|
||||
if (!type) {
|
||||
type = var->type;
|
||||
if (nir_is_arrayed_io(var, nir->info.stage) || var->data.per_view) {
|
||||
assert(glsl_type_is_array(type));
|
||||
type = glsl_get_array_element(type);
|
||||
}
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_nir_pulls_at_sample(nir_shader *shader)
|
||||
{
|
||||
|
@@ -283,6 +283,9 @@ nir_ssa_def *brw_nir_load_global_const(nir_builder *b,
|
||||
nir_ssa_def *base_addr,
|
||||
unsigned off);
|
||||
|
||||
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
|
||||
nir_variable *var);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user