intel/compiler: Handle per-primitive inputs in FS

In the Fragment Shader, regular inputs are laid out in the thread payload
with one dword per half-GRF, which leaves room for the two delta dwords
needed for interpolation.

Per-primitive inputs are laid out before the regular inputs, and since
there's no need to have delta information, they are packed.  So each
half-GRF will be fully filled with 4 dwords of input.

When num_per_primitive_inputs is zero (the default case), behavior
should be the same as before.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13661>
This commit is contained in:
Caio Oliveira
2021-05-18 10:17:43 -07:00
committed by Marge Bot
parent 7938c38778
commit be89ea3231
5 changed files with 96 additions and 15 deletions

View File

@@ -848,6 +848,7 @@ enum brw_pixel_shader_computed_depth_mode {
struct brw_wm_prog_data { struct brw_wm_prog_data {
struct brw_stage_prog_data base; struct brw_stage_prog_data base;
GLuint num_per_primitive_inputs;
GLuint num_varying_inputs; GLuint num_varying_inputs;
uint8_t reg_blocks_8; uint8_t reg_blocks_8;

View File

@@ -1871,10 +1871,31 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
int urb_next = 0; int urb_next = 0;
/* Per-Primitive Attributes are laid out by Hardware before the regular
* attributes, so order them like this to make it easy to map setup into
* real HW registers later.
*/
if (nir->info.per_primitive_inputs) {
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
if (nir->info.per_primitive_inputs & BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
}
/* The actual setup attributes later must be aligned to a full GRF. */
urb_next = ALIGN(urb_next, 2);
prog_data->num_per_primitive_inputs = urb_next;
}
const uint64_t inputs_read =
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
/* Figure out where each of the incoming setup attributes lands. */ /* Figure out where each of the incoming setup attributes lands. */
if (devinfo->ver >= 6) { if (devinfo->ver >= 6) {
if (util_bitcount64(nir->info.inputs_read & if (util_bitcount64(inputs_read &
BRW_FS_VARYING_INPUT_MASK) <= 16) { BRW_FS_VARYING_INPUT_MASK) <= 16) {
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
* first 16 varying inputs, so we can put them wherever we want. * first 16 varying inputs, so we can put them wherever we want.
* Just put them in order. * Just put them in order.
@@ -1885,7 +1906,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
* a different vertex (or geometry) shader. * a different vertex (or geometry) shader.
*/ */
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & if (inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(i)) { BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++; prog_data->urb_setup[i] = urb_next++;
} }
@@ -1897,6 +1918,11 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
* (geometry or vertex shader). * (geometry or vertex shader).
*/ */
/* TODO(mesh): Implement this case for Mesh. Basically, having a large
* number of outputs in Mesh (hence a lot of inputs in Fragment)
* should already trigger this.
*/
/* Re-compute the VUE map here in the case that the one coming from /* Re-compute the VUE map here in the case that the one coming from
* geometry has more than one position slot (used for Primitive * geometry has more than one position slot (used for Primitive
* Replication). * Replication).
@@ -1907,7 +1933,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
nir->info.separate_shader, 1); nir->info.separate_shader, 1);
int first_slot = int first_slot =
brw_compute_first_urb_slot_required(nir->info.inputs_read, brw_compute_first_urb_slot_required(inputs_read,
&prev_stage_vue_map); &prev_stage_vue_map);
assert(prev_stage_vue_map.num_slots <= first_slot + 32); assert(prev_stage_vue_map.num_slots <= first_slot + 32);
@@ -1915,7 +1941,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
slot++) { slot++) {
int varying = prev_stage_vue_map.slot_to_varying[slot]; int varying = prev_stage_vue_map.slot_to_varying[slot];
if (varying != BRW_VARYING_SLOT_PAD && if (varying != BRW_VARYING_SLOT_PAD &&
(nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & (inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(varying))) { BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot; prog_data->urb_setup[varying] = slot - first_slot;
} }
@@ -1948,12 +1974,12 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
* *
* See compile_sf_prog() for more info. * See compile_sf_prog() for more info.
*/ */
if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
} }
prog_data->num_varying_inputs = urb_next; prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
prog_data->inputs = nir->info.inputs_read; prog_data->inputs = inputs_read;
brw_compute_urb_setup_index(prog_data); brw_compute_urb_setup_index(prog_data);
} }
@@ -1995,6 +2021,12 @@ fs_visitor::assign_urb_setup()
/* Each attribute is 4 setup channels, each of which is half a reg. */ /* Each attribute is 4 setup channels, each of which is half a reg. */
this->first_non_payload_grf += prog_data->num_varying_inputs * 2; this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
/* Unlike regular attributes, per-primitive attributes have all 4 channels
* in the same slot, so each GRF can store two slots.
*/
assert(prog_data->num_per_primitive_inputs % 2 == 0);
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
} }
void void

View File

@@ -332,6 +332,7 @@ public:
fs_reg get_timestamp(const brw::fs_builder &bld); fs_reg get_timestamp(const brw::fs_builder &bld);
fs_reg interp_reg(int location, int channel); fs_reg interp_reg(int location, int channel);
fs_reg per_primitive_reg(int location);
virtual void dump_instructions() const; virtual void dump_instructions() const;
virtual void dump_instructions(const char *name) const; virtual void dump_instructions(const char *name) const;

View File

@@ -3620,21 +3620,33 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
} }
case nir_intrinsic_load_input: { case nir_intrinsic_load_input: {
/* load_input is only used for flat inputs */ /* In Fragment Shaders load_input is used either for flat inputs or
* per-primitive inputs.
*/
assert(nir_dest_bit_size(instr->dest) == 32); assert(nir_dest_bit_size(instr->dest) == 32);
unsigned base = nir_intrinsic_base(instr); unsigned base = nir_intrinsic_base(instr);
unsigned comp = nir_intrinsic_component(instr); unsigned comp = nir_intrinsic_component(instr);
unsigned num_components = instr->num_components; unsigned num_components = instr->num_components;
/* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
/* Special case fields in the VUE header */ /* Special case fields in the VUE header */
if (base == VARYING_SLOT_LAYER) if (base == VARYING_SLOT_LAYER)
comp = 1; comp = 1;
else if (base == VARYING_SLOT_VIEWPORT) else if (base == VARYING_SLOT_VIEWPORT)
comp = 2; comp = 2;
for (unsigned int i = 0; i < num_components; i++) { if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) {
bld.MOV(offset(dest, bld, i), assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
retype(component(interp_reg(base, comp + i), 3), dest.type)); for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(component(per_primitive_reg(base), comp + i), dest.type));
}
} else {
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(component(interp_reg(base, comp + i), 3), dest.type));
}
} }
break; break;
} }

View File

@@ -136,6 +136,11 @@ fs_visitor::emit_dummy_fs()
calculate_cfg(); calculate_cfg();
} }
/* Input data is organized with the per-primitive values first, followed
* by the per-vertex values. The per-vertex values have interpolation
* information associated, so use 4 components for each value.
*/
/* The register location here is relative to the start of the URB /* The register location here is relative to the start of the URB
* data. It will get adjusted to be a real location before * data. It will get adjusted to be a real location before
* generate_code() time. * generate_code() time.
@@ -144,9 +149,39 @@ fs_reg
fs_visitor::interp_reg(int location, int channel) fs_visitor::interp_reg(int location, int channel)
{ {
assert(stage == MESA_SHADER_FRAGMENT); assert(stage == MESA_SHADER_FRAGMENT);
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);
int regnr = prog_data->urb_setup[location] * 4 + channel;
assert(prog_data->urb_setup[location] != -1); const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
assert(prog_data->urb_setup[location] >= 0);
unsigned nr = prog_data->urb_setup[location];
/* Adjust so we start counting from the first per_vertex input. */
assert(nr >= prog_data->num_per_primitive_inputs);
nr -= prog_data->num_per_primitive_inputs;
const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
const unsigned regnr = per_vertex_start + (nr * 4) + channel;
return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
}
/* The register location here is relative to the start of the URB
* data. It will get adjusted to be a real location before
* generate_code() time.
*/
fs_reg
fs_visitor::per_primitive_reg(int location)
{
assert(stage == MESA_SHADER_FRAGMENT);
assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
assert(prog_data->urb_setup[location] >= 0);
const unsigned regnr = prog_data->urb_setup[location];
assert(regnr < prog_data->num_per_primitive_inputs);
return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F); return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
} }