intel/elk: Remove Gfx9+ from nir conversion

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
Author:    Caio Oliveira
Date:      2024-02-10 20:38:12 -08:00
Committer: Marge Bot
Parent:    6b6de68b1c
Commit:    cb2d96af6a


@@ -70,7 +70,6 @@ static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld,
static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
static elk_fs_reg emit_shading_rate_setup(nir_to_elk_state &ntb);
static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
@@ -138,15 +137,13 @@ fs_nir_setup_outputs(nir_to_elk_state &ntb)
static void
fs_nir_setup_uniforms(elk_fs_visitor &s)
{
const intel_device_info *devinfo = s.devinfo;
/* Only the first compile gets to set up uniforms. */
if (s.push_constant_loc)
return;
s.uniforms = s.nir->num_uniforms / 4;
if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
if (gl_shader_stage_is_compute(s.stage)) {
/* Add uniforms for builtins after regular NIR uniforms. */
assert(s.uniforms == s.prog_data->nr_params);
@@ -277,8 +274,7 @@ emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
* stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
* gfx6+.
*/
const struct elk_reg reg = s.devinfo->ver >= 20 ?
xe2_vec1_grf(i, 15) : elk_vec1_grf(i + 1, 7);
const struct elk_reg reg = elk_vec1_grf(i + 1, 7);
hbld.SHR(offset(shifted, hbld, i),
stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
elk_imm_v(0x76543210));
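
Note: the immediate used as the per-channel shift count above is a packed vector. elk_imm_v() encodes eight 4-bit lanes, lane 0 in the lowest nibble, so 0x76543210 gives each channel its own shift count (0 through 7, repeating for wider execution). A minimal standalone sketch of that expansion (an illustration only, not the compiler's decoder, and ignoring sign extension since all nibbles here are 0-7):

#include <assert.h>
#include <stdint.h>

/* Expand one lane of a GEN "V" vector immediate such as 0x76543210:
 * lane 0 comes from bits 3:0, lane 7 from bits 31:28.
 */
static inline unsigned v_imm_lane(uint32_t v_imm, unsigned lane)
{
   assert(lane < 8);
   return (v_imm >> (4 * lane)) & 0xf;
}
/* v_imm_lane(0x76543210, 0) == 0 ... v_imm_lane(0x76543210, 7) == 7 */
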
@@ -310,12 +306,6 @@ emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
}
break;
case nir_intrinsic_load_frag_shading_rate:
reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
if (reg->file == BAD_FILE)
*reg = emit_shading_rate_setup(ntb);
break;
default:
break;
}
@@ -545,66 +535,7 @@ optimize_frontfacing_ternary(nir_to_elk_state &ntb,
elk_fs_reg tmp = s.vgrf(glsl_int_type());
if (devinfo->ver >= 20) {
/* Gfx20+ has separate back-facing bits for each pair of
* subspans in order to support multiple polygons, so we need to
* use a <1;8,0> region in order to select the correct word for
* each channel. Unfortunately they're no longer aligned to the
* sign bit of a 16-bit word, so a left shift is necessary.
*/
elk_fs_reg ff = ntb.bld.vgrf(ELK_REGISTER_TYPE_UW);
for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
const fs_builder hbld = ntb.bld.group(16, i);
const struct elk_reg gi_uw = retype(xe2_vec1_grf(i, 9),
ELK_REGISTER_TYPE_UW);
hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), elk_imm_ud(4));
}
if (value1 == -1.0f)
ff.negate = true;
ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_UW, 1), ff,
elk_imm_uw(0x3f80));
} else if (devinfo->ver >= 12 && s.max_polygons == 2) {
/* According to the BSpec "PS Thread Payload for Normal
* Dispatch", the front/back facing interpolation bit is stored
* as bit 15 of either the R1.1 or R1.6 poly info field, for the
* first and second polygons respectively in multipolygon PS
* dispatch mode.
*/
assert(s.dispatch_width == 16);
for (unsigned i = 0; i < s.max_polygons; i++) {
const fs_builder hbld = ntb.bld.group(8, i);
struct elk_reg g1 = retype(elk_vec1_grf(1, 1 + 5 * i),
ELK_REGISTER_TYPE_UW);
if (value1 == -1.0f)
g1.negate = true;
hbld.OR(subscript(offset(tmp, hbld, i), ELK_REGISTER_TYPE_UW, 1),
g1, elk_imm_uw(0x3f80));
}
} else if (devinfo->ver >= 12) {
/* Bit 15 of g1.1 is 0 if the polygon is front facing. */
elk_fs_reg g1 = elk_fs_reg(retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_W));
/* For (gl_FrontFacing ? 1.0 : -1.0), emit:
*
* or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W
* and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
*
* and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
*/
if (value1 == -1.0f)
g1.negate = true;
ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
g1, elk_imm_uw(0x3f80));
} else if (devinfo->ver >= 6) {
if (devinfo->ver >= 6) {
/* Bit 15 of g0.0 is 0 if the polygon is front facing. */
elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
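
Note: both facing paths build ±1.0f with the bit trick spelled out in the removed Gfx12 comment above: 0x3f800000 is the IEEE-754 pattern of 1.0f, so placing 0x3f80 in the high word, letting the facing flag supply the sign bit, and masking with 0xbf800000 yields exactly 1.0f or -1.0f. A self-contained check of that arithmetic (illustrative only; the helper name is made up):

#include <stdint.h>
#include <string.h>

/* Mirror the or/and sequence from the comment above: put 0x3f80 in the high
 * word, take the facing flag as the sign bit, mask with 0xbf800000, and
 * reinterpret the bits as a float.  Yields 1.0f for front, -1.0f for back.
 */
static float facing_to_float(unsigned backfacing_bit)   /* 0 = front, 1 = back */
{
   uint32_t bits = ((0x3f80u | (backfacing_bit << 15)) << 16) & 0xbf800000u;
   float f;
   memcpy(&f, &bits, sizeof f);
   return f;
}
/* facing_to_float(0) == 1.0f, facing_to_float(1) == -1.0f */
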
@@ -775,7 +706,7 @@ try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
{
const intel_device_info *devinfo = bld.shader->devinfo;
if (devinfo->ver < 6 || devinfo->verx10 >= 125)
if (devinfo->ver < 6)
return false;
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
@@ -2469,7 +2400,6 @@ emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
unsigned num_components,
unsigned first_component)
{
const intel_device_info *devinfo = ntb.devinfo;
const fs_builder &bld = ntb.bld;
elk_fs_visitor &s = ntb.s;
@@ -2540,7 +2470,7 @@ emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
if (nir_src_is_const(vertex_src)) {
unsigned vertex = nir_src_as_uint(vertex_src);
assert(devinfo->ver >= 9 || vertex <= 5);
assert(vertex <= 5);
bld.MOV(icp_handle, component(start, vertex));
} else {
/* The vertex index is non-constant. We need to use indirect
@@ -2780,20 +2710,6 @@ get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
return icp_handle;
}
static void
setup_barrier_message_payload_gfx125(const fs_builder &bld,
const elk_fs_reg &msg_payload)
{
assert(bld.shader->devinfo->verx10 >= 125);
/* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
elk_fs_reg m0_10ub = component(retype(msg_payload, ELK_REGISTER_TYPE_UB), 10);
elk_fs_reg r0_11ub =
stride(suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UB), 11),
0, 1, 0);
bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
}
static void
emit_barrier(nir_to_elk_state &ntb)
{
@@ -2809,31 +2725,22 @@ emit_barrier(nir_to_elk_state &ntb)
/* Clear the message payload */
bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
if (devinfo->verx10 >= 125) {
setup_barrier_message_payload_gfx125(bld, payload);
} else {
assert(gl_shader_stage_is_compute(s.stage));
assert(gl_shader_stage_is_compute(s.stage));
uint32_t barrier_id_mask;
switch (devinfo->ver) {
case 7:
case 8:
barrier_id_mask = 0x0f000000u; break;
case 9:
barrier_id_mask = 0x8f000000u; break;
case 11:
case 12:
barrier_id_mask = 0x7f000000u; break;
default:
unreachable("barrier is only available on gen >= 7");
}
/* Copy the barrier id from r0.2 to the message payload reg.2 */
elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
elk_imm_ud(barrier_id_mask));
uint32_t barrier_id_mask;
switch (devinfo->ver) {
case 7:
case 8:
barrier_id_mask = 0x0f000000u; break;
default:
unreachable("barrier is only available on gen >= 7");
}
/* Copy the barrier id from r0.2 to the message payload reg.2 */
elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
elk_imm_ud(barrier_id_mask));
/* Emit a gateway "barrier" message using the payload we set up, followed
* by a wait instruction.
*/
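
Note: in the surviving compute-barrier path above, the AND keeps the barrier ID field of r0.2 in place: the 0x0f000000 mask covers bits 27:24, which is also where the gateway message payload reads it, so no shift is needed. A hedged one-line sketch of that extraction (field position inferred from the mask in the code, not quoted from a spec):

#include <stdint.h>

/* Keep bits 27:24 of r0.2 for the Gfx7/8 gateway message payload dword 2. */
static inline uint32_t gfx7_barrier_payload_dw2(uint32_t r0_dw2)
{
   return r0_dw2 & 0x0f000000u;
}
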
@@ -2843,7 +2750,6 @@ emit_barrier(nir_to_elk_state &ntb)
static void
emit_tcs_barrier(nir_to_elk_state &ntb)
{
const intel_device_info *devinfo = ntb.devinfo;
const fs_builder &bld = ntb.bld;
elk_fs_visitor &s = ntb.s;
@@ -2858,27 +2764,16 @@ emit_tcs_barrier(nir_to_elk_state &ntb)
/* Zero the message header */
bld.exec_all().MOV(m0, elk_imm_ud(0u));
if (devinfo->verx10 >= 125) {
setup_barrier_message_payload_gfx125(bld, m0);
} else if (devinfo->ver >= 11) {
chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
elk_imm_ud(INTEL_MASK(30, 24)));
/* Copy "Barrier ID" from r0.2, bits 16:13 */
chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
elk_imm_ud(INTEL_MASK(16, 13)));
/* Set the Barrier Count and the enable bit */
chanbld.OR(m0_2, m0_2,
elk_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
} else {
/* Copy "Barrier ID" from r0.2, bits 16:13 */
chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
elk_imm_ud(INTEL_MASK(16, 13)));
/* Shift it up to bits 27:24. */
chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
/* Shift it up to bits 27:24. */
chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
/* Set the Barrier Count and the enable bit */
chanbld.OR(m0_2, m0_2,
elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
}
/* Set the Barrier Count and the enable bit */
chanbld.OR(m0_2, m0_2,
elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
}
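
Note: a quick arithmetic check on the remaining pre-Gfx11 TCS path above: the barrier ID is read from bits 16:13 of r0.2, the SHL by 11 relocates it to bits 27:24 (13 + 11 = 24, 16 + 11 = 27), and the final OR places the instance count at bit 9 upward with bit 15 as the enable. A small sketch of the same m0.2 assembly (field positions taken from the code itself; INTEL_MASK is the helper macro already used above):

/* Rebuild m0.2 the way the pre-Gfx11 path does: relocate the barrier ID
 * from bits 16:13 to bits 27:24, then add the instance count and the
 * barrier-enable bit.
 */
static inline uint32_t tcs_barrier_m0_dw2(uint32_t r0_dw2, uint32_t instances)
{
   uint32_t id = (r0_dw2 & INTEL_MASK(16, 13)) << 11;   /* -> bits 27:24 */
   return id | (instances << 9) | (1u << 15);
}
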
@@ -3075,25 +2970,23 @@ fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
mask = mask << first_component;
const bool has_urb_lsc = devinfo->ver >= 20;
elk_fs_reg mask_reg;
if (mask != WRITEMASK_XYZW)
mask_reg = elk_imm_ud(mask << 16);
elk_fs_reg sources[4];
unsigned m = has_urb_lsc ? 0 : first_component;
unsigned m = first_component;
for (unsigned i = 0; i < num_components; i++) {
int c = i + first_component;
if (mask & (1 << c)) {
sources[m++] = offset(value, bld, i);
} else if (devinfo->ver < 20) {
} else {
m++;
}
}
assert(has_urb_lsc || m == (first_component + num_components));
assert(m == (first_component + num_components));
elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
@@ -3286,51 +3179,7 @@ fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
static elk_fs_reg
fetch_render_target_array_index(const fs_builder &bld)
{
const elk_fs_visitor *v = static_cast<const elk_fs_visitor *>(bld.shader);
if (bld.shader->devinfo->ver >= 20) {
/* Gfx20+ has separate Render Target Array indices for each pair
* of subspans in order to support multiple polygons, so we need
* to use a <1;8,0> region in order to select the correct word
* for each channel.
*/
const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
const fs_builder hbld = bld.group(16, i);
const struct elk_reg reg = retype(elk_vec1_grf(2 * i + 1, 1),
ELK_REGISTER_TYPE_UW);
hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
elk_imm_uw(0x7ff));
}
return idx;
} else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
/* According to the BSpec "PS Thread Payload for Normal
* Dispatch", the render target array index is stored as bits
* 26:16 of either the R1.1 or R1.6 poly info dwords, for the
* first and second polygons respectively in multipolygon PS
* dispatch mode.
*/
assert(bld.dispatch_width() == 16);
const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
for (unsigned i = 0; i < v->max_polygons; i++) {
const fs_builder hbld = bld.group(8, i);
const struct elk_reg g1 = elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 1, 3 + 10 * i);
hbld.AND(offset(idx, hbld, i), g1, elk_imm_uw(0x7ff));
}
return idx;
} else if (bld.shader->devinfo->ver >= 12) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r1.1.
*/
const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 1, 3),
elk_imm_uw(0x7ff));
return idx;
} else if (bld.shader->devinfo->ver >= 6) {
if (bld.shader->devinfo->ver >= 6) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r0.0.
*/
@@ -3385,7 +3234,6 @@ emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const el
unsigned target)
{
elk_fs_visitor &s = ntb.s;
const struct intel_device_info *devinfo = s.devinfo;
assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
const elk_wm_prog_key *wm_key =
@@ -3420,18 +3268,7 @@ emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const el
*/
elk_opcode op;
if (wm_key->multisample_fbo) {
/* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
* multisampling, it should be equivalent to the normal CMS fetch for
* lower multisampling modes.
*
* On Gfx12HP, there is only CMS_W variant available.
*/
if (devinfo->verx10 >= 125)
op = ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
else if (devinfo->ver >= 9)
op = ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL;
else
op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
} else {
op = ELK_SHADER_OPCODE_TXF_LOGICAL;
}
@@ -3571,53 +3408,10 @@ emit_frontfacing_interpolation(nir_to_elk_state &ntb)
{
const intel_device_info *devinfo = ntb.devinfo;
const fs_builder &bld = ntb.bld;
elk_fs_visitor &s = ntb.s;
elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
if (devinfo->ver >= 20) {
/* Gfx20+ has separate back-facing bits for each pair of
* subspans in order to support multiple polygons, so we need to
* use a <1;8,0> region in order to select the correct word for
* each channel.
*/
const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UW);
for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
const fs_builder hbld = bld.group(16, i);
const struct elk_reg gi_uw = retype(xe2_vec1_grf(i, 9),
ELK_REGISTER_TYPE_UW);
hbld.AND(offset(tmp, hbld, i), gi_uw, elk_imm_uw(0x800));
}
bld.CMP(ff, tmp, elk_imm_uw(0), ELK_CONDITIONAL_Z);
} else if (devinfo->ver >= 12 && s.max_polygons == 2) {
/* According to the BSpec "PS Thread Payload for Normal
* Dispatch", the front/back facing interpolation bit is stored
* as bit 15 of either the R1.1 or R1.6 poly info field, for the
* first and second polygons respectively in multipolygon PS
* dispatch mode.
*/
assert(s.dispatch_width == 16);
elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_W);
for (unsigned i = 0; i < s.max_polygons; i++) {
const fs_builder hbld = bld.group(8, i);
const struct elk_reg g1 = retype(elk_vec1_grf(1, 1 + 5 * i),
ELK_REGISTER_TYPE_W);
hbld.ASR(offset(tmp, hbld, i), g1, elk_imm_d(15));
}
bld.NOT(ff, tmp);
} else if (devinfo->ver >= 12) {
elk_fs_reg g1 = elk_fs_reg(retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_W));
elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_W);
bld.ASR(tmp, g1, elk_imm_d(15));
bld.NOT(ff, tmp);
} else if (devinfo->ver >= 6) {
if (devinfo->ver >= 6) {
/* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
* a boolean result from this (~0/true or 0/false).
*
@@ -3766,11 +3560,9 @@ emit_sampleid_setup(nir_to_elk_state &ntb)
for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
/* According to the "PS Thread Payload for Normal Dispatch"
* pages on the BSpec, the sample ids are stored in R0.8/R1.8
* on gfx20+ and in R1.0/R2.0 on gfx8+.
* pages on the BSpec, the sample ids are stored in R1.0/R2.0 on gfx8+.
*/
const struct elk_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
elk_vec1_grf(i + 1, 0);
const struct elk_reg id_reg = elk_vec1_grf(i + 1, 0);
hbld.SHR(offset(tmp, hbld, i),
stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
elk_imm_v(0x44440000));
@@ -3887,56 +3679,6 @@ emit_samplemaskin_setup(nir_to_elk_state &ntb)
return mask;
}
static elk_fs_reg
emit_shading_rate_setup(nir_to_elk_state &ntb)
{
const intel_device_info *devinfo = ntb.devinfo;
const fs_builder &bld = ntb.bld;
assert(devinfo->ver >= 11);
struct elk_wm_prog_data *wm_prog_data =
elk_wm_prog_data(bld.shader->stage_prog_data);
/* Coarse pixel shading size fields overlap with other fields if not in
* coarse pixel dispatch mode, so report 0 when that's not the case.
*/
if (wm_prog_data->coarse_pixel_dispatch == ELK_NEVER)
return elk_imm_ud(0);
const fs_builder abld = bld.annotate("compute fragment shading rate");
/* The shading rates provided in the shader are the actual 2D shading
* rate while the SPIR-V built-in is the enum value that has the shading
* rate encoded as a bitfield. Fortunately, the bitfield value is just
* the shading rate divided by two and shifted.
*/
/* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
elk_fs_reg actual_x = elk_fs_reg(retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UB));
/* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
elk_fs_reg actual_y = byte_offset(actual_x, 1);
elk_fs_reg int_rate_x = bld.vgrf(ELK_REGISTER_TYPE_UD);
elk_fs_reg int_rate_y = bld.vgrf(ELK_REGISTER_TYPE_UD);
abld.SHR(int_rate_y, actual_y, elk_imm_ud(1));
abld.SHR(int_rate_x, actual_x, elk_imm_ud(1));
abld.SHL(int_rate_x, int_rate_x, elk_imm_ud(2));
elk_fs_reg rate = abld.vgrf(ELK_REGISTER_TYPE_UD);
abld.OR(rate, int_rate_x, int_rate_y);
if (wm_prog_data->coarse_pixel_dispatch == ELK_ALWAYS)
return rate;
check_dynamic_msaa_flag(abld, wm_prog_data,
INTEL_MSAA_FLAG_COARSE_RT_WRITES);
set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(rate, rate, elk_imm_ud(0)));
return rate;
}
static void
fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
nir_intrinsic_instr *instr)
@@ -4103,8 +3845,7 @@ fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
/* Only jump when the whole quad is demoted. For historical
* reasons this is also used for discard.
*/
jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
ELK_PREDICATE_ALIGN1_ANY4H);
jump->predicate = ELK_PREDICATE_ALIGN1_ANY4H;
}
if (devinfo->ver < 7)
@@ -4135,11 +3876,7 @@ fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
}
} else {
/* Gfx20+ packs the plane parameters of a single logical
* input in a vec3 format instead of the previously used vec4
* format.
*/
const unsigned k = devinfo->ver >= 20 ? 0 : 3;
const unsigned k = 3;
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(s.interp_reg(bld, base, comp + i, k), dest.type));
@@ -4155,19 +3892,9 @@ fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
const unsigned comp = nir_intrinsic_component(instr);
dest.type = ELK_REGISTER_TYPE_F;
/* Gfx20+ packs the plane parameters of a single logical
* input in a vec3 format instead of the previously used vec4
* format.
*/
if (devinfo->ver >= 20) {
bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0));
bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2));
bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1));
} else {
bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
}
bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
break;
}
@@ -5194,59 +4921,30 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
const fs_builder ubld = bld.group(8, 0);
if (devinfo->ver >= 11) {
if (tgm_fence || ugm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
true /* commit_enable HSD ES # 1404612949 */,
0 /* BTI = 0 means data cache */);
}
/* Prior to Icelake, they're all lumped into a single cache except on
* Ivy Bridge and Bay Trail where typed messages actually go through
* the render cache. There, we need both fences because we may
* access storage images as either typed or untyped.
*/
const bool render_fence = tgm_fence && devinfo->verx10 == 70;
if (slm_fence) {
assert(opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
true /* commit_enable HSD ES # 1404612949 */,
GFX7_BTI_SLM);
}
} else {
/* Prior to Icelake, they're all lumped into a single cache except on
* Ivy Bridge and Bay Trail where typed messages actually go through
* the render cache. There, we need both fences because we may
* access storage images as either typed or untyped.
*/
const bool render_fence = tgm_fence && devinfo->verx10 == 70;
const bool commit_enable = render_fence ||
instr->intrinsic == nir_intrinsic_end_invocation_interlock;
/* Simulation also complains on Gfx9 if we do not enable commit.
*/
const bool commit_enable = render_fence ||
instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
devinfo->ver == 9;
if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
commit_enable, 0 /* BTI */);
}
if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
commit_enable, 0 /* BTI */);
}
if (render_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
commit_enable, /* bti */ 0);
}
if (render_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
commit_enable, /* bti */ 0);
}
assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
/* Be conservative in Gen11+ and always stall in a fence. Since
* there are two different fences, and shader might want to
* synchronize between them.
*
* TODO: Use scope and visibility information for the barriers from NIR
* to make a better decision on whether we need to stall.
*/
bool force_stall = devinfo->ver >= 11;
/* There are four cases where we want to insert a stall:
*
* 1. If we're a nir_intrinsic_end_invocation_interlock. This is
@@ -5261,13 +4959,9 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
* 3. If we have no fences. In this case, we need at least a
* scheduling barrier to keep the compiler from moving things
* around in an invalid way.
*
* 4. On Gen11+ and platforms with LSC, we have multiple fence types,
* without further information about the fence, we need to force a
* stall.
*/
if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
fence_regs_count != 1) {
ubld.exec_all().group(1, 0).emit(
ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
fence_regs, fence_regs_count);
@@ -5965,24 +5659,12 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
if (devinfo->verx10 >= 125) {
assert(bit_size == 32 &&
nir_intrinsic_align(instr) >= 4);
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(1);
bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}
bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
@@ -6032,25 +5714,14 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
if (devinfo->verx10 >= 125) {
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(1);
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
}
bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
@@ -6107,11 +5778,10 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
const fs_builder ubld = ubld1;
elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
ubld.MOV(res1, elk_imm_d(0));
set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ANY8H :
set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ANY8H :
s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
ELK_PREDICATE_ALIGN1_ANY32H,
ubld.MOV(res1, elk_imm_d(-1)));
@@ -6141,11 +5811,10 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
const fs_builder ubld = ubld1;
elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
ubld.MOV(res1, elk_imm_d(0));
set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
ELK_PREDICATE_ALIGN1_ALL32H,
ubld.MOV(res1, elk_imm_d(-1)));
@@ -6184,11 +5853,10 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
const fs_builder ubld = ubld1;
elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
ubld.MOV(res1, elk_imm_d(0));
set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
ELK_PREDICATE_ALIGN1_ALL32H,
ubld.MOV(res1, elk_imm_d(-1)));
@@ -6605,136 +6273,6 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
break;
}
case nir_intrinsic_load_topology_id_intel: {
/* These move around basically every hardware generation, so don't
* do any unbounded checks and fail if the platform hasn't explicitly
* been enabled here.
*/
assert(devinfo->ver >= 12 && devinfo->ver <= 20);
/* Here is what the layout of SR0 looks like on Gfx12
* https://gfxspecs.intel.com/Predator/Home/Index/47256
* [13:11] : Slice ID.
* [10:9] : Dual-SubSlice ID
* [8] : SubSlice ID
* [7] : EUID[2] (aka EU Row ID)
* [6] : Reserved
* [5:4] : EUID[1:0]
* [2:0] : Thread ID
*
* Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
* Register Regions, ARF Registers, State Register,
* https://gfxspecs.intel.com/Predator/Home/Index/56623
* [15:11] : Slice ID.
* [9:8] : SubSlice ID
* [6:4] : EUID
* [2:0] : Thread ID
*/
elk_fs_reg raw_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.emit(ELK_SHADER_OPCODE_READ_SR_REG, raw_id, elk_imm_ud(0));
switch (nir_intrinsic_base(instr)) {
case ELK_TOPOLOGY_ID_DSS:
if (devinfo->ver >= 20) {
/* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
* https://gfxspecs.intel.com/Predator/Home/Index/56936
*
* Note: DSSID in all formulas below is a logical identifier of an
* XeCore (a value that goes from 0 to (number_of_slices *
* number_of_XeCores_per_slice -1). SW can get this value from
* either:
*
* - Message Control Register LogicalSSID field (only in shaders
* eligible for Mid-Thread Preemption).
* - Calculated based of State Register with the following formula:
* DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
* StateRRegister.SubSliceID where GT_SS_PER_SLICE is an
* architectural parameter defined per product SKU.
*
* We are using the state register to calculate the DSSID.
*/
elk_fs_reg slice_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
elk_fs_reg subslice_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.AND(slice_id, raw_id, elk_imm_ud(INTEL_MASK(15, 11)));
bld.SHR(slice_id, slice_id, elk_imm_ud(11));
/* Assert that max subslices covers at least 2 bits that we use for
* subslices.
*/
assert(devinfo->max_subslices_per_slice >= (1 << 2));
bld.MUL(slice_id, slice_id,
elk_imm_ud(devinfo->max_subslices_per_slice));
bld.AND(subslice_id, raw_id, elk_imm_ud(INTEL_MASK(9, 8)));
bld.SHR(subslice_id, subslice_id, elk_imm_ud(8));
bld.ADD(retype(dest, ELK_REGISTER_TYPE_UD), slice_id,
subslice_id);
} else {
bld.AND(raw_id, raw_id, elk_imm_ud(0x3fff));
/* Get rid of anything below dualsubslice */
bld.SHR(retype(dest, ELK_REGISTER_TYPE_UD), raw_id, elk_imm_ud(9));
}
break;
case ELK_TOPOLOGY_ID_EU_THREAD_SIMD: {
s.limit_dispatch_width(16, "Topology helper for Ray queries, "
"not supported in SIMD32 mode.");
elk_fs_reg dst = retype(dest, ELK_REGISTER_TYPE_UD);
if (devinfo->ver >= 20) {
/* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
* Ray Tracing,
* https://gfxspecs.intel.com/Predator/Home/Index/56936
*
* SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) |
* SIMDLaneID[3:0];
*
* This section just deals with the EUID part.
*
* The 3bit EU[2:0] we need to build for ray query memory addresses
* computations is a bit odd :
*
* EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
*/
bld.AND(dst, raw_id, elk_imm_ud(INTEL_MASK(6, 4)));
bld.SHL(dst, dst, elk_imm_ud(4));
} else {
/* EU[3:0] << 7
*
* The 4bit EU[3:0] we need to build for ray query memory addresses
* computations is a bit odd :
*
* EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
* EU[2] = raw_id[8] (identified as SubSlice ID)
* EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
*/
elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(7, 7)));
bld.SHL(dst, tmp, elk_imm_ud(3));
bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(8, 8)));
bld.SHL(tmp, tmp, elk_imm_ud(1));
bld.OR(dst, dst, tmp);
bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(5, 4)));
bld.SHL(tmp, tmp, elk_imm_ud(3));
bld.OR(dst, dst, tmp);
}
/* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */
{
bld.AND(raw_id, raw_id, elk_imm_ud(INTEL_MASK(2, 0)));
bld.SHL(raw_id, raw_id, elk_imm_ud(4));
bld.OR(dst, dst, raw_id);
}
/* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
bld.ADD(dst, dst,
ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
break;
}
default:
unreachable("Invalid topology id type");
}
break;
}
default:
#ifndef NDEBUG
assert(instr->intrinsic < nir_num_intrinsics);
@@ -6995,11 +6533,6 @@ fs_nir_emit_texture(nir_to_elk_state &ntb,
if (elk_texture_offset(instr, i, &offset_bits)) {
header_bits |= offset_bits;
} else {
/* On gfx12.5+, if the offsets are not both constant and in the
* {-8,7} range, nir_lower_tex() will have already lowered the
* source offset. So we should never reach this point.
*/
assert(devinfo->verx10 < 125);
srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
retype(src, ELK_REGISTER_TYPE_D);
}
@@ -7121,17 +6654,7 @@ fs_nir_emit_texture(nir_to_elk_state &ntb,
opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
break;
case nir_texop_txf_ms:
/* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
* Functions - 3D Sampler - Messages - Message Format:
*
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
*/
if (devinfo->verx10 >= 125)
opcode = ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
else if (devinfo->ver >= 9)
opcode = ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL;
else
opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
break;
case nir_texop_txf_ms_mcs_intel:
opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
@@ -7160,11 +6683,6 @@ fs_nir_emit_texture(nir_to_elk_state &ntb,
*/
if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
bld.MOV(dst, elk_imm_ud(0u));
} else if (devinfo->ver >= 9) {
elk_fs_reg tmp = s.vgrf(glsl_uint_type());
bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
bld.CMP(dst, tmp, elk_imm_ud(0u), ELK_CONDITIONAL_EQ);
} else {
bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
ELK_CONDITIONAL_EQ);
@@ -7192,22 +6710,8 @@ fs_nir_emit_texture(nir_to_elk_state &ntb,
inst->offset = header_bits;
const unsigned dest_size = nir_tex_instr_dest_size(instr);
if (devinfo->ver >= 9 &&
instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
unsigned write_mask = nir_def_components_read(&instr->def);
assert(write_mask != 0); /* dead code should have been eliminated */
if (instr->is_sparse) {
inst->size_written = (util_last_bit(write_mask) - 1) *
inst->dst.component_size(inst->exec_size) +
(reg_unit(devinfo) * REG_SIZE);
} else {
inst->size_written = util_last_bit(write_mask) *
inst->dst.component_size(inst->exec_size);
}
} else {
inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
}
inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
inst->shadow_compare = true;
@@ -7234,21 +6738,17 @@ fs_nir_emit_texture(nir_to_elk_state &ntb,
if (instr->op == nir_texop_query_levels) {
/* # levels is in .w */
if (devinfo->ver <= 9) {
/**
* Wa_1940217:
*
* When a surface of type SURFTYPE_NULL is accessed by resinfo, the
* MIPCount returned is undefined instead of 0.
*/
elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
mov->conditional_mod = ELK_CONDITIONAL_NZ;
nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
sel->predicate = ELK_PREDICATE_NORMAL;
} else {
nir_dest[0] = offset(dst, bld, 3);
}
/**
* Wa_1940217:
*
* When a surface of type SURFTYPE_NULL is accessed by resinfo, the
* MIPCount returned is undefined instead of 0.
*/
elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
mov->conditional_mod = ELK_CONDITIONAL_NZ;
nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
sel->predicate = ELK_PREDICATE_NORMAL;
} else if (instr->op == nir_texop_txs &&
dest_size >= 3 && devinfo->ver < 7) {
/* Gfx4-6 return 0 instead of 1 for single layer surfaces. */