aco,ac/llvm,ac/nir,vtn: unify cube opcodes
fossil-db (navi21):
Totals from 17068 (12.79% of 133461) affected shaders:
Instrs: 24743703 -> 24743572 (-0.00%); split: -0.00%, +0.00%
CodeSize: 132579952 -> 132580620 (+0.00%); split: -0.00%, +0.00%
VGPRs: 1227840 -> 1227984 (+0.01%)
Latency: 403180114 -> 403251188 (+0.02%); split: -0.00%, +0.02%
InvThroughput: 75311302 -> 75320892 (+0.01%); split: -0.00%, +0.01%
VClause: 415400 -> 415402 (+0.00%); split: -0.00%, +0.00%
Copies: 1715404 -> 1715258 (-0.01%); split: -0.01%, +0.01%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Gert Wollny <gert.wollny@collabora.com> (r600)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23930>
This commit is contained in:
@@ -98,12 +98,12 @@ prepare_cube_coords(nir_builder *b, nir_tex_instr *tex, nir_ssa_def **coord, nir
|
|||||||
if (tex->is_array && options->gfx_level <= GFX8 && coords[3])
|
if (tex->is_array && options->gfx_level <= GFX8 && coords[3])
|
||||||
coords[3] = nir_fmax(b, coords[3], nir_imm_float(b, 0.0));
|
coords[3] = nir_fmax(b, coords[3], nir_imm_float(b, 0.0));
|
||||||
|
|
||||||
nir_ssa_def *cube_coords = nir_cube_face_coord_amd(b, nir_vec(b, coords, 3));
|
nir_ssa_def *cube_coords = nir_cube_amd(b, nir_vec(b, coords, 3));
|
||||||
nir_ssa_def *sc = nir_channel(b, cube_coords, 0);
|
nir_ssa_def *sc = nir_channel(b, cube_coords, 1);
|
||||||
nir_ssa_def *tc = nir_channel(b, cube_coords, 1);
|
nir_ssa_def *tc = nir_channel(b, cube_coords, 0);
|
||||||
nir_ssa_def *ma = nir_channel(b, cube_coords, 2);
|
nir_ssa_def *ma = nir_channel(b, cube_coords, 2);
|
||||||
nir_ssa_def *invma = nir_frcp(b, nir_fabs(b, ma));
|
nir_ssa_def *invma = nir_frcp(b, nir_fabs(b, ma));
|
||||||
nir_ssa_def *id = nir_cube_face_index_amd(b, nir_vec(b, coords, 3));
|
nir_ssa_def *id = nir_channel(b, cube_coords, 3);
|
||||||
|
|
||||||
if (ddx || ddy) {
|
if (ddx || ddy) {
|
||||||
sc = nir_fmul(b, sc, invma);
|
sc = nir_fmul(b, sc, invma);
|
||||||
|
@@ -2498,21 +2498,15 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
|
|||||||
emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
|
emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_op_cube_face_coord_amd: {
|
case nir_op_cube_amd: {
|
||||||
Temp in = get_alu_src(ctx, instr->src[0], 3);
|
Temp in = get_alu_src(ctx, instr->src[0], 3);
|
||||||
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
|
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
|
||||||
emit_extract_vector(ctx, in, 2, v1)};
|
emit_extract_vector(ctx, in, 2, v1)};
|
||||||
Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
|
Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
|
||||||
Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
|
Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
|
||||||
Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
|
Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
|
||||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc, ma);
|
Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
|
||||||
break;
|
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
|
||||||
}
|
|
||||||
case nir_op_cube_face_index_amd: {
|
|
||||||
Temp in = get_alu_src(ctx, instr->src[0], 3);
|
|
||||||
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
|
|
||||||
emit_extract_vector(ctx, in, 2, v1)};
|
|
||||||
bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case nir_op_bcsel: {
|
case nir_op_bcsel: {
|
||||||
|
@@ -396,8 +396,7 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|||||||
case nir_op_ldexp:
|
case nir_op_ldexp:
|
||||||
case nir_op_frexp_sig:
|
case nir_op_frexp_sig:
|
||||||
case nir_op_frexp_exp:
|
case nir_op_frexp_exp:
|
||||||
case nir_op_cube_face_index_amd:
|
case nir_op_cube_amd:
|
||||||
case nir_op_cube_face_coord_amd:
|
|
||||||
case nir_op_sad_u8x4:
|
case nir_op_sad_u8x4:
|
||||||
case nir_op_udot_4x8_uadd:
|
case nir_op_udot_4x8_uadd:
|
||||||
case nir_op_sdot_4x8_iadd:
|
case nir_op_sdot_4x8_iadd:
|
||||||
|
@@ -568,8 +568,7 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
|
|||||||
case nir_op_pack_sint_2x16:
|
case nir_op_pack_sint_2x16:
|
||||||
src_components = 2;
|
src_components = 2;
|
||||||
break;
|
break;
|
||||||
case nir_op_cube_face_coord_amd:
|
case nir_op_cube_amd:
|
||||||
case nir_op_cube_face_index_amd:
|
|
||||||
src_components = 3;
|
src_components = 3;
|
||||||
break;
|
break;
|
||||||
case nir_op_pack_32_4x8:
|
case nir_op_pack_32_4x8:
|
||||||
@@ -1190,25 +1189,17 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case nir_op_cube_face_coord_amd: {
|
case nir_op_cube_amd: {
|
||||||
src[0] = ac_to_float(&ctx->ac, src[0]);
|
src[0] = ac_to_float(&ctx->ac, src[0]);
|
||||||
LLVMValueRef results[3];
|
LLVMValueRef results[4];
|
||||||
LLVMValueRef in[3];
|
LLVMValueRef in[3];
|
||||||
for (unsigned chan = 0; chan < 3; chan++)
|
for (unsigned chan = 0; chan < 3; chan++)
|
||||||
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
|
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
|
||||||
results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, 0);
|
results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, 0);
|
||||||
results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, 0);
|
results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, 0);
|
||||||
results[2] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", ctx->ac.f32, in, 3, 0);
|
results[2] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", ctx->ac.f32, in, 3, 0);
|
||||||
result = ac_build_gather_values(&ctx->ac, results, 3);
|
results[3] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", ctx->ac.f32, in, 3, 0);
|
||||||
break;
|
result = ac_build_gather_values(&ctx->ac, results, 4);
|
||||||
}
|
|
||||||
|
|
||||||
case nir_op_cube_face_index_amd: {
|
|
||||||
src[0] = ac_to_float(&ctx->ac, src[0]);
|
|
||||||
LLVMValueRef in[3];
|
|
||||||
for (unsigned chan = 0; chan < 3; chan++)
|
|
||||||
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
|
|
||||||
result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", ctx->ac.f32, in, 3, 0);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -219,8 +219,7 @@ lower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data)
|
|||||||
case nir_op_vec4:
|
case nir_op_vec4:
|
||||||
case nir_op_vec3:
|
case nir_op_vec3:
|
||||||
case nir_op_vec2:
|
case nir_op_vec2:
|
||||||
case nir_op_cube_face_coord_amd:
|
case nir_op_cube_amd:
|
||||||
case nir_op_cube_face_index_amd:
|
|
||||||
/* We don't need to scalarize these ops, they're the ones generated to
|
/* We don't need to scalarize these ops, they're the ones generated to
|
||||||
* group up outputs into a value that can be SSAed.
|
* group up outputs into a value that can be SSAed.
|
||||||
*/
|
*/
|
||||||
|
@@ -536,38 +536,6 @@ for (unsigned bit = 0; bit < bit_size; bit++) {
|
|||||||
}
|
}
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# AMD_gcn_shader extended instructions
|
|
||||||
unop_horiz("cube_face_coord_amd", 3, tfloat32, 3, tfloat32, """
|
|
||||||
dst.x = dst.y = dst.z = 0.0;
|
|
||||||
float absX = fabsf(src0.x);
|
|
||||||
float absY = fabsf(src0.y);
|
|
||||||
float absZ = fabsf(src0.z);
|
|
||||||
|
|
||||||
if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
|
|
||||||
if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
|
|
||||||
if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }
|
|
||||||
|
|
||||||
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
|
|
||||||
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
|
|
||||||
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
|
|
||||||
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
|
|
||||||
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
|
|
||||||
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
|
|
||||||
""")
|
|
||||||
|
|
||||||
unop_horiz("cube_face_index_amd", 1, tfloat32, 3, tfloat32, """
|
|
||||||
dst.x = 0.0;
|
|
||||||
float absX = fabsf(src0.x);
|
|
||||||
float absY = fabsf(src0.y);
|
|
||||||
float absZ = fabsf(src0.z);
|
|
||||||
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
|
|
||||||
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
|
|
||||||
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
|
|
||||||
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
|
|
||||||
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
|
|
||||||
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
|
|
||||||
""")
|
|
||||||
|
|
||||||
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
|
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
|
||||||
description = "Sum of vector components")
|
description = "Sum of vector components")
|
||||||
|
|
||||||
@@ -1267,11 +1235,11 @@ dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
|
|||||||
triop("imad24_ir3", tint32, _2src_commutative,
|
triop("imad24_ir3", tint32, _2src_commutative,
|
||||||
"(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
|
"(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
|
||||||
|
|
||||||
# r600-specific instruction that evaluates unnormalized cube texture coordinates
|
# r600/gcn specific instruction that evaluates unnormalized cube texture coordinates
|
||||||
# and face index
|
# and face index
|
||||||
# The actual texture coordinates are evaluated from this according to
|
# The actual texture coordinates are evaluated from this according to
|
||||||
# dst.yx / abs(dst.z) + 1.5
|
# dst.yx / abs(dst.z) + 1.5
|
||||||
unop_horiz("cube_r600", 4, tfloat32, 3, tfloat32, """
|
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
|
||||||
dst.x = dst.y = dst.z = 0.0;
|
dst.x = dst.y = dst.z = 0.0;
|
||||||
float absX = fabsf(src0.x);
|
float absX = fabsf(src0.x);
|
||||||
float absY = fabsf(src0.y);
|
float absY = fabsf(src0.y);
|
||||||
|
@@ -33,11 +33,11 @@ vtn_handle_amd_gcn_shader_instruction(struct vtn_builder *b, SpvOp ext_opcode,
|
|||||||
nir_ssa_def *def;
|
nir_ssa_def *def;
|
||||||
switch ((enum GcnShaderAMD)ext_opcode) {
|
switch ((enum GcnShaderAMD)ext_opcode) {
|
||||||
case CubeFaceIndexAMD:
|
case CubeFaceIndexAMD:
|
||||||
def = nir_cube_face_index_amd(&b->nb, vtn_get_nir_ssa(b, w[5]));
|
def = nir_channel(&b->nb, nir_cube_amd(&b->nb, vtn_get_nir_ssa(b, w[5])), 3);
|
||||||
break;
|
break;
|
||||||
case CubeFaceCoordAMD: {
|
case CubeFaceCoordAMD: {
|
||||||
def = nir_cube_face_coord_amd(&b->nb, vtn_get_nir_ssa(b, w[5]));
|
def = nir_cube_amd(&b->nb, vtn_get_nir_ssa(b, w[5]));
|
||||||
nir_ssa_def *st = nir_trim_vector(&b->nb, def, 2);
|
nir_ssa_def *st = nir_swizzle(&b->nb, def, (unsigned[]){1, 0}, 2);
|
||||||
nir_ssa_def *invma = nir_frcp(&b->nb, nir_channel(&b->nb, def, 2));
|
nir_ssa_def *invma = nir_frcp(&b->nb, nir_channel(&b->nb, def, 2));
|
||||||
def = nir_ffma_imm2(&b->nb, st, invma, 0.5);
|
def = nir_ffma_imm2(&b->nb, st, invma, 0.5);
|
||||||
break;
|
break;
|
||||||
|
@@ -1200,8 +1200,7 @@ visit_alu(struct lp_build_nir_context *bld_base,
|
|||||||
case nir_op_unpack_half_2x16:
|
case nir_op_unpack_half_2x16:
|
||||||
src_components = 1;
|
src_components = 1;
|
||||||
break;
|
break;
|
||||||
case nir_op_cube_face_coord_amd:
|
case nir_op_cube_amd:
|
||||||
case nir_op_cube_face_index_amd:
|
|
||||||
src_components = 3;
|
src_components = 3;
|
||||||
break;
|
break;
|
||||||
case nir_op_fsum2:
|
case nir_op_fsum2:
|
||||||
|
@@ -1878,7 +1878,7 @@ AluInstr::from_nir(nir_alu_instr *alu, Shader& shader)
|
|||||||
return emit_tex_fdd(*alu, TexInstr::get_gradient_v, false, shader);
|
return emit_tex_fdd(*alu, TexInstr::get_gradient_v, false, shader);
|
||||||
case nir_op_fddy_fine:
|
case nir_op_fddy_fine:
|
||||||
return emit_tex_fdd(*alu, TexInstr::get_gradient_v, true, shader);
|
return emit_tex_fdd(*alu, TexInstr::get_gradient_v, true, shader);
|
||||||
case nir_op_cube_r600:
|
case nir_op_cube_amd:
|
||||||
return emit_alu_cube(*alu, shader);
|
return emit_alu_cube(*alu, shader);
|
||||||
default:
|
default:
|
||||||
fprintf(stderr, "Unknown instruction '");
|
fprintf(stderr, "Unknown instruction '");
|
||||||
|
@@ -734,8 +734,6 @@ r600_lower_to_scalar_instr_filter(const nir_instr *instr, const void *)
|
|||||||
case nir_op_fddy_coarse:
|
case nir_op_fddy_coarse:
|
||||||
case nir_op_fddy_fine:
|
case nir_op_fddy_fine:
|
||||||
return nir_src_bit_size(alu->src[0].src) == 64;
|
return nir_src_bit_size(alu->src[0].src) == 64;
|
||||||
case nir_op_cube_r600:
|
|
||||||
return false;
|
|
||||||
default:
|
default:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@@ -265,8 +265,8 @@ r600_nir_lower_cube_to_2darray_impl(nir_builder *b, nir_instr *instr, void *_opt
|
|||||||
int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
||||||
assert(coord_idx >= 0);
|
assert(coord_idx >= 0);
|
||||||
|
|
||||||
auto cubed = nir_cube_r600(b,
|
auto cubed = nir_cube_amd(b,
|
||||||
nir_trim_vector(b, tex->src[coord_idx].src.ssa, 3));
|
nir_trim_vector(b, tex->src[coord_idx].src.ssa, 3));
|
||||||
auto xy = nir_fmad(b,
|
auto xy = nir_fmad(b,
|
||||||
nir_vec2(b, nir_channel(b, cubed, 1), nir_channel(b, cubed, 0)),
|
nir_vec2(b, nir_channel(b, cubed, 1), nir_channel(b, cubed, 0)),
|
||||||
nir_frcp(b, nir_fabs(b, nir_channel(b, cubed, 2))),
|
nir_frcp(b, nir_fabs(b, nir_channel(b, cubed, 2))),
|
||||||
|
Reference in New Issue
Block a user