gallium: remove TGSI opcode XPD

use MUL+MAD+MOV instead. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2017-08-19 22:23:08 +02:00
parent 3e2ff8fade
commit 985e6b5ef9
24 changed files with 41 additions and 539 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -858,61 +858,6 @@ static void fmin_emit(
                                   emit_data->args[1], emit_data->args[0], "");
 }

-/* TGSI_OPCODE_XPD */
-
-static void
-xpd_fetch_args(
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   dp_fetch_args(bld_base, emit_data, 3);
-}
-
-/**
- * (a * b) - (c * d)
- */
-static LLVMValueRef
-xpd_helper(
-  struct lp_build_tgsi_context * bld_base,
-  LLVMValueRef a,
-  LLVMValueRef b,
-  LLVMValueRef c,
-  LLVMValueRef d)
-{
-   LLVMValueRef tmp0, tmp1;
-
-   tmp0 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, a, b);
-   tmp1 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, c, d);
-
-   return lp_build_sub(&bld_base->base, tmp0, tmp1);
-}
-
-static void
-xpd_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   emit_data->output[TGSI_CHAN_X] = xpd_helper(bld_base,
-              emit_data->args[1] /* src0.y */, emit_data->args[5] /* src1.z */,
-              emit_data->args[4] /* src1.y */, emit_data->args[2] /* src0.z */);
-
-   emit_data->output[TGSI_CHAN_Y] = xpd_helper(bld_base,
-              emit_data->args[2] /* src0.z */, emit_data->args[3] /* src1.x */,
-              emit_data->args[5] /* src1.z */, emit_data->args[0] /* src0.x */);
-
-   emit_data->output[TGSI_CHAN_Z] = xpd_helper(bld_base,
-              emit_data->args[0] /* src0.x */, emit_data->args[4] /* src1.y */,
-              emit_data->args[3] /* src1.x */, emit_data->args[1] /* src0.y */);
-
-   emit_data->output[TGSI_CHAN_W] = bld_base->base.one;
-}
-
-const struct lp_build_tgsi_action xpd_action = {
-   xpd_fetch_args,	 /* fetch_args */
-   xpd_emit	 /* emit */
-};
-
 /* TGSI_OPCODE_D2F */
 static void
 d2f_emit(
@@ -1252,7 +1197,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
   bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
   bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
-   bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;

   bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_SWITCH].fetch_args = scalar_unary_fetch_args;
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -590,9 +590,6 @@ lp_emit_instruction_aos(
      dst0 = lp_build_pow(&bld->bld_base.base, src0, src1);
      break;

-   case TGSI_OPCODE_XPD:
-      return FALSE;
-
   case TGSI_OPCODE_COS:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -985,21 +985,6 @@ ttn_sgt(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
   ttn_move_dest(b, dest, nir_slt(b, src[1], src[0]));
 }

-static void
-ttn_xpd(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest_masked(b, dest,
-                        nir_fsub(b,
-                                 nir_fmul(b,
-                                          ttn_swizzle(b, src[0], Y, Z, X, X),
-                                          ttn_swizzle(b, src[1], Z, X, Y, X)),
-                                 nir_fmul(b,
-                                          ttn_swizzle(b, src[1], Y, Z, X, X),
-                                          ttn_swizzle(b, src[0], Z, X, Y, X))),
-                        TGSI_WRITEMASK_XYZ);
-   ttn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), TGSI_WRITEMASK_W);
-}
-
 static void
 ttn_dp2(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
@@ -1526,7 +1511,6 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
   [TGSI_OPCODE_EX2] = nir_op_fexp2,
   [TGSI_OPCODE_LG2] = nir_op_flog2,
   [TGSI_OPCODE_POW] = nir_op_fpow,
-   [TGSI_OPCODE_XPD] = 0,
   [TGSI_OPCODE_COS] = nir_op_fcos,
   [TGSI_OPCODE_DDX] = nir_op_fddx,
   [TGSI_OPCODE_DDY] = nir_op_fddy,
@@ -1739,10 +1723,6 @@ ttn_emit_instruction(struct ttn_compile *c)
      ttn_lit(b, op_trans[tgsi_op], dest, src);
      break;

-   case TGSI_OPCODE_XPD:
-      ttn_xpd(b, op_trans[tgsi_op], dest, src);
-      break;
-
   case TGSI_OPCODE_DP2:
      ttn_dp2(b, op_trans[tgsi_op], dest, src);
      break;
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3312,51 +3312,6 @@ exec_scs(struct tgsi_exec_machine *mach,
   }
 }

-static void
-exec_xpd(struct tgsi_exec_machine *mach,
-         const struct tgsi_full_instruction *inst)
-{
-   union tgsi_exec_channel r[6];
-   union tgsi_exec_channel d[3];
-
-   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
-   fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
-
-   micro_mul(&r[2], &r[0], &r[1]);
-
-   fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
-   fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
-
-   micro_mul(&r[5], &r[3], &r[4] );
-   micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
-
-   fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
-
-   micro_mul(&r[3], &r[3], &r[2]);
-
-   fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
-
-   micro_mul(&r[1], &r[1], &r[5]);
-   micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
-
-   micro_mul(&r[5], &r[5], &r[4]);
-   micro_mul(&r[0], &r[0], &r[2]);
-   micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
-
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
-      store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
-   }
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
-      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
-   }
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
-      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
-   }
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
-      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
-   }
-}
-
 static void
 exec_dst(struct tgsi_exec_machine *mach,
         const struct tgsi_full_instruction *inst)
@@ -5153,10 +5108,6 @@ exec_instruction(
      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
      break;

-   case TGSI_OPCODE_XPD:
-      exec_xpd(mach, inst);
-      break;
-
   case TGSI_OPCODE_COS:
      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
      break;
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -68,7 +68,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
   { 1, 1, 0, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
   { 1, 1, 0, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
   { 1, 2, 0, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
-   { 1, 2, 0, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "", 31 }, /* removed */
   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2I64", TGSI_OPCODE_U2I64 },
   { 1, 0, 0, 0, 0, 0, 0, OTHR, "CLOCK", TGSI_OPCODE_CLOCK },
   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2I64", TGSI_OPCODE_I2I64 },
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
@@ -258,65 +258,6 @@ transform_dst(struct tgsi_transform_context *tctx,
   }
 }

-/* XPD - Cross Product
- *   dst.x = src0.y \times src1.z - src1.y \times src0.z
- *   dst.y = src0.z \times src1.x - src1.z \times src0.x
- *   dst.z = src0.x \times src1.y - src1.x \times src0.y
- *   dst.w = 1.0
- *
- * ; needs: 1 tmp, imm{1.0}
- * MUL tmpA.xyz, src1.yzx, src0.zxy
- * MAD dst.xyz, src0.yzx, src1.zxy, -tmpA.xyz
- * MOV dst.w, imm{1.0}
- */
-#define XPD_GROW (NINST(2) + NINST(3) + NINST(1) - OINST(2))
-#define XPD_TMP  1
-static void
-transform_xpd(struct tgsi_transform_context *tctx,
-              struct tgsi_full_instruction *inst)
-{
-   struct tgsi_lowering_context *ctx = tgsi_lowering_context(tctx);
-   struct tgsi_full_dst_register *dst  = &inst->Dst[0];
-   struct tgsi_full_src_register *src0 = &inst->Src[0];
-   struct tgsi_full_src_register *src1 = &inst->Src[1];
-   struct tgsi_full_instruction new_inst;
-
-   if (dst->Register.WriteMask & TGSI_WRITEMASK_XYZ) {
-      /* MUL tmpA.xyz, src1.yzx, src0.zxy */
-      new_inst = tgsi_default_full_instruction();
-      new_inst.Instruction.Opcode = TGSI_OPCODE_MUL;
-      new_inst.Instruction.NumDstRegs = 1;
-      reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZ);
-      new_inst.Instruction.NumSrcRegs = 2;
-      reg_src(&new_inst.Src[0], src1, SWIZ(Y, Z, X, _));
-      reg_src(&new_inst.Src[1], src0, SWIZ(Z, X, Y, _));
-      tctx->emit_instruction(tctx, &new_inst);
-
-      /* MAD dst.xyz, src0.yzx, src1.zxy, -tmpA.xyz */
-      new_inst = tgsi_default_full_instruction();
-      new_inst.Instruction.Opcode = TGSI_OPCODE_MAD;
-      new_inst.Instruction.NumDstRegs = 1;
-      reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_XYZ);
-      new_inst.Instruction.NumSrcRegs = 3;
-      reg_src(&new_inst.Src[0], src0, SWIZ(Y, Z, X, _));
-      reg_src(&new_inst.Src[1], src1, SWIZ(Z, X, Y, _));
-      reg_src(&new_inst.Src[2], &ctx->tmp[A].src, SWIZ(X, Y, Z, _));
-      new_inst.Src[2].Register.Negate = true;
-      tctx->emit_instruction(tctx, &new_inst);
-   }
-
-   if (dst->Register.WriteMask & TGSI_WRITEMASK_W) {
-      /* MOV dst.w, imm{1.0} */
-      new_inst = tgsi_default_full_instruction();
-      new_inst.Instruction.Opcode = TGSI_OPCODE_MOV;
-      new_inst.Instruction.NumDstRegs = 1;
-      reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_W);
-      new_inst.Instruction.NumSrcRegs = 1;
-      reg_src(&new_inst.Src[0], &ctx->imm, SWIZ(_, _, _, Y));
-      tctx->emit_instruction(tctx, &new_inst);
-   }
-}
-
 /* SCS - Sine Cosine
 *   dst.x = \cos{src.x}
 *   dst.y = \sin{src.x}
@@ -1466,11 +1407,6 @@ transform_instr(struct tgsi_transform_context *tctx,
         goto skip;
      transform_dst(tctx, inst);
      break;
-   case TGSI_OPCODE_XPD:
-      if (!ctx->config->lower_XPD)
-         goto skip;
-      transform_xpd(tctx, inst);
-      break;
   case TGSI_OPCODE_SCS:
      if (!ctx->config->lower_SCS)
         goto skip;
@@ -1599,7 +1535,6 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config,
 #define OPCS(x) ((config->lower_ ## x) ? info->opcode_count[TGSI_OPCODE_ ## x] : 0)
   /* if there are no instructions to lower, then we are done: */
   if (!(OPCS(DST) ||
-         OPCS(XPD) ||
         OPCS(SCS) ||
         OPCS(LRP) ||
         OPCS(FRC) ||
@@ -1629,10 +1564,6 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config,
      newlen += DST_GROW * OPCS(DST);
      numtmp = MAX2(numtmp, DST_TMP);
   }
-   if (OPCS(XPD)) {
-      newlen += XPD_GROW * OPCS(XPD);
-      numtmp = MAX2(numtmp, XPD_TMP);
-   }
   if (OPCS(SCS)) {
      newlen += SCS_GROW * OPCS(SCS);
      numtmp = MAX2(numtmp, SCS_TMP);
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.h
@@ -55,7 +55,6 @@ struct tgsi_lowering_config
    * enable lowering of TGSI_OPCODE_<opc>
    */
   unsigned lower_DST:1;
-   unsigned lower_XPD:1;
   unsigned lower_SCS:1;
   unsigned lower_LRP:1;
   unsigned lower_FRC:1;
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -63,7 +63,6 @@ OP11(ROUND)
 OP11(EX2)
 OP11(LG2)
 OP12(POW)
-OP12(XPD)
 OP11(COS)
 OP11(DDX)
 OP11(DDY)
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -350,18 +350,6 @@ This instruction replicates its result.

  dst = src0.x^{src1.x}

-.. opcode:: XPD - Cross Product
-
-.. math::
-
-  dst.x = src0.y \times src1.z - src1.y \times src0.z
-
-  dst.y = src0.z \times src1.x - src1.z \times src0.x
-
-  dst.z = src0.x \times src1.y - src1.x \times src0.y
-
-  dst.w = 1
-

 .. opcode:: COS - Cosine

@@ -3663,7 +3651,7 @@ of the operands are equal to 0. That means that 0 * Inf = 0. This
 should be set the same way for an entire pipeline. Note that this
 applies not only to the literal MUL TGSI opcode, but all FP32
 multiplications implied by other operations, such as MAD, FMA, DP2,
-DP3, DP4, DST, LOG, LRP, XPD, and possibly others. If there is a
+DP3, DP4, DST, LOG, LRP, and possibly others. If there is a
 mismatch between shaders, then it is unspecified whether this behavior
 will be enabled.

--- a/src/gallium/drivers/etnaviv/etnaviv_compiler.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
@@ -2317,7 +2317,6 @@ etna_compile_shader(struct etna_shader_variant *v)
      .lower_LOG = true,
      .lower_DP2 = true,
      .lower_TRUNC = true,
-      .lower_XPD = true
   };

   c = CALLOC_STRUCT(etna_compile);
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -118,7 +118,6 @@ static const struct {
   [ TGSI_OPCODE_TRUNC   ] = { false,  false,                  0,  1,  1 },
   [ TGSI_OPCODE_TXB     ] = {  true,  false,                  0,  1,  2 },
   [ TGSI_OPCODE_TXP     ] = {  true,  false,                  0,  1,  2 },
-   [ TGSI_OPCODE_XPD     ] = { false,  false,                  0,  1,  2 },
 };

 static boolean op_has_dst(unsigned opcode)
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -1027,32 +1027,6 @@ i915_translate_instruction(struct i915_fp_compile *p,
      emit_tex(p, inst, T0_TEXLDP, fs);
      break;

-   case TGSI_OPCODE_XPD:
-      /* Cross product:
-       *      result.x = src0.y * src1.z - src0.z * src1.y;
-       *      result.y = src0.z * src1.x - src0.x * src1.z;
-       *      result.z = src0.x * src1.y - src0.y * src1.x;
-       *      result.w = undef;
-       */
-      src0 = src_vector(p, &inst->Src[0], fs);
-      src1 = src_vector(p, &inst->Src[1], fs);
-      tmp = i915_get_utemp(p);
-
-      i915_emit_arith(p,
-                      A0_MUL,
-                      tmp, A0_DEST_CHANNEL_ALL, 0,
-                      swizzle(src0, Z, X, Y, ONE),
-                      swizzle(src1, Y, Z, X, ONE), 0);
-
-      i915_emit_arith(p,
-                      A0_MAD,
-                      get_result_vector(p, &inst->Dst[0]),
-                      get_result_flags(inst), 0,
-                      swizzle(src0, Y, Z, X, ONE),
-                      swizzle(src1, Z, X, Y, ONE),
-                      negate(tmp, 1, 1, 1, 0));
-      break;
-
   default:
      i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode);
      p->error = 1;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -346,14 +346,6 @@ unsigned int Instruction::srcMask(unsigned int s) const
      return mask;
   case TGSI_OPCODE_TXQ:
      return 1;
-   case TGSI_OPCODE_XPD:
-   {
-      unsigned int x = 0;
-      if (mask & 1) x |= 0x6;
-      if (mask & 2) x |= 0x5;
-      if (mask & 4) x |= 0x3;
-      return x;
-   }
   case TGSI_OPCODE_D2I:
   case TGSI_OPCODE_D2U:
   case TGSI_OPCODE_D2F:
@@ -3347,25 +3339,6 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
   case TGSI_OPCODE_LIT:
      handleLIT(dst0);
      break;
-   case TGSI_OPCODE_XPD:
-      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
-         if (c < 3) {
-            val0 = getSSA();
-            src0 = fetchSrc(1, (c + 1) % 3);
-            src1 = fetchSrc(0, (c + 2) % 3);
-            mkOp2(OP_MUL, TYPE_F32, val0, src0, src1)
-               ->dnz = info->io.mul_zero_wins;
-            mkOp1(OP_NEG, TYPE_F32, val0, val0);
-
-            src0 = fetchSrc(0, (c + 1) % 3);
-            src1 = fetchSrc(1, (c + 2) % 3);
-            mkOp3(OP_MAD, TYPE_F32, dst0[c], src0, src1, val0)
-               ->dnz = info->io.mul_zero_wins;
-         } else {
-            loadImm(dst0[c], 1.0f);
-         }
-      }
-      break;
   case TGSI_OPCODE_ISSG:
   case TGSI_OPCODE_SSG:
      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
--- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.h
@@ -10,7 +10,6 @@
 *   POW - EX2 + MUL + LG2
 *   SUB - ADD, second source negated
 *   SWZ - MOV
- *   XPD -
 *
 * Register access
 *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -774,11 +774,6 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
        case TGSI_OPCODE_TXP:
                nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
                break;
-   case TGSI_OPCODE_XPD:
-      tmp = nvfx_src(temp(fpc));
-      nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
-      nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
-      break;

   case TGSI_OPCODE_IF:
      // MOVRC0 R31 (TR0.xyzw), R<src>:
--- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
@@ -164,7 +164,6 @@
 *   RSQ - LG2 + EX2
 *   POW - LG2 + MUL + EX2
 *   SCS - COS + SIN
- *   XPD
 *
 * NV40 Looping
 *   Loops appear to be fairly expensive on NV40 at least, the proprietary
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -683,11 +683,6 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
      insn.cc_test = NVFX_COND_LT;
      nvfx_vp_emit(vpc, insn);
      break;
-   case TGSI_OPCODE_XPD:
-      tmp = nvfx_src(temp(vpc));
-      nvfx_vp_emit(vpc, arith(0, VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
-      nvfx_vp_emit(vpc, arith(sat, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
-      break;
   case TGSI_OPCODE_IF:
      insn = arith(0, VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
      insn.cc_update = 1;
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -57,7 +57,6 @@ static unsigned translate_opcode(unsigned opcode)
        case TGSI_OPCODE_EX2: return RC_OPCODE_EX2;
        case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
        case TGSI_OPCODE_POW: return RC_OPCODE_POW;
-        case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
        case TGSI_OPCODE_COS: return RC_OPCODE_COS;
        case TGSI_OPCODE_DDX: return RC_OPCODE_DDX;
        case TGSI_OPCODE_DDY: return RC_OPCODE_DDY;
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -7844,78 +7844,6 @@ static int tgsi_ucmp(struct r600_shader_ctx *ctx)
 	return 0;
 }

-static int tgsi_xpd(struct r600_shader_ctx *ctx)
-{
-	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	static const unsigned int src0_swizzle[] = {2, 0, 1};
-	static const unsigned int src1_swizzle[] = {1, 2, 0};
-	struct r600_bytecode_alu alu;
-	uint32_t use_temp = 0;
-	int i, r;
-
-	if (inst->Dst[0].Register.WriteMask != 0xf)
-		use_temp = 1;
-
-	for (i = 0; i < 4; i++) {
-		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-		alu.op = ALU_OP2_MUL;
-		if (i < 3) {
-			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
-			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
-		} else {
-			alu.src[0].sel = V_SQ_ALU_SRC_0;
-			alu.src[0].chan = i;
-			alu.src[1].sel = V_SQ_ALU_SRC_0;
-			alu.src[1].chan = i;
-		}
-
-		alu.dst.sel = ctx->temp_reg;
-		alu.dst.chan = i;
-		alu.dst.write = 1;
-
-		if (i == 3)
-			alu.last = 1;
-		r = r600_bytecode_add_alu(ctx->bc, &alu);
-		if (r)
-			return r;
-	}
-
-	for (i = 0; i < 4; i++) {
-		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-		alu.op = ALU_OP3_MULADD;
-
-		if (i < 3) {
-			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
-			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
-		} else {
-			alu.src[0].sel = V_SQ_ALU_SRC_0;
-			alu.src[0].chan = i;
-			alu.src[1].sel = V_SQ_ALU_SRC_0;
-			alu.src[1].chan = i;
-		}
-
-		alu.src[2].sel = ctx->temp_reg;
-		alu.src[2].neg = 1;
-		alu.src[2].chan = i;
-
-		if (use_temp)
-			alu.dst.sel = ctx->temp_reg;
-		else
-			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
-		alu.dst.chan = i;
-		alu.dst.write = 1;
-		alu.is_op3 = 1;
-		if (i == 3)
-			alu.last = 1;
-		r = r600_bytecode_add_alu(ctx->bc, &alu);
-		if (r)
-			return r;
-	}
-	if (use_temp)
-		return tgsi_helper_copy(ctx, inst);
-	return 0;
-}
-
 static int tgsi_exp(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -9092,7 +9020,7 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[]
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
-	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
+	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
@@ -9290,7 +9218,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
-	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
+	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
@@ -9513,7 +9441,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
-	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
+	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -2195,63 +2195,6 @@ emit_pow(struct svga_shader_emitter *emit,
 }


-/**
- * Translate/emit TGSI XPD (vector cross product) instruction.
- */
-static boolean
-emit_xpd(struct svga_shader_emitter *emit,
-         const struct tgsi_full_instruction *insn)
-{
-   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
-   const struct src_register src0 = translate_src_register(
-      emit, &insn->Src[0] );
-   const struct src_register src1 = translate_src_register(
-      emit, &insn->Src[1] );
-   boolean need_dst_tmp = FALSE;
-
-   /* XPD can only output to a temporary */
-   if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP)
-      need_dst_tmp = TRUE;
-
-   /* The dst reg must not be the same as src0 or src1*/
-   if (alias_src_dst(src0, dst) ||
-       alias_src_dst(src1, dst))
-      need_dst_tmp = TRUE;
-
-   if (need_dst_tmp) {
-      SVGA3dShaderDestToken tmp = get_temp( emit );
-
-      /* Obey DX9 restrictions on mask:
-       */
-      tmp.mask = dst.mask & TGSI_WRITEMASK_XYZ;
-
-      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), tmp, src0, src1))
-         return FALSE;
-
-      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( tmp )))
-         return FALSE;
-   }
-   else {
-      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), dst, src0, src1))
-         return FALSE;
-   }
-
-   /* Need to emit 1.0 to dst.w?
-    */
-   if (dst.mask & TGSI_WRITEMASK_W) {
-      struct src_register one = get_one_immediate( emit );
-
-      if (!submit_op1(emit,
-                      inst_token( SVGA3DOP_MOV ),
-                      writemask(dst, TGSI_WRITEMASK_W),
-                      one))
-         return FALSE;
-   }
-
-   return TRUE;
-}
-
-
 /**
 * Emit a LRP (linear interpolation) instruction.
 */
@@ -2986,9 +2929,6 @@ svga_emit_instruction(struct svga_shader_emitter *emit,
   case TGSI_OPCODE_BRK:
      return emit_brk( emit, insn );

-   case TGSI_OPCODE_XPD:
-      return emit_xpd( emit, insn );
-
   case TGSI_OPCODE_KILL:
      return emit_kill( emit, insn );

@@ -3604,7 +3544,6 @@ needs_to_create_common_immediate(const struct svga_shader_emitter *emit)
       emit->info.opcode_count[TGSI_OPCODE_SEQ] >= 1 ||
       emit->info.opcode_count[TGSI_OPCODE_EXP] >= 1 ||
       emit->info.opcode_count[TGSI_OPCODE_LOG] >= 1 ||
-       emit->info.opcode_count[TGSI_OPCODE_XPD] >= 1 ||
       emit->info.opcode_count[TGSI_OPCODE_KILL] >= 1)
      return TRUE;

--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -5210,117 +5210,6 @@ emit_txp(struct svga_shader_emitter_v10 *emit,
 }


-/*
- * Emit code for TGSI_OPCODE_XPD instruction.
- */
-static boolean
-emit_xpd(struct svga_shader_emitter_v10 *emit,
-         const struct tgsi_full_instruction *inst)
-{
-   /* dst.x = src0.y * src1.z - src1.y * src0.z
-    * dst.y = src0.z * src1.x - src1.z * src0.x
-    * dst.z = src0.x * src1.y - src1.x * src0.y
-    * dst.w = 1
-    */
-   struct tgsi_full_src_register s0_xxxx =
-      scalar_src(&inst->Src[0], TGSI_SWIZZLE_X);
-   struct tgsi_full_src_register s0_yyyy =
-      scalar_src(&inst->Src[0], TGSI_SWIZZLE_Y);
-   struct tgsi_full_src_register s0_zzzz =
-      scalar_src(&inst->Src[0], TGSI_SWIZZLE_Z);
-
-   struct tgsi_full_src_register s1_xxxx =
-      scalar_src(&inst->Src[1], TGSI_SWIZZLE_X);
-   struct tgsi_full_src_register s1_yyyy =
-      scalar_src(&inst->Src[1], TGSI_SWIZZLE_Y);
-   struct tgsi_full_src_register s1_zzzz =
-      scalar_src(&inst->Src[1], TGSI_SWIZZLE_Z);
-
-   unsigned tmp1 = get_temp_index(emit);
-   struct tgsi_full_src_register tmp1_src = make_src_temp_reg(tmp1);
-   struct tgsi_full_dst_register tmp1_dst = make_dst_temp_reg(tmp1);
-
-   unsigned tmp2 = get_temp_index(emit);
-   struct tgsi_full_src_register tmp2_src = make_src_temp_reg(tmp2);
-   struct tgsi_full_dst_register tmp2_dst = make_dst_temp_reg(tmp2);
-   struct tgsi_full_src_register neg_tmp2_src = negate_src(&tmp2_src);
-
-   unsigned tmp3 = get_temp_index(emit);
-   struct tgsi_full_src_register tmp3_src = make_src_temp_reg(tmp3);
-   struct tgsi_full_dst_register tmp3_dst = make_dst_temp_reg(tmp3);
-   struct tgsi_full_dst_register tmp3_dst_x =
-      writemask_dst(&tmp3_dst, TGSI_WRITEMASK_X);
-   struct tgsi_full_dst_register tmp3_dst_y =
-      writemask_dst(&tmp3_dst, TGSI_WRITEMASK_Y);
-   struct tgsi_full_dst_register tmp3_dst_z =
-      writemask_dst(&tmp3_dst, TGSI_WRITEMASK_Z);
-   struct tgsi_full_dst_register tmp3_dst_w =
-      writemask_dst(&tmp3_dst, TGSI_WRITEMASK_W);
-
-   /* Note: we put all the intermediate computations into tmp3 in case
-    * the XPD dest register is that same as one of the src regs (in which
-    * case we could clobber a src reg before we're done with it) .
-    *
-    * Note: we could get by with just one temp register instead of three
-    * since we're doing scalar operations and there's enough room in one
-    * temp for everything.
-    */
-
-   /* MUL tmp1, src0.y, src1.z */
-   /* MUL tmp2, src1.y, src0.z */
-   /* ADD tmp3.x, tmp1, -tmp2 */
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp1_dst,
-                           &s0_yyyy, &s1_zzzz, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp2_dst,
-                           &s1_yyyy, &s0_zzzz, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp3_dst_x,
-                           &tmp1_src, &neg_tmp2_src, FALSE);
-   }
-
-   /* MUL tmp1, src0.z, src1.x */
-   /* MUL tmp2, src1.z, src0.x */
-   /* ADD tmp3.y, tmp1, -tmp2 */
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp1_dst, &s0_zzzz,
-                           &s1_xxxx, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp2_dst, &s1_zzzz,
-                           &s0_xxxx, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp3_dst_y,
-                           &tmp1_src, &neg_tmp2_src, FALSE);
-   }
-
-   /* MUL tmp1, src0.x, src1.y */
-   /* MUL tmp2, src1.x, src0.y */
-   /* ADD tmp3.z, tmp1, -tmp2 */
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp1_dst, &s0_xxxx,
-                           &s1_yyyy, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp2_dst, &s1_xxxx,
-                           &s0_yyyy, FALSE);
-      emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp3_dst_z,
-                           &tmp1_src, &neg_tmp2_src, FALSE);
-   }
-
-   /* MOV tmp3.w, 1.0 */
-   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
-      struct tgsi_full_src_register one =
-         make_immediate_reg_float(emit, 1.0f);
-
-      emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &tmp3_dst_w, &one, FALSE);
-   }
-
-   /* MOV dst, tmp3 */
-   emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &tmp3_src,
-                        inst->Instruction.Saturate);
-
-
-   free_temp_indexes(emit);
-
-   return TRUE;
-}
-
-
 /**
 * Emit code for TGSI_OPCODE_TXD (explicit derivatives)
 */
@@ -5742,8 +5631,6 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
      return emit_txq(emit, inst);
   case TGSI_OPCODE_UIF:
      return emit_if(emit, inst);
-   case TGSI_OPCODE_XPD:
-      return emit_xpd(emit, inst);
   case TGSI_OPCODE_UMUL_HI:
   case TGSI_OPCODE_IMUL_HI:
   case TGSI_OPCODE_UDIV:
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -368,7 +368,7 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_EX2                 28
 #define TGSI_OPCODE_LG2                 29
 #define TGSI_OPCODE_POW                 30
-#define TGSI_OPCODE_XPD                 31
+/* gap */
 #define TGSI_OPCODE_U2I64               32
 #define TGSI_OPCODE_CLOCK               33
 #define TGSI_OPCODE_I2I64               34
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -1588,6 +1588,29 @@ DECL_SPECIAL(ABS)
    return D3D_OK;
 }

+DECL_SPECIAL(XPD)
+{
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
+    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
+    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
+
+    ureg_MUL(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
+             ureg_swizzle(src0, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z,
+                          TGSI_SWIZZLE_X, 0),
+             ureg_swizzle(src1, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
+                          TGSI_SWIZZLE_Y, 0));
+    ureg_MAD(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
+             ureg_swizzle(src0, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
+                          TGSI_SWIZZLE_Y, 0),
+             ureg_negate(ureg_swizzle(src1, TGSI_SWIZZLE_Y,
+                                      TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X, 0)),
+             ureg_src(dst));
+    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W),
+             ureg_imm1f(ureg, 1));
+    return D3D_OK;
+}
+
 DECL_SPECIAL(M4x4)
 {
    return NineTranslateInstruction_Mkxn(tx, 4, 4);
@@ -2915,7 +2938,7 @@ struct sm1_op_info inst_table[] =
    _OPI(DCL, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(DCL)),

    _OPI(POW, POW, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(POW)),
-    _OPI(CRS, XPD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* XXX: .w */
+    _OPI(CRS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(XPD)), /* XXX: .w */
    _OPI(SGN, SSG, V(2,0), V(3,0), V(0,0), V(0,0), 1, 3, SPECIAL(SGN)), /* ignore src1,2 */
    _OPI(ABS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(ABS)),
    _OPI(NRM, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(NRM)), /* NRM doesn't fit */
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -505,8 +505,6 @@ translate_opcode( unsigned op )
      return TGSI_OPCODE_TXB;
   case OPCODE_TXP:
      return TGSI_OPCODE_TXP;
-   case OPCODE_XPD:
-      return TGSI_OPCODE_XPD;
   case OPCODE_END:
      return TGSI_OPCODE_END;
   default:
@@ -568,11 +566,17 @@ compile_instruction(
      break;

   case OPCODE_XPD:
-      dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ );
-      ureg_insn( ureg, 
-                 translate_opcode( inst->Opcode ), 
-                 dst, num_dst, 
-                 src, num_src, 0 );
+      ureg_MUL(ureg, ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ),
+               ureg_swizzle(src[0], TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z,
+                            TGSI_SWIZZLE_X, 0),
+               ureg_swizzle(src[1], TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
+                            TGSI_SWIZZLE_Y, 0));
+      ureg_MAD(ureg, ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ),
+               ureg_swizzle(src[0], TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
+                            TGSI_SWIZZLE_Y, 0),
+               ureg_negate(ureg_swizzle(src[1], TGSI_SWIZZLE_Y,
+                                        TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X, 0)),
+               ureg_src(dst[0]));
      break;

   case OPCODE_RSQ: