radeon/llvm: Lower UDIV using the Selection DAG

2012-05-24 12:17:58 -04:00
parent d088da917b
commit 33e7db9a1d
8 changed files with 126 additions and 212 deletions
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
@@ -33,6 +33,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);

+  setOperationAction(ISD::UDIV, MVT::i32, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
 }

 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
@@ -42,6 +44,10 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  default: return AMDILTargetLowering::LowerOperation(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::UDIV:
+    return DAG.getNode(ISD::UDIVREM, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(0), Op.getOperand(1)).getValue(0);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  }
 }

@@ -227,6 +233,114 @@ SDValue AMDGPUTargetLowering::LowerSELECT_CC(SDValue Op,
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
 }

+
+/// Lower UDIVREM: estimate the quotient via URECIP, then correct it by one.
+/// \param Op  UDIVREM node; its operands are (numerator, denominator).
+/// \param DAG SelectionDAG in which the replacement nodes are created.
+/// \returns MERGE_VALUES of the quotient (value 0) and remainder (value 1).
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  // RCP =  URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+  // RCP_LO = umulo(RCP, Den)
+  // NOTE(review): the overflow result is never used; confirm UMULO vs. MUL.
+  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+
+  // RCP_HI = mulhu (RCP, Den)
+  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                                     RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                           NEG_RCP_LO, RCP_LO,
+                                           ISD::SETEQ);
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+  // RCP_S_E = RCP - E
+  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                     RCP_A_E, RCP_S_E,
+                                     ISD::SETEQ);
+  // Quotient = mulhu(Tmp0, Num)
+  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+  // The operands are unsigned, so both checks below use unsigned compares;
+  // a signed compare would misread any value that has its top bit set.
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+                                                 DAG.getConstant(-1, VT),
+                                                 DAG.getConstant(0, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, Num_S_Remainder,
+                                                  DAG.getConstant(-1, VT),
+                                                  DAG.getConstant(0, VT),
+                                                  ISD::SETUGE);
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+                                               Remainder_GE_Zero);
+
+  // Calculate Division result:
+  // Quotient_A_One = Quotient + 1
+  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Quotient_S_One = Quotient - 1
+  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                     Quotient, Quotient_A_One, ISD::SETEQ);
+
+  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Quotient_S_One, Div, ISD::SETEQ);
+
+  // Calculate Rem result:
+  // Remainder_S_Den = Remainder - Den
+  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+  // Remainder_A_Den = Remainder + Den
+  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                    Remainder, Remainder_S_Den, ISD::SETEQ);
+
+  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Remainder_A_Den, Rem, ISD::SETEQ);
+
+  // Hand both results back as one node instead of rewriting uses in place.
+  SDValue Ops[2] = { Div, Rem };
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
@@ -274,5 +388,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const
  NODE_NAME_CASE(FMIN)
  NODE_NAME_CASE(SMIN)
  NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(URECIP)
  }
 }
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.h
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
@@ -24,6 +24,7 @@ class AMDGPUTargetLowering : public AMDILTargetLowering
 private:
  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;

 protected:

@@ -63,6 +64,7 @@ enum
  FMIN,
  SMIN,
  UMIN,
+  URECIP,
  LAST_AMDGPU_ISD_NUMBER
 };

--- a/src/gallium/drivers/radeon/AMDGPUInstrInfo.td
+++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.td
@@ -47,3 +47,9 @@ def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
 def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
  [SDNPCommutative, SDNPAssociative]
 >;
+
+// urecip - Helper node for integer division.  It returns an approximation
+// of 1 / a, expressed as an unsigned fixed-point fraction:
+//   out = (2^32 / a) + e
+// where e is the rounding error.
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
--- a/src/gallium/drivers/radeon/AMDILISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
@@ -650,7 +650,6 @@ AMDILTargetLowering::convertToReg(MachineOperand op) const
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    if (VT != MVT::i64 && VT != MVT::v2i64) {
      setOperationAction(ISD::SDIV, VT, Custom);
-      setOperationAction(ISD::UDIV, VT, Custom);
    }
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -730,7 +729,6 @@ AMDILTargetLowering::convertToReg(MachineOperand op) const
    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
-    setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Expand);
@@ -1505,7 +1503,6 @@ AMDILTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
      LOWER(FDIV);
      LOWER(SDIV);
      LOWER(SREM);
-      LOWER(UDIV);
      LOWER(UREM);
      LOWER(BUILD_VECTOR);
      LOWER(INSERT_VECTOR_ELT);
@@ -2811,24 +2808,6 @@ AMDILTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const
  return DST;
 }

-SDValue
-AMDILTargetLowering::LowerUDIV(SDValue Op, SelectionDAG &DAG) const
-{
-  EVT OVT = Op.getValueType();
-  SDValue DST;
-  if (OVT.getScalarType() == MVT::i64) {
-    DST = LowerUDIV64(Op, DAG);
-  } else if (OVT.getScalarType() == MVT::i32) {
-    DST = LowerUDIV32(Op, DAG);
-  } else if (OVT.getScalarType() == MVT::i16
-      || OVT.getScalarType() == MVT::i8) {
-    DST = LowerUDIV24(Op, DAG);
-  } else {
-    DST = SDValue(Op.getNode(), 0);
-  }
-  return DST;
-}
-
 SDValue
 AMDILTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const
 {
@@ -3960,17 +3939,6 @@ AMDILTargetLowering::LowerUDIV24(SDValue Op, SelectionDAG &DAG) const

 }

-SDValue
-AMDILTargetLowering::LowerUDIV32(SDValue Op, SelectionDAG &DAG) const
-{
-  return SDValue(Op.getNode(), 0);
-}
-
-SDValue
-AMDILTargetLowering::LowerUDIV64(SDValue Op, SelectionDAG &DAG) const
-{
-  return SDValue(Op.getNode(), 0);
-}
 SDValue
 AMDILTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const
 {
--- a/src/gallium/drivers/radeon/R600InstrInfo.cpp
+++ b/src/gallium/drivers/radeon/R600InstrInfo.cpp
@@ -100,38 +100,3 @@ unsigned R600InstrInfo::getLSHRop() const
    return AMDIL::LSHR_eg;
  }
 }
-
-unsigned R600InstrInfo::getMULHI_UINT() const
-{
-  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
-
-  if (gen < AMDILDeviceInfo::HD5XXX) {
-    return AMDIL::MULHI_UINT_r600;
-  } else {
-    return AMDIL::MULHI_UINT_eg;
-  }
-}
-
-unsigned R600InstrInfo::getMULLO_UINT() const
-{
-  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
-
-  if (gen < AMDILDeviceInfo::HD5XXX) {
-    return AMDIL::MULLO_UINT_r600;
-  } else {
-    return AMDIL::MULLO_UINT_eg;
-  }
-}
-
-unsigned R600InstrInfo::getRECIP_UINT() const
-{
-  const AMDILDevice * dev = TM.getSubtarget<AMDILSubtarget>().device();
-
-  if (dev->getGeneration() < AMDILDeviceInfo::HD5XXX) {
-    return AMDIL::RECIP_UINT_r600;
-  } else if (dev->getDeviceFlag() != OCL_DEVICE_CAYMAN) {
-    return AMDIL::RECIP_UINT_eg;
-  } else {
-    return AMDIL::RECIP_UINT_cm;
-  }
-}
--- a/src/gallium/drivers/radeon/R600InstrInfo.h
+++ b/src/gallium/drivers/radeon/R600InstrInfo.h
@@ -46,9 +46,6 @@ namespace llvm {

  unsigned getLSHRop() const;
  unsigned getASHRop() const;
-  unsigned getMULHI_UINT() const;
-  unsigned getMULLO_UINT() const;
-  unsigned getRECIP_UINT() const;

  };

--- a/src/gallium/drivers/radeon/R600Instructions.td
+++ b/src/gallium/drivers/radeon/R600Instructions.td
@@ -649,8 +649,8 @@ class MULHI_INT_Common <bits<32> inst> : R600_2OP <
 >;

 class MULHI_UINT_Common <bits<32> inst> : R600_2OP <
-	inst, "MULHI $dst, $src0, $src1",
-	[]
+  inst, "MULHI $dst, $src0, $src1",
+  [(set R600_Reg32:$dst, (mulhu R600_Reg32:$src0, R600_Reg32:$src1))]
 >;

 class MULLO_INT_Common <bits<32> inst> : R600_2OP <
@@ -675,7 +675,7 @@ class RECIP_IEEE_Common <bits<32> inst> : R600_1OP <

 class RECIP_UINT_Common <bits<32> inst> : R600_1OP <
  inst, "RECIP_INT $dst, $src",
-  []
+  [(set R600_Reg32:$dst, (AMDGPUurecip R600_Reg32:$src))]
 >;

 class RECIPSQRT_CLAMPED_Common <bits<32> inst> : R600_1OP <
--- a/src/gallium/drivers/radeon/R600LowerInstructions.cpp
+++ b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
@@ -45,11 +45,6 @@ namespace {
                     MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator I) const;

-    void divMod(MachineInstr &MI,
-                  MachineBasicBlock &MBB,
-                  MachineBasicBlock::iterator I,
-                  bool div = true) const;
-
  public:
    R600LowerInstructionsPass(TargetMachine &tm) :
      MachineFunctionPass(ID), TM(tm),
@@ -115,10 +110,6 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
        break;
        }

-      case AMDIL::UDIV_i32:
-        divMod(MI, MBB, I);
-        break;
-
      /* XXX: Figure out the semantics of DIV_INF_f32 and make sure this is OK */
 /*      case AMDIL::DIV_INF_f32:
        {
@@ -322,133 +313,3 @@ void R600LowerInstructionsPass::calcAddress(const MachineOperand &ptrOp,
            .addOperand(ptrOp);
  }
 }
-
-/* Mostly copied from tgsi_divmod() in r600_shader.c */
-void R600LowerInstructionsPass::divMod(MachineInstr &MI,
-                                       MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator I,
-                                       bool div) const
-{
-  unsigned dst = MI.getOperand(0).getReg();
-  MachineOperand &numerator = MI.getOperand(1);
-  MachineOperand &denominator = MI.getOperand(2);
-  /* rcp = RECIP(denominator) = 2^32 / denominator + e
-   * e is rounding error */
-  unsigned rcp = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getRECIP_UINT()), rcp)
-          .addOperand(denominator);
-
-  /* rcp_lo = lo(rcp * denominator) */
-  unsigned rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()), rcp_lo)
-          .addReg(rcp)
-          .addOperand(denominator);
-
-  /* rcp_hi = HI (rcp * denominator) */
-  unsigned rcp_hi = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), rcp_hi)
-          .addReg(rcp)
-          .addOperand(denominator);
-
-  unsigned neg_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), neg_rcp_lo)
-          .addReg(AMDIL::ZERO)
-          .addReg(rcp_lo);
-
-  unsigned abs_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), abs_rcp_lo)
-          .addReg(rcp_hi)
-          .addReg(neg_rcp_lo)
-          .addReg(rcp_lo);
-
-  unsigned e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), e)
-          .addReg(abs_rcp_lo)
-          .addReg(rcp);
-
-  unsigned rcp_plus_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), rcp_plus_e)
-          .addReg(rcp)
-          .addReg(e);
-
-  unsigned rcp_sub_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), rcp_sub_e)
-          .addReg(rcp)
-          .addReg(e);
-
-  /* tmp0 = rcp_hi == 0 ? rcp_plus_e : rcp_sub_e */
-  unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), tmp0)
-          .addReg(rcp_hi)
-          .addReg(rcp_plus_e)
-          .addReg(rcp_sub_e);
-
-  unsigned q = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), q)
-          .addReg(tmp0)
-          .addOperand(numerator);
-
-  /* num_sub_r = q * denominator */
-  unsigned num_sub_r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()),
-          num_sub_r)
-          .addReg(q)
-          .addOperand(denominator);
-
-  unsigned r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), r)
-          .addOperand(numerator)
-          .addReg(num_sub_r);
-
-  unsigned r_ge_den = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_den)
-          .addReg(r)
-          .addOperand(denominator);
-
-  unsigned r_ge_zero = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_zero)
-          .addOperand(numerator)
-          .addReg(num_sub_r);
-
-  unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::AND_INT), tmp1)
-          .addReg(r_ge_den)
-          .addReg(r_ge_zero);
-
-  unsigned val0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  unsigned val1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  unsigned result = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
-  if (div) {
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val0)
-            .addReg(q)
-            .addReg(AMDIL::ONE_INT);
-
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val1)
-            .addReg(q)
-            .addReg(AMDIL::ONE_INT);
-
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
-            .addReg(tmp1)
-            .addReg(q)
-            .addReg(val0);
-  } else {
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val0)
-            .addReg(r)
-            .addOperand(denominator);
-
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val1)
-            .addReg(r)
-            .addOperand(denominator);
-
-    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
-            .addReg(tmp1)
-            .addReg(r)
-            .addReg(val0);
-  }
-
-  /* XXX: Do we need to set to MAX_INT if denominator is 0? */
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), dst)
-          .addReg(r_ge_zero)
-          .addReg(val1)
-          .addReg(result);
-}