1 files changed, 495 insertions, 191 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f235313e48535..f4776adb069c9 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,10 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
 #include "SIDefines.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
@@ -68,19 +70,30 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // make the right decision when generating code for different targets.
   const AMDGPUSubtarget *Subtarget;
   AMDGPUAS AMDGPUASI;
+  bool EnableLateStructurizeCFG;
 
 public:
-  explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(TM, OptLevel){
-    AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
+  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
+                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
+    : SelectionDAGISel(*TM, OptLevel) {
+    AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
+    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
   }
   ~AMDGPUDAGToDAGISel() override = default;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AMDGPUArgumentUsageInfo>();
+    SelectionDAGISel::getAnalysisUsage(AU);
+  }
+
   bool runOnMachineFunction(MachineFunction &MF) override;
   void Select(SDNode *N) override;
   StringRef getPassName() const override;
   void PostprocessISelDAG() override;
 
+protected:
+  void SelectBuildVector(SDNode *N, unsigned RegClassID);
+
 private:
   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
@@ -99,8 +112,8 @@ private:
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                        SDValue& Offset);
-  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
@@ -116,10 +129,10 @@ private:
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                          SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
-  bool SelectMUBUFScratchOffen(SDNode *Root,
+  bool SelectMUBUFScratchOffen(SDNode *Parent,
                                SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                                SDValue &SOffset, SDValue &ImmOffset) const;
-  bool SelectMUBUFScratchOffset(SDNode *Root,
+  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                 SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                 SDValue &Offset) const;
 
@@ -140,6 +153,10 @@ private:
 
   bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
+  bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
+                              SDValue &Offset, SDValue &SLC) const;
+
+  template <bool IsSigned>
   bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
 
@@ -152,10 +169,10 @@ private:
   bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
-  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
 
   bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -174,9 +191,22 @@ private:
   bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp) const;
 
+  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                        SDValue &Clamp) const;
+
+  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                            SDValue &Clamp) const;
+  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
+  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+  bool SelectHi16Elt(SDValue In, SDValue &Src) const;
+
   void SelectADD_SUB_I64(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectMAD_64_32(SDNode *N);
   void SelectFMA_W_CHAIN(SDNode *N);
   void SelectFMUL_W_CHAIN(SDNode *N);
 
@@ -186,21 +216,49 @@ private:
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
   void SelectBRCOND(SDNode *N);
+  void SelectFMAD(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
 
+protected:
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
 };
 
+class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+public:
+  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
+      AMDGPUDAGToDAGISel(TM, OptLevel) {}
+
+  void Select(SDNode *N) override;
+
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) override;
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) override;
+};
+
 }  // end anonymous namespace
 
+INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
+                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
+                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
+
 /// \brief This pass converts a legalized DAG into a AMDGPU-specific
 // DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                         CodeGenOpt::Level OptLevel) {
   return new AMDGPUDAGToDAGISel(TM, OptLevel);
 }
 
+/// \brief This pass converts a legalized DAG into a R600-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
+                                      CodeGenOpt::Level OptLevel) {
+  return new R600DAGToDAGISel(TM, OptLevel);
+}
+
 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
   return SelectionDAGISel::runOnMachineFunction(MF);
@@ -279,8 +337,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
     return N;
 
   const SITargetLowering& Lowering =
@@ -298,9 +356,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
      Ops.push_back(N->getOperand(i));
   }
   Ops.push_back(Glue);
-  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
-
-  return N;
+  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
@@ -334,6 +390,58 @@ static bool getConstantValue(SDValue N, uint32_t &Out) {
   return false;
 }
 
+void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
+  EVT VT = N->getValueType(0);
+  unsigned NumVectorElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+  SDLoc DL(N);
+  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+
+  if (NumVectorElts == 1) {
+    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+                         RegClass);
+    return;
+  }
+
+  assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+  // 16 = Max Num Vector Elements
+  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+  // 1 = Vector Register Class
+  SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+  bool IsRegSeq = true;
+  unsigned NOps = N->getNumOperands();
+  for (unsigned i = 0; i < NOps; i++) {
+    // XXX: Why is this here?
+    if (isa<RegisterSDNode>(N->getOperand(i))) {
+      IsRegSeq = false;
+      break;
+    }
+    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+    RegSeqArgs[1 + (2 * i) + 1] =
+            CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+                                      MVT::i32);
+  }
+  if (NOps != NumVectorElts) {
+    // Fill in the missing undef elements if this was a scalar_to_vector.
+    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                   DL, EltVT);
+    for (unsigned i = NOps; i < NumVectorElts; ++i) {
+      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+      RegSeqArgs[1 + (2 * i) + 1] =
+        CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+    }
+  }
+
+  if (!IsRegSeq)
+    SelectCode(N);
+  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+}
+
 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -346,18 +454,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     N = glueCopyToM0(N);
 
   switch (Opc) {
-  default: break;
+  default:
+    break;
   // We are selecting i64 ADD here instead of custom lower it during
   // DAG legalization, so we can fold some i64 ADDs used for address
   // calculation into the LOAD and STORE instructions.
-  case ISD::ADD:
   case ISD::ADDC:
   case ISD::ADDE:
-  case ISD::SUB:
   case ISD::SUBC:
   case ISD::SUBE: {
-    if (N->getValueType(0) != MVT::i64 ||
-        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    if (N->getValueType(0) != MVT::i64)
       break;
 
     SelectADD_SUB_I64(N);
@@ -378,13 +484,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
 
   case ISD::SCALAR_TO_VECTOR:
-  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
-    unsigned RegClassID;
-    const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
-    EVT EltVT = VT.getVectorElementType();
 
     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
       if (Opc == ISD::BUILD_VECTOR) {
@@ -401,81 +503,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
     }
 
-    assert(EltVT.bitsEq(MVT::i32));
-
-    if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
-    } else {
-      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
-      // that adds a 128 bits reg copy when going through TwoAddressInstructions
-      // pass. We want to avoid 128 bits copies as much as possible because they
-      // can't be bundled by our scheduler.
-      switch(NumVectorElts) {
-      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
-      case 4:
-        if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
-          RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
-        else
-          RegClassID = AMDGPU::R600_Reg128RegClassID;
-        break;
-      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
-      }
-    }
-
-    SDLoc DL(N);
-    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
-
-    if (NumVectorElts == 1) {
-      CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
-                           RegClass);
-      return;
-    }
-
-    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
-                                  "supported yet");
-    // 16 = Max Num Vector Elements
-    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
-    // 1 = Vector Register Class
-    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
-
-    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
-    bool IsRegSeq = true;
-    unsigned NOps = N->getNumOperands();
-    for (unsigned i = 0; i < NOps; i++) {
-      // XXX: Why is this here?
-      if (isa<RegisterSDNode>(N->getOperand(i))) {
-        IsRegSeq = false;
-        break;
-      }
-      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
-      RegSeqArgs[1 + (2 * i) + 1] =
-              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
-                                        MVT::i32);
-    }
-
-    if (NOps != NumVectorElts) {
-      // Fill in the missing undef elements if this was a scalar_to_vector.
-      assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
-
-      MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                     DL, EltVT);
-      for (unsigned i = NOps; i < NumVectorElts; ++i) {
-        RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
-        RegSeqArgs[1 + (2 * i) + 1] =
-          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
-      }
-    }
-
-    if (!IsRegSeq)
-      break;
-    CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+    SelectBuildVector(N, RegClassID);
     return;
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-      break;
-    }
     SDLoc DL(N);
     if (N->getValueType(0) == MVT::i128) {
       RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
@@ -497,8 +531,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::Constant:
   case ISD::ConstantFP: {
-    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-        N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
       break;
 
     uint64_t Imm;
@@ -533,9 +566,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
-    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
-      break;
-
     // There is a scalar version available, but unlike the vector version which
     // has a separate operand for the offset and width, the scalar version packs
     // the width and offset into a single operand. Try to move to the scalar
@@ -565,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectDIV_SCALE(N);
     return;
   }
+  case AMDGPUISD::MAD_I64_I32:
+  case AMDGPUISD::MAD_U64_U32: {
+    SelectMAD_64_32(N);
+    return;
+  }
   case ISD::CopyToReg: {
     const SITargetLowering& Lowering =
       *static_cast<const SITargetLowering*>(getTargetLowering());
@@ -575,8 +610,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::SRL:
   case ISD::SRA:
   case ISD::SIGN_EXTEND_INREG:
-    if (N->getValueType(0) != MVT::i32 ||
-        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    if (N->getValueType(0) != MVT::i32)
       break;
 
     SelectS_BFE(N);
@@ -584,7 +618,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
-
+  case ISD::FMAD:
+    SelectFMAD(N);
+    return;
   case AMDGPUISD::ATOMIC_CMP_SWAP:
     SelectATOMIC_CMP_SWAP(N);
     return;
@@ -638,32 +674,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
 }
 
 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
-                                           SDValue &Offset) {
-  ConstantSDNode *IMMOffset;
-
-  if (Addr.getOpcode() == ISD::ADD
-      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
-      && isInt<16>(IMMOffset->getZExtValue())) {
-
-      Base = Addr.getOperand(0);
-      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
-                                         MVT::i32);
-      return true;
-  // If the pointer address is constant, we can move it to the offset field.
-  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
-             && isInt<16>(IMMOffset->getZExtValue())) {
-    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
-                                  SDLoc(CurDAG->getEntryNode()),
-                                  AMDGPU::ZERO, MVT::i32);
-    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
-                                       MVT::i32);
-    return true;
-  }
-
-  // Default case, no offset
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
-  return true;
+                                            SDValue &Offset) {
+  return false;
 }
 
 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
@@ -690,6 +702,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
   return true;
 }
 
+// FIXME: Should only handle addcarry/subcarry
 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   SDLoc DL(N);
   SDValue LHS = N->getOperand(0);
@@ -699,8 +712,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
   bool ProduceCarry =
       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
-  bool IsAdd =
-      (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE);
+  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
 
   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
@@ -782,7 +794,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
 
 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
   SDLoc SL(N);
-  //	src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
+  //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
   SDValue Ops[8];
 
   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
@@ -808,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
+  SDLoc SL(N);
+  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
+  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
+
+  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                    Clamp };
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
@@ -850,8 +875,12 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                       Zero, Addr.getOperand(1));
 
         if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+          // FIXME: Select to VOP3 version for with-carry.
+          unsigned SubOp = Subtarget->hasAddNoCarry() ?
+            AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+
           MachineSDNode *MachineSub
-            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+            = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));
 
           Base = SDValue(MachineSub, 0);
@@ -920,8 +949,11 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                       Zero, Addr.getOperand(1));
 
         if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+          unsigned SubOp = Subtarget->hasAddNoCarry() ?
+            AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+
           MachineSDNode *MachineSub
-            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+            = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));
 
           Base = SDValue(MachineSub, 0);
@@ -958,14 +990,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
   return true;
 }
 
-static bool isLegalMUBUFImmOffset(unsigned Imm) {
-  return isUInt<12>(Imm);
-}
-
-static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
-  return isLegalMUBUFImmOffset(Imm->getZExtValue());
-}
-
 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
@@ -1007,7 +1031,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
       Ptr = N0;
     }
 
-    if (isLegalMUBUFImmOffset(C1)) {
+    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
     }
@@ -1104,7 +1128,7 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
                                                MVT::i32));
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                  SDValue Addr, SDValue &Rsrc,
                                                  SDValue &VAddr, SDValue &SOffset,
                                                  SDValue &ImmOffset) const {
@@ -1117,8 +1141,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
 
   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned Imm = CAddr->getZExtValue();
-    assert(!isLegalMUBUFImmOffset(Imm) &&
-           "should have been selected by other pattern");
 
     SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
     MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -1127,7 +1149,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
 
     // In a call sequence, stores to the argument stack area are relative to the
     // stack pointer.
-    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
     unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
       Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
@@ -1142,9 +1164,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
 
-    // Offsets in vaddr must be positive.
+    // Offsets in vaddr must be positive if range checking is enabled.
+    //
+    // The total computation of vaddr + soffset + offset must not overflow.  If
+    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
+    // overflowing.
+    //
+    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
+    // always perform a range check. If a negative vaddr base index was used,
+    // this would fail the range check. The overall address computation would
+    // compute a valid address, but this doesn't happen due to the range
+    // check. For out-of-bounds MUBUF loads, a 0 is returned.
+    //
+    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
+    // MUBUF vaddr, but not on older subtargets which can only do this if the
+    // sign bit is known 0.
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    if (isLegalMUBUFImmOffset(C1)) {
+    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
+        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
+         CurDAG->SignBitIsZero(N0))) {
       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
@@ -1157,13 +1195,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
   return true;
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                   SDValue Addr,
                                                   SDValue &SRsrc,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
   ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
-  if (!CAddr || !isLegalMUBUFImmOffset(CAddr))
+  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
     return false;
 
   SDLoc DL(Addr);
@@ -1172,7 +1210,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
 
   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
-  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
   unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
     Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
@@ -1231,24 +1269,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
                                              SDValue &SOffset,
                                              SDValue &ImmOffset) const {
   SDLoc DL(Constant);
+  const uint32_t Align = 4;
+  const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
   uint32_t Overflow = 0;
 
-  if (Imm >= 4096) {
-    if (Imm <= 4095 + 64) {
-      // Use an SOffset inline constant for 1..64
-      Overflow = Imm - 4095;
-      Imm = 4095;
+  if (Imm > MaxImm) {
+    if (Imm <= MaxImm + 64) {
+      // Use an SOffset inline constant for 4..64
+      Overflow = Imm - MaxImm;
+      Imm = MaxImm;
     } else {
       // Try to keep the same value in SOffset for adjacent loads, so that
       // the corresponding register contents can be re-used.
       //
-      // Load values with all low-bits set into SOffset, so that a larger
-      // range of values can be covered using s_movk_i32
-      uint32_t High = (Imm + 1) & ~4095;
-      uint32_t Low = (Imm + 1) & 4095;
+      // Load values with all low-bits (except for alignment bits) set into
+      // SOffset, so that a larger range of values can be covered using
+      // s_movk_i32.
+      //
+      // Atomic operations fail to work correctly when individual address
+      // components are unaligned, even if their sum is aligned.
+      uint32_t High = (Imm + Align) & ~4095;
+      uint32_t Low = (Imm + Align) & 4095;
       Imm = Low;
-      Overflow = High - 1;
+      Overflow = High - Align;
     }
   }
 
@@ -1316,6 +1360,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
   return true;
 }
 
+template <bool IsSigned>
 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
                                           SDValue &VAddr,
                                           SDValue &Offset,
@@ -1326,8 +1371,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
       CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
-    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue();
-    if (isUInt<12>(COffsetVal)) {
+    int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+    if ((IsSigned && isInt<13>(COffsetVal)) ||
+        (!IsSigned && isUInt<12>(COffsetVal))) {
       Addr = N0;
       OffsetVal = COffsetVal;
     }
@@ -1344,7 +1391,14 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
                                           SDValue &VAddr,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
-  return SelectFlatOffset(Addr, VAddr, Offset, SLC);
+  return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC);
+}
+
+bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr,
+                                          SDValue &VAddr,
+                                          SDValue &Offset,
+                                          SDValue &SLC) const {
+  return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -1443,13 +1497,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
   return !Imm && isa<ConstantSDNode>(Offset);
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
-                                              SDValue &Offset) const {
-  bool Imm;
-  return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
-         !isa<ConstantSDNode>(Offset);
-}
-
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                             SDValue &Base,
                                             SDValue &Offset) const {
@@ -1622,18 +1669,55 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
     return;
   }
 
-  if (isCBranchSCC(N)) {
-    // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
+  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
+  unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
+  SDLoc SL(N);
+
+  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
+  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
+                       N->getOperand(2), // Basic Block
+                       VCC.getValue(0));
+}
+
+void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+  if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
     SelectCode(N);
     return;
   }
 
-  SDLoc SL(N);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue Src2 = N->getOperand(2);
+  unsigned Src0Mods, Src1Mods, Src2Mods;
 
-  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond);
-  CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
-                       N->getOperand(2), // Basic Block
-                       VCC.getValue(0));
+  // Avoid using v_mad_mix_f32 unless there is actually an operand using the
+  // conversion from f16.
+  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
+  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
+  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
+
+  assert(!Subtarget->hasFP32Denormals() &&
+         "fmad selected with denormals enabled");
+  // TODO: We can select this with f32 denormals enabled if all the sources are
+  // converted from f16 (in which case fmad isn't legal).
+
+  if (Sel0 || Sel1 || Sel2) {
+    // For dummy operands.
+    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+    SDValue Ops[] = {
+      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
+      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
+      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
+      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
+      Zero, Zero
+    };
+
+    CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+  } else {
+    SelectCode(N);
+  }
 }
 
 // This is here because there isn't a way to use the generated sub0_sub1 as the
@@ -1652,11 +1736,11 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
 
   MachineSDNode *CmpSwap = nullptr;
   if (Subtarget->hasAddr64()) {
-    SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC;
+    SDValue SRsrc, VAddr, SOffset, Offset, SLC;
 
     if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
-      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
-        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
       SDValue CmpVal = Mem->getOperand(2);
 
       // XXX - Do we care about glue operands?
@@ -1672,8 +1756,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   if (!CmpSwap) {
     SDValue SRsrc, SOffset, Offset, SLC;
     if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
-      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
-        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
 
       SDValue CmpVal = Mem->getOperand(2);
       SDValue Ops[] = {
@@ -1702,9 +1786,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }
 
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
-                                        SDValue &SrcMods) const {
-  unsigned Mods = 0;
+bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
+                                            unsigned &Mods) const {
+  Mods = 0;
   Src = In;
 
   if (Src.getOpcode() == ISD::FNEG) {
@@ -1717,10 +1801,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
     Src = Src.getOperand(0);
   }
 
-  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+                                        SDValue &SrcMods) const {
+  unsigned Mods;
+  if (SelectVOP3ModsImpl(In, Src, Mods)) {
+    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
   SelectVOP3Mods(In, Src, SrcMods);
@@ -1864,24 +1958,234 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
   return SelectVOP3PMods(In, Src, SrcMods);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods) const {
+  Src = In;
+  // FIXME: Handle op_sel
+  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
+                                          SDValue &SrcMods,
+                                          SDValue &Clamp) const {
+  SDLoc SL(In);
+
+  // FIXME: Handle clamp
+  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+  return SelectVOP3OpSel(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
+                                             SDValue &SrcMods) const {
+  // FIXME: Handle op_sel
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
+                                              SDValue &SrcMods,
+                                              SDValue &Clamp) const {
+  SDLoc SL(In);
+
+  // FIXME: Handle clamp
+  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+  return SelectVOP3OpSelMods(In, Src, SrcMods);
+}
+
+// The return value is not whether the match is possible (which it always is),
+// but whether or not it a conversion is really used.
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
+                                                   unsigned &Mods) const {
+  Mods = 0;
+  SelectVOP3ModsImpl(In, Src, Mods);
+
+  if (Src.getOpcode() == ISD::FP_EXTEND) {
+    Src = Src.getOperand(0);
+    assert(Src.getValueType() == MVT::f16);
+    Src = stripBitcast(Src);
+
+    // Be careful about folding modifiers if we already have an abs. fneg is
+    // applied last, so we don't want to apply an earlier fneg.
+    if ((Mods & SISrcMods::ABS) == 0) {
+      unsigned ModsTmp;
+      SelectVOP3ModsImpl(Src, Src, ModsTmp);
+
+      if ((ModsTmp & SISrcMods::NEG) != 0)
+        Mods ^= SISrcMods::NEG;
+
+      if ((ModsTmp & SISrcMods::ABS) != 0)
+        Mods |= SISrcMods::ABS;
+    }
+
+    // op_sel/op_sel_hi decide the source type and source.
+    // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
+    // If the sources's op_sel is set, it picks the high half of the source
+    // register.
+
+    Mods |= SISrcMods::OP_SEL_1;
+    if (isExtractHiElt(Src, Src)) {
+      Mods |= SISrcMods::OP_SEL_0;
+
+      // TODO: Should we try to look for neg/abs here?
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
+                                               SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  SelectVOP3PMadMixModsImpl(In, Src, Mods);
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+// TODO: Can we identify things like v_mad_mixhi_f16?
+bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
+  if (In.isUndef()) {
+    Src = In;
+    return true;
+  }
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
+    SDLoc SL(In);
+    SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32);
+    MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                                 SL, MVT::i32, K);
+    Src = SDValue(MovK, 0);
+    return true;
+  }
+
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
+    SDLoc SL(In);
+    SDValue K = CurDAG->getTargetConstant(
+      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+    MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                                 SL, MVT::i32, K);
+    Src = SDValue(MovK, 0);
+    return true;
+  }
+
+  return isExtractHiElt(In, Src);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
   bool IsModified = false;
   do {
     IsModified = false;
+
     // Go over all selected nodes and try to fold them a bit more
-    for (SDNode &Node : CurDAG->allnodes()) {
-      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
+    while (Position != CurDAG->allnodes_end()) {
+      SDNode *Node = &*Position++;
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
       if (!MachineNode)
         continue;
 
       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
-      if (ResNode != &Node) {
-        ReplaceUses(&Node, ResNode);
+      if (ResNode != Node) {
+        if (ResNode)
+          ReplaceUses(Node, ResNode);
         IsModified = true;
       }
     }
     CurDAG->RemoveDeadNodes();
   } while (IsModified);
 }
+
+void R600DAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (Opc) {
+  default: break;
+  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+  case ISD::SCALAR_TO_VECTOR:
+  case ISD::BUILD_VECTOR: {
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    unsigned RegClassID;
+    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+    // that adds a 128 bits reg copy when going through TwoAddressInstructions
+    // pass. We want to avoid 128 bits copies as much as possible because they
+    // can't be bundled by our scheduler.
+    switch(NumVectorElts) {
+    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+    case 4:
+      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+        RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+      else
+        RegClassID = AMDGPU::R600_Reg128RegClassID;
+      break;
+    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+    }
+    SelectBuildVector(N, RegClassID);
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) {
+  ConstantSDNode *C;
+  SDLoc DL(Addr);
+
+  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  }
+
+  return true;
+}
+
+bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) {
+  ConstantSDNode *IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                         MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  SDLoc(CurDAG->getEntryNode()),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                       MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+  return true;
+}