Diffstat (limited to 'contrib/llvm/lib')
72 files changed, 1006 insertions, 372 deletions
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index 90bc249bcb39..c2039e1dec2b 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -623,6 +623,7 @@ template <> struct GraphTraits<IrreducibleGraph> { typedef bfi_detail::IrreducibleGraph GraphT; typedef const GraphT::IrrNode NodeType; + typedef const GraphT::IrrNode *NodeRef; typedef GraphT::IrrNode::iterator ChildIteratorType; static const NodeType *getEntryNode(const GraphT &G) { diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp index 6c471ab45048..c9adaa7b111c 100644 --- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp +++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp @@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V, /// integer type Ty is used to select how many bits are available for the /// result. Returns null if the conversion cannot be performed, otherwise /// returns the Constant value resulting from the conversion. -Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, - Type *Ty) { +Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero, + Type *Ty) { // All of these conversion intrinsics form an integer of at most 64bits. unsigned ResultWidth = Ty->getIntegerBitWidth(); assert(ResultWidth <= 64 && @@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth, /*isSigned=*/true, mode, &isExact); - if (status != APFloat::opOK && status != APFloat::opInexact) + if (status != APFloat::opOK && + (!roundTowardZero || status != APFloat::opInexact)) return nullptr; return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true); } @@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, case Intrinsic::x86_sse2_cvtsd2si: case Intrinsic::x86_sse2_cvtsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/false, Ty); + dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/false, Ty); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/true, Ty); + dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/true, Ty); } } diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp index 0cb2c78afb40..aeaf9388579c 100644 --- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3400,7 +3400,10 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, return TrueVal; if (const auto *ICI = dyn_cast<ICmpInst>(CondVal)) { - unsigned BitWidth = Q.DL.getTypeSizeInBits(TrueVal->getType()); + // FIXME: This code is nearly duplicated in InstCombine. Using/refactoring + // decomposeBitTestICmp() might help. 
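For illustration only (not part of the commit): the ConstantFoldSSEConvertToInt change above now folds an inexact conversion only for the truncating cvtt* intrinsics, because the rounding cvtsd2si/cvtss2si forms honor the runtime MXCSR rounding mode and are therefore only safe to fold when the conversion is exact. A minimal host-side sketch with SSE2 intrinsics (assumes an x86 host with the default rounding mode):

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128d V = _mm_set_sd(2.7);
  int Rounded   = _mm_cvtsd_si32(V);  // cvtsd2si: uses the MXCSR rounding mode (default: to nearest) -> 3
  int Truncated = _mm_cvttsd_si32(V); // cvttsd2si: always truncates toward zero -> 2
  std::printf("%d %d\n", Rounded, Truncated); // prints "3 2"
  return 0;
}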
+ unsigned BitWidth = + Q.DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -4274,7 +4277,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); @@ -4302,7 +4306,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; diff --git a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp index f59257ab16b5..7bdf3408a581 100644 --- a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -115,13 +115,19 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { // We might have a vector load from an array. FIXME: for now we just bail // out in this case, but we should be able to resolve and simplify such // loads. - if(CDS->getElementType() != I.getType()) + if (CDS->getElementType() != I.getType()) return false; - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - if (SimplifiedAddrOp->getValue().getActiveBits() >= 64) + unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + if (SimplifiedAddrOp->getValue().getActiveBits() > 64) return false; - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; + int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue(); + if (SimplifiedAddrOpV < 0) { + // FIXME: For now we conservatively ignore out of bound accesses, but + // we're allowed to perform the optimization in this case. + return false; + } + uint64_t Index = static_cast<uint64_t>(SimplifiedAddrOpV) / ElemSize; if (Index >= CDS->getNumElements()) { // FIXME: For now we conservatively ignore out of bound accesses, but // we're allowed to perform the optimization in this case. diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp index 77e4ec7ab40c..2e45bb840946 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1610,8 +1610,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - assert(IP); - Builder.SetInsertPoint(IP); + setInsertPoint(IP); return expandCodeFor(SH, Ty); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index b0ba57122206..ebf80dea2c4b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -214,10 +214,7 @@ TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) { } TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) { - // It's possible to ask for the FuncId of a function which doesn't have a - // subprogram: inlining a function with debug info into a function with none. 
- if (!SP) - return TypeIndex::None(); + assert(SP); // Check if we've already translated this subprogram. auto I = TypeIndices.find({SP, nullptr}); @@ -621,11 +618,12 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, std::string FuncName; auto *SP = GV->getSubprogram(); + assert(SP); setCurrentSubprogram(SP); // If we have a display name, build the fully qualified name by walking the // chain of scopes. - if (SP != nullptr && !SP->getDisplayName().empty()) + if (!SP->getDisplayName().empty()) FuncName = getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName()); @@ -864,7 +862,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { void CodeViewDebug::beginFunction(const MachineFunction *MF) { assert(!CurFn && "Can't process two functions at once!"); - if (!Asm || !MMI->hasDebugInfo()) + if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) return; DebugHandlerBase::beginFunction(MF); @@ -1939,7 +1937,8 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); // Ignore DBG_VALUE locations and function prologue. - if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) + if (!Asm || !CurFn || MI->isDebugValue() || + MI->getFlag(MachineInstr::FrameSetup)) return; DebugLoc DL = MI->getDebugLoc(); if (DL == PrevInstLoc || !DL) diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index fa705761645f..23e2aa70d0c7 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -996,6 +996,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { MachineBasicBlock *IBB = &*I; MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); + MachineLoop *ML; + + // Bail if merging after placement and IBB is the loop header because + // -- If merging predecessors that belong to the same loop as IBB, the + // common tail of merged predecessors may become the loop top if block + // placement is called again and the predecessors may branch to this common + // tail and require more branches. This can be relaxed if + // MachineBlockPlacement::findBestLoopTop is more flexible. + // --If merging predecessors that do not belong to the same loop as IBB, the + // loop info of IBB's loop and the other loops may be affected. Calling the + // block placement again may make big change to the layout and eliminate the + // reason to do tail merging here. + if (AfterBlockPlacement && MLI) { + ML = MLI->getLoopFor(IBB); + if (ML && IBB == ML->getHeader()) + continue; + } + for (MachineBasicBlock *PBB : I->predecessors()) { if (MergePotentials.size() == TailMergeThreshold) break; @@ -1015,16 +1033,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (PBB->hasEHPadSuccessor()) continue; - // Bail out if the loop header (IBB) is not the top of the loop chain - // after the block placement. Otherwise, the common tail of IBB's - // predecessors may become the loop top if block placement is called again - // and the predecessors may branch to this common tail. - // FIXME: Relaxed this check if the algorithm of finding loop top is - // changed in MBP. + // After block placement, only consider predecessors that belong to the + // same loop as IBB. The reason is the same as above when skipping loop + // header. 
if (AfterBlockPlacement && MLI) - if (MachineLoop *ML = MLI->getLoopFor(IBB)) - if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB)) - continue; + if (ML != MLI->getLoopFor(PBB)) + continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp index 19cd59b9dba7..4a1b9958a5b5 100644 --- a/contrib/llvm/lib/CodeGen/SafeStack.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp @@ -530,7 +530,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Align = std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), - Align, SSC.getLiveRange(StackGuardSlot)); + Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp index 709614f57e7d..795eb8d27191 100644 --- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp @@ -25,7 +25,9 @@ static cl::opt<bool> ClColoring("safe-stack-coloring", cl::Hidden, cl::init(true)); const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) { - return LiveRanges[AllocaNumbering[AI]]; + const auto IT = AllocaNumbering.find(AI); + assert(IT != AllocaNumbering.end()); + return LiveRanges[IT->second]; } bool StackColoring::readMarker(Instruction *I, bool *IsStart) { diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp index b8190e0f2153..fb433c1856a6 100644 --- a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp @@ -100,7 +100,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } // Split starting and ending regions if necessary. - for (StackRegion &R : Regions) { + for (unsigned i = 0; i < Regions.size(); ++i) { + StackRegion &R = Regions[i]; if (Start > R.Start && Start < R.End) { StackRegion R0 = R; R.Start = R0.End = Start; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d888676583f3..5ecc6da32144 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6198,13 +6198,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } - // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0) - unsigned ElementWidth = VT.getScalarType().getSizeInBits(); + // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) + // Here, T can be 1 or -1, depending on the type of the setcc and + // getBooleanContents(). + unsigned SetCCWidth = N0.getValueType().getScalarSizeInBits(); + SDLoc DL(N); - SDValue NegOne = - DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT); + // To determine the "true" side of the select, we need to know the high bit + // of the value returned by the setcc if it evaluates to true. + // If the type of the setcc is i1, then the true case of the select is just + // sext(i1 1), that is, -1. + // If the type of the setcc is larger (say, i8) then the value of the high + // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // of the appropriate width. + SDValue ExtTrueVal = + (SetCCWidth == 1) + ? 
DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), + DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), NegOne, + DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, DAG.getConstant(0, DL, VT), cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) return SCC; @@ -6215,10 +6229,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = DAG.getSetCC(DL, SetCCVT, - N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, - NegOne, DAG.getConstant(0, DL, VT)); + SDValue SetCC = + DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, + DAG.getConstant(0, DL, VT)); } } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8235522b14bd..29d11c79ac25 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6639,19 +6639,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode); + SmallVector<SDDbgValue *, 2> ClonedDVs; for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; // Only add Dbgvalues attached to same ResNo. if (Dbg->getKind() == SDDbgValue::SDNODE && - Dbg->getResNo() == From.getResNo()) { + Dbg->getSDNode() == From.getNode() && + Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) { + assert(FromNode != ToNode && + "Should not transfer Debug Values intranode"); SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); - AddDbgValue(Clone, ToNode, false); + ClonedDVs.push_back(Clone); + Dbg->setIsInvalidated(); } } + for (SDDbgValue *I : ClonedDVs) + AddDbgValue(I, ToNode, false); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f2bc88a98597..806646fbc676 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1234,6 +1234,16 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const { llvm_unreachable("Invalid boolean contents"); } +SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT, + const SDLoc &DL) const { + unsigned ElementWidth = VT.getScalarSizeInBits(); + APInt TrueInt = + getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent + ? 
APInt(ElementWidth, 1) + : APInt::getAllOnesValue(ElementWidth); + return DAG.getConstant(TrueInt, DL, VT); +} + bool TargetLowering::isConstFalseVal(const SDNode *N) const { if (!N) return false; diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d9a51864b6c..8feb18b4d030 100644 --- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -539,6 +539,16 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { return TRI->regsOverlap(RegA, RegB); } +// Returns true if Reg is equal or aliased to at least one register in Set. +static bool regOverlapsSet(const SmallVectorImpl<unsigned> &Set, unsigned Reg, + const TargetRegisterInfo *TRI) { + for (unsigned R : Set) + if (TRI->regsOverlap(R, Reg)) + return true; + + return false; +} + /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. bool @@ -864,9 +874,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, // FIXME: Needs more sophisticated heuristics. return false; - SmallSet<unsigned, 2> Uses; - SmallSet<unsigned, 2> Kills; - SmallSet<unsigned, 2> Defs; + SmallVector<unsigned, 2> Uses; + SmallVector<unsigned, 2> Kills; + SmallVector<unsigned, 2> Defs; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -874,12 +884,12 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) - Defs.insert(MOReg); + Defs.push_back(MOReg); else { - Uses.insert(MOReg); + Uses.push_back(MOReg); if (MOReg != Reg && (MO.isKill() || (LIS && isPlainlyKilled(MI, MOReg, LIS)))) - Kills.insert(MOReg); + Kills.push_back(MOReg); } } @@ -888,8 +898,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator AfterMI = std::next(Begin); MachineBasicBlock::iterator End = AfterMI; - while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) { - Defs.insert(End->getOperand(0).getReg()); + while (End->isCopy() && + regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI)) { + Defs.push_back(End->getOperand(0).getReg()); ++End; } @@ -915,21 +926,21 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) { - if (Uses.count(MOReg)) + if (regOverlapsSet(Uses, MOReg, TRI)) // Physical register use would be clobbered. return false; - if (!MO.isDead() && Defs.count(MOReg)) + if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI)) // May clobber a physical register def. // FIXME: This may be too conservative. It's ok if the instruction // is sunken completely below the use. return false; } else { - if (Defs.count(MOReg)) + if (regOverlapsSet(Defs, MOReg, TRI)) return false; bool isKill = MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)); - if (MOReg != Reg && - ((isKill && Uses.count(MOReg)) || Kills.count(MOReg))) + if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) || + regOverlapsSet(Kills, MOReg, TRI))) // Don't want to extend other live ranges and update kills. 
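For illustration only (not part of the commit): the SmallSet-to-SmallVector switch above exists because physical registers can alias, so an exact-equality lookup (SmallSet::count) can miss a clobber of a sub- or super-register; the new regOverlapsSet helper instead does a linear scan with TargetRegisterInfo::regsOverlap. A toy standalone sketch of the same idea, with a made-up two-register alias relation standing in for regsOverlap:

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical register ids: 1 = AX, 2 = EAX, where AX is a sub-register of EAX.
static bool regsOverlap(unsigned A, unsigned B) {
  auto Canon = [](unsigned R) { return R == 1 ? 2u : R; };
  return Canon(A) == Canon(B);
}

static bool regOverlapsSet(const std::vector<unsigned> &Set, unsigned Reg) {
  for (unsigned R : Set)
    if (regsOverlap(R, Reg))
      return true;
  return false;
}

int main() {
  std::vector<unsigned> Defs = {2};                        // the instruction defines EAX
  assert(regOverlapsSet(Defs, 1));                         // a later use of AX is affected
  assert(std::count(Defs.begin(), Defs.end(), 1u) == 0);   // exact matching would miss it
  return 0;
}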
return false; if (MOReg == Reg && !isKill) diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h index 267a0dab2f25..d58bff56576d 100644 --- a/contrib/llvm/lib/IR/AttributeImpl.h +++ b/contrib/llvm/lib/IR/AttributeImpl.h @@ -19,8 +19,8 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/TrailingObjects.h" #include <climits> #include <string> @@ -142,73 +142,6 @@ public: StringRef getStringValue() const { return Val; } }; -//===----------------------------------------------------------------------===// -/// \class -/// \brief This class represents a group of attributes that apply to one -/// element: function, return type, or parameter. -class AttributeSetNode final - : public FoldingSetNode, - private TrailingObjects<AttributeSetNode, Attribute> { - friend TrailingObjects; - - unsigned NumAttrs; ///< Number of attributes in this node. - /// Bitset with a bit for each available attribute Attribute::AttrKind. - uint64_t AvailableAttrs; - - AttributeSetNode(ArrayRef<Attribute> Attrs) - : NumAttrs(Attrs.size()), AvailableAttrs(0) { - static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, - "Too many attributes for AvailableAttrs"); - // There's memory after the node where we can store the entries in. - std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>()); - - for (Attribute I : *this) { - if (!I.isStringAttribute()) { - AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } - } - } - - // AttributesSetNode is uniqued, these should not be publicly available. - void operator=(const AttributeSetNode &) = delete; - AttributeSetNode(const AttributeSetNode &) = delete; -public: - void operator delete(void *p) { ::operator delete(p); } - - static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs); - - /// \brief Return the number of attributes this AttributeSet contains. 
- unsigned getNumAttributes() const { return NumAttrs; } - - bool hasAttribute(Attribute::AttrKind Kind) const { - return AvailableAttrs & ((uint64_t)1) << Kind; - } - bool hasAttribute(StringRef Kind) const; - bool hasAttributes() const { return NumAttrs != 0; } - - Attribute getAttribute(Attribute::AttrKind Kind) const; - Attribute getAttribute(StringRef Kind) const; - - unsigned getAlignment() const; - unsigned getStackAlignment() const; - uint64_t getDereferenceableBytes() const; - uint64_t getDereferenceableOrNullBytes() const; - std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; - std::string getAsString(bool InAttrGrp) const; - - typedef const Attribute *iterator; - iterator begin() const { return getTrailingObjects<Attribute>(); } - iterator end() const { return begin() + NumAttrs; } - - void Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(begin(), end())); - } - static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) { - for (unsigned I = 0, E = AttrList.size(); I != E; ++I) - AttrList[I].Profile(ID); - } -}; - typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/IR/AttributeSetNode.h b/contrib/llvm/lib/IR/AttributeSetNode.h new file mode 100644 index 000000000000..fab1ed51e4d6 --- /dev/null +++ b/contrib/llvm/lib/IR/AttributeSetNode.h @@ -0,0 +1,98 @@ +//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the node class used internally by AttributeSet. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_ATTRIBUTESETNODE_H +#define LLVM_IR_ATTRIBUTESETNODE_H + +#include "llvm/ADT/FoldingSet.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Support/TrailingObjects.h" +#include <climits> + +namespace llvm { + +//===----------------------------------------------------------------------===// +/// \class +/// \brief This class represents a group of attributes that apply to one +/// element: function, return type, or parameter. +class AttributeSetNode final + : public FoldingSetNode, + private TrailingObjects<AttributeSetNode, Attribute> { + friend TrailingObjects; + + unsigned NumAttrs; ///< Number of attributes in this node. + /// Bitset with a bit for each available attribute Attribute::AttrKind. + uint64_t AvailableAttrs; + + AttributeSetNode(ArrayRef<Attribute> Attrs) + : NumAttrs(Attrs.size()), AvailableAttrs(0) { + static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, + "Too many attributes for AvailableAttrs"); + // There's memory after the node where we can store the entries in. + std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>()); + + for (Attribute I : *this) { + if (!I.isStringAttribute()) { + AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); + } + } + } + + // AttributesSetNode is uniqued, these should not be publicly available. 
+ void operator=(const AttributeSetNode &) = delete; + AttributeSetNode(const AttributeSetNode &) = delete; +public: + void operator delete(void *p) { ::operator delete(p); } + + static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs); + + static AttributeSetNode *get(AttributeSet AS, unsigned Index) { + return AS.getAttributes(Index); + } + + /// \brief Return the number of attributes this AttributeSet contains. + unsigned getNumAttributes() const { return NumAttrs; } + + bool hasAttribute(Attribute::AttrKind Kind) const { + return AvailableAttrs & ((uint64_t)1) << Kind; + } + bool hasAttribute(StringRef Kind) const; + bool hasAttributes() const { return NumAttrs != 0; } + + Attribute getAttribute(Attribute::AttrKind Kind) const; + Attribute getAttribute(StringRef Kind) const; + + unsigned getAlignment() const; + unsigned getStackAlignment() const; + uint64_t getDereferenceableBytes() const; + uint64_t getDereferenceableOrNullBytes() const; + std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; + std::string getAsString(bool InAttrGrp) const; + + typedef const Attribute *iterator; + iterator begin() const { return getTrailingObjects<Attribute>(); } + iterator end() const { return begin() + NumAttrs; } + + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, makeArrayRef(begin(), end())); + } + static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) { + for (unsigned I = 0, E = AttrList.size(); I != E; ++I) + AttrList[I].Profile(ID); + } +}; + +} // end llvm namespace + +#endif diff --git a/contrib/llvm/lib/IR/AutoUpgrade.cpp b/contrib/llvm/lib/IR/AutoUpgrade.cpp index 431e51bb4562..2e4a2f89e2c7 100644 --- a/contrib/llvm/lib/IR/AutoUpgrade.cpp +++ b/contrib/llvm/lib/IR/AutoUpgrade.cpp @@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "sse2.cvtps2pd" || Name == "avx.cvtdq2.pd.256" || Name == "avx.cvt.ps2.pd.256" || - Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt.") || Name.startswith("avx.vinsertf128.") || Name == "avx2.vinserti128" || Name.startswith("avx.vextractf128.") || @@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd"); else Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); - } else if (IsX86 && (Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt."))) { - // Truncation (round to zero) float/double to i32 vector conversion. 
- Value *Src = CI->getArgOperand(0); - VectorType *DstTy = cast<VectorType>(CI->getType()); - Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt"); } else if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); SmallVector<Metadata *, 1> Elts; diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp index a55361489ada..3c4b0cf2f8ff 100644 --- a/contrib/llvm/lib/IR/Core.cpp +++ b/contrib/llvm/lib/IR/Core.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -1844,6 +1845,18 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unwrap<Function>(F)->addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) { + auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { @@ -2216,6 +2229,21 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, CallSite(unwrap<Instruction>(C)).addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, + LLVMAttributeIndex Idx) { + auto CS = CallSite(unwrap<Instruction>(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto CS = CallSite(unwrap<Instruction>(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID) { diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp index 5201c2ecce6a..f35c64b27b5b 100644 --- a/contrib/llvm/lib/IR/Metadata.cpp +++ b/contrib/llvm/lib/IR/Metadata.cpp @@ -675,8 +675,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) { Metadata *Old = getOperand(Op); setOperand(Op, New); - // Drop uniquing for self-reference cycles. - if (New == this) { + // Drop uniquing for self-reference cycles and deleted constants. 
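For illustration only (not part of the commit): a usage sketch of the new C API entry points added in Core.cpp above, which follow a count-then-fill pattern with a caller-allocated buffer. Fn is assumed to be a valid LLVMValueRef for a function:

#include <llvm-c/Core.h>
#include <vector>

void collectFunctionAttrs(LLVMValueRef Fn,
                          std::vector<LLVMAttributeRef> &Out) {
  unsigned N = LLVMGetAttributeCountAtIndex(Fn, LLVMAttributeFunctionIndex);
  Out.resize(N);
  if (N)
    LLVMGetAttributesAtIndex(Fn, LLVMAttributeFunctionIndex, Out.data());
  // LLVMGetCallSiteAttributeCount/LLVMGetCallSiteAttributes follow the same
  // pattern for call and invoke instructions.
}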
+ if (New == this || (!New && Old && isa<ConstantAsMetadata>(Old))) { if (!isResolved()) resolve(); storeDistinctInContext(); diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp index cfa12a9f0b27..2bac2a310670 100644 --- a/contrib/llvm/lib/Support/Triple.cpp +++ b/contrib/llvm/lib/Support/Triple.cpp @@ -201,6 +201,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; case GNU: return "gnu"; + case GNUABI64: return "gnuabi64"; case GNUEABIHF: return "gnueabihf"; case GNUEABI: return "gnueabi"; case GNUX32: return "gnux32"; @@ -468,6 +469,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { return StringSwitch<Triple::EnvironmentType>(EnvironmentName) .StartsWith("eabihf", Triple::EABIHF) .StartsWith("eabi", Triple::EABI) + .StartsWith("gnuabi64", Triple::GNUABI64) .StartsWith("gnueabihf", Triple::GNUEABIHF) .StartsWith("gnueabi", Triple::GNUEABI) .StartsWith("gnux32", Triple::GNUX32) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index b1e881685b0c..b97a0f155dc2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", FeatureMacroOpFusion, FeatureNEON, FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, HasV8_1aOps]>; def : ProcessorModel<"generic", NoSchedModel, [ diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d6f2a190d4c8..ac7de1b422e0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); @@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs @@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 7e59710a427a..d4784b5463d7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class GCNTargetMachine; struct MachineSchedContext; class MCAsmInfo; class raw_ostream; @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr); +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cfe6346fb6b1..c9c95c796a69 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, - RuntimeMD::OpenCL_C, 1); - auto Node = MD->getOperand(0); - unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) - ->getZExtValue(); - unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) - ->getZExtValue(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, - Major * 100 + Minor * 10, 2); + if (MD->getNumOperands()) { + auto Node = MD->getOperand(0); + if (Node->getNumOperands() > 1) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) + ->getZExtValue(); + uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } + } } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3b415774df49..b955e231699a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/Passes.h" @@ -30,15 +32,28 @@ using namespace llvm; namespace { class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor<AMDGPUCodeGenPrepare> { + public InstVisitor<AMDGPUCodeGenPrepare, bool> { + const GCNTargetMachine *TM; + const SISubtarget *ST; 
DivergenceAnalysis *DA; - const TargetMachine *TM; + Module *Mod; + bool HasUnsafeFPMath; public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), - TM(TM) { } + TM(static_cast<const GCNTargetMachine *>(TM)), + ST(nullptr), + DA(nullptr), + Mod(nullptr), + HasUnsafeFPMath(false) { } + + bool visitFDiv(BinaryOperator &I); + + bool visitInstruction(Instruction &I) { + return false; + } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -55,7 +70,92 @@ public: } // End anonymous namespace +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { + const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); + if (!CNum) + return false; + + // Reciprocal f32 is handled separately without denormals. + return UnsafeDiv || CNum->isExactlyValue(+1.0); +} + +// Insert an intrinsic for fast fdiv for safe math situations where we can +// reduce precision. Leave fdiv for situations where the generic node is +// expected to be optimized. +bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { + Type *Ty = FDiv.getType(); + + // TODO: Handle half + if (!Ty->getScalarType()->isFloatTy()) + return false; + + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + if (!FPMath) + return false; + + const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); + float ULP = FPOp->getFPAccuracy(); + if (ULP < 2.5f) + return false; + + FastMathFlags FMF = FPOp->getFastMathFlags(); + bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + FMF.allowReciprocal(); + if (ST->hasFP32Denormals() && !UnsafeDiv) + return false; + + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); + + const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); + Function *Decl + = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *NewFDiv = nullptr; + + if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + NewFDiv = UndefValue::get(VT); + + // FIXME: Doesn't do the right thing for cases where the vector is partially + // constant. This works when the scalarizer pass is run first. 
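For illustration only (not part of the commit): visitFDiv above only fires when the fdiv carries !fpmath metadata permitting at least 2.5 ULP of error (and FP32 denormals are disabled or unsafe math is enabled). A sketch of how a frontend might emit such a division with the C++ IRBuilder; the builder, operands, and accuracy value are assumptions:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"

llvm::Value *buildApproxFDiv(llvm::IRBuilder<> &B, llvm::Value *Num,
                             llvm::Value *Den) {
  llvm::LLVMContext &Ctx = B.getContext();
  // !fpmath carries the permitted error, in ULPs, as a float constant.
  llvm::MDNode *FPMath = llvm::MDNode::get(
      Ctx, llvm::ConstantAsMetadata::get(
               llvm::ConstantFP::get(llvm::Type::getFloatTy(Ctx), 2.5)));
  return B.CreateFDiv(Num, Den, "div", FPMath);
}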
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { + Value *NumEltI = Builder.CreateExtractElement(Num, I); + Value *DenEltI = Builder.CreateExtractElement(Den, I); + Value *NewElt; + + if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + } else { + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + } + + NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); + } + } else { + if (!shouldKeepFDivF32(Num, UnsafeDiv)) + NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } + + if (NewFDiv) { + FDiv.replaceAllUsesWith(NewFDiv); + NewFDiv->takeName(&FDiv); + FDiv.eraseFromParent(); + } + + return true; +} + +static bool hasUnsafeFPMath(const Function &F) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + Mod = &M; return false; } @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + ST = &TM->getSubtarget<SISubtarget>(F); DA = &getAnalysis<DivergenceAnalysis>(); - visit(F); + HasUnsafeFPMath = hasUnsafeFPMath(F); - return true; + bool MadeChange = false; + + for (BasicBlock &BB : F) { + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + Next = std::next(I); + MadeChange |= visit(*I); + } + } + + return MadeChange; } INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { return new AMDGPUCodeGenPrepare(TM); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 6761b4b5df95..3944fdbd31e3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -420,9 +420,10 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; +int FP64_NEG_ONE = 0xbff0000000000000; } def CONST : Constants; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 791872a9db40..8e3471bd2083 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = { #undef GET_INTRINSIC_NAME_TABLE }; -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, + ArrayRef<Type *> Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); - return Result; + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string 
AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + switch (ID) { + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + Type *F32Ty = Type::getFloatTy(Context); + return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); + } + default: + llvm_unreachable("unhandled intrinsic"); + } } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { } Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef<Type *> Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeSet AS = getAttributes(M->getContext(), + static_cast<AMDGPUIntrinsic::ID>(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index f4173929259c..6cb8b9644642 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -34,13 +34,23 @@ enum ID { class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const; + std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; bool isOverloaded(unsigned IID) const override; Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef<Type *> = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys = None) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 775463809634..0bad63fa77ad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) { static Value * calculateVectorIndex(Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) { - if (isa<AllocaInst>(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); auto I = GEPIdx.find(GEP); @@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr, static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // FIXME we only support simple cases if (GEP->getNumOperands() != 3) - return NULL; + return nullptr; ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); if (!I0 || !I0->isZero()) - return NULL; + return nullptr; return GEP->getOperand(2); } @@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // are just being conservative for now. 
if (!AllocaTy || AllocaTy->getElementType()->isVectorTy() || - AllocaTy->getNumElements() > 4) { + AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() < 2) { DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); @@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, Inst->getOperand(0), @@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; default: - Inst->dump(); llvm_unreachable("Inconsistency in instructions promotable to vector"); } } @@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { } static bool isCallPromotable(CallInst *CI) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) - return false; - IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (!II) return false; @@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; } - IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); - if (!Intr) { - // FIXME: What is this for? It doesn't make sense to promote arbitrary - // function calls. If the call is to a defined function that can also be - // promoted, we should be able to do this once that function is also - // rewritten. 
- - std::vector<Type*> ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast<Function>(C); - Call->setCalledFunction(NewF); - continue; - } - + IntrinsicInst *Intr = cast<IntrinsicInst>(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3e53f52c689f..b2d4e1144c75 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; bool addInstSelector() override; @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 8f78edd76a51..8ccd176930a6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); @@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } // Fall-through. Since we don't care about out of bounds values // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint // considers some extra cases which are not necessary here. 
case ISD::FP_TO_SINT: { + if (N->getValueType(0) == MVT::i1) { + Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); + return; + } + SDValue Result; if (expandFP_TO_SINT(N, Result, DAG)) Results.push_back(Result); @@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); } -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { +SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); +} + +SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); + Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 2fb6ee25caa9..9700ce14c6f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -72,7 +72,8 @@ private: SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 54efdc0a0466..f4b04e3631a5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -41,7 +41,8 @@ enum { WQM = 1 << 22, VGPRSpill = 1 << 23, VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25 + Gather4 = 1 << 25, + DisableWQM = 1 << 26 }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 51241cf0a432..80d44351267d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->LDSSize); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); MI.eraseFromParent(); return BB; } @@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. 
-SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. +SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. 
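For illustration only (not part of the commit): the two magic constants used by lowerFDIV_FAST above decode to 2^96 (0x6f800000) and 2^-32 (0x2f800000). When |denominator| exceeds 2^96, the code pre-scales it by 2^-32 before the reciprocal (presumably to keep v_rcp_f32 away from underflow, since it does not support denormals) and multiplies the quotient by the same factor afterwards. A quick host-side check of the decoding:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  assert(bitsToFloat(0x6f800000u) == std::ldexp(1.0f, 96));  // 2^96
  assert(bitsToFloat(0x2f800000u) == std::ldexp(1.0f, -32)); // 2^-32
  return 0;
}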
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index 8e055eea58c2..1d349faa592c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 2f63d4ed13b3..6163f0547bd5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string asm = "", field bits<1> DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + + // Whether WQM _must_ be enabled for this instruction. field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; @@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string asm = "", field bits<1> Gather4 = 0; + // Whether WQM _must_ be disabled for this instruction. + field bits<1> DisableWQM = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{23} = VGPRSpill; let TSFlags{24} = VOPAsmPrefer32Bit; let TSFlags{25} = Gather4; + let TSFlags{26} = DisableWQM; let SchedRW = [Write32Bit]; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d171e21c8a4f..5cc6a4e0e83e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -738,7 +738,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineBasicBlock::iterator Insert = Entry.front(); DebugLoc DL = Insert->getDebugLoc(); - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, + *MF); if (TIDReg == AMDGPU::NoRegister) return TIDReg; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 227b817227c2..fef8904a6c87 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -340,6 +340,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isDisableWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; + } + + bool isDisableWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; + } + static bool isVGPRSpill(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 253cc32b27e4..00f53e846db4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2949,6 +2949,10 @@ multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, MUBUFAddr64Table <0>; + let DisableWQM = 1 in { + def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>; + } + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; } @@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins, multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, ValueType vt, SDPatternOperator atomic> { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, + DisableWQM = 1 in { // No return variants let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { @@ -3423,6 +3428,7 @@ class MIMG_Store_Helper <bits<7> op, string asm, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; } multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, @@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index 6427db87cd6f..18b7d5d62efe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2200,7 +2200,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - 
(!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _BOTHEN) + (!cast<MUBUF>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), @@ -3391,6 +3391,16 @@ def : Pat < (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat < + (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), + (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>; + // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector // comparisions still write to a pair of SGPRs, so treat these as diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39096e7..9d06ccfc6c7f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. 
+ def int_amdgcn_fdiv_fast : Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + >; + /* Control flow Intrinsics */ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 4d12a1ef9a93..848be32cd515 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, + *MF); if (LaneVGPR == AMDGPU::NoRegister) // We have no VGPRs left for spilling SGPRs. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0dd88ee45c58..347c33fb3760 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. -unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { +unsigned +SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF) const { + for (unsigned Reg : *RC) - if (!MRI.isPhysRegUsed(Reg)) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 6e97b1b910a9..d8b2d9f4e975 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -185,7 +185,8 @@ public: unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; + const TargetRegisterClass *RC, + const MachineFunction &MF) const; unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index c1a237ea5f51..b200c153df0b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -94,12 +94,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + LiveIntervals *LIS; DenseMap<const MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; SmallVector<const MachineInstr *, 2> ExecExports; SmallVector<MachineInstr *, 1> LiveMaskQueries; + void markInstruction(MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); @@ -126,6 +129,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); 
AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -135,8 +139,11 @@ public: char SIWholeQuadMode::ID = 0; -INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, - "SI Whole Quad Mode", false, false) +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; @@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } +void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist) { + InstrInfo &II = Instructions[&MI]; + + assert(Flag == StateWQM || Flag == StateExact); + + // Ignore if the instruction is already marked. The typical case is that we + // mark an instruction WQM multiple times, but for atomics it can happen that + // Flag is StateWQM, but Needs is already set to StateExact. In this case, + // letting the atomic run in StateExact is correct as per the relevant specs. + if (II.Needs) + return; + + II.Needs = Flag; + Worklist.push_back(&MI); +} + // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. char SIWholeQuadMode::scanInstructions(MachineFunction &MF, @@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; - } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, continue; } - Instructions[&MI].Needs = Flags; - Worklist.push_back(&MI); + markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; } @@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; - // Control flow-type instructions that are followed by WQM computations - // must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + // Control flow-type instructions and stores to temporary memory that are + // followed by WQM computations must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !II.Needs && + (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } @@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (!Use.isReg() || !Use.isUse()) continue; - // At this point, physical registers appear as inputs or outputs - // and following them makes no sense (and would in fact be incorrect - // when the same VGPR is used as both an output and an input that leads - // to a NeedsWQM instruction). - // - // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we - // have to trace this, in practice it happens for 64-bit computations like - // pointers where both dwords are followed already anyway. - if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) - continue; - - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { - InstrInfo &DefII = Instructions[&DefMI]; + unsigned Reg = Use.getReg(); - // Obviously skip if DefMI is already flagged as NeedWQM. - // - // The instruction might also be flagged as NeedExact. 
This happens when - // the result of an atomic is used in a WQM computation. In this case, - // the atomic must not run for helper pixels and the WQM result is - // undefined. - if (DefII.Needs != 0) + // Handle physical registers that we need to track; this is mostly relevant + // for VCC, which can appear as the (implicit) input of a uniform branch, + // e.g. when a loop counter is stored in a VGPR. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Reg == AMDGPU::EXEC) continue; - DefII.Needs = StateWQM; - Worklist.push_back(&DefMI); + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) + continue; + + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. + if (Value->isPHIDef()) + continue; + + markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + Worklist); + } + + continue; } + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, StateWQM, Worklist); } } @@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); char GlobalFlags = analyzeFunction(MF); if (!(GlobalFlags & StateWQM)) { diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index d6e7caf98a80..3cfcb1e09f0b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; - if (isSaturatingConditional(Op, SatValue, SatConstant)) + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant)) return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index 060376b0a273..c9735f3ec277 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), def SSAT : AI<(outs GPRnopc:$Rd), (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd), def SSAT16 : AI<(outs GPRnopc:$Rd), (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; @@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd), def USAT : AI<(outs GPRnopc:$Rd), (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]> { bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd), def USAT16 : AI<(outs GPRnopc:$Rd), 
(ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index 55e5308be40e..fe699b284882 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2240,7 +2240,8 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin, def t2SSAT: T2SatI< (outs rGPR:$Rd), (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI< def t2USAT: T2SatI< (outs rGPR:$Rd), (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; @@ -2275,7 +2277,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cdad7ce1b73a..20c5f3691d23 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return true; return false; + case ELF::R_MIPS_GOT_PAGE: + case ELF::R_MICROMIPS_GOT_PAGE: + case ELF::R_MIPS_GOT_OFST: + case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: @@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MIPS_SHIFT5: case ELF::R_MIPS_SHIFT6: case ELF::R_MIPS_GOT_DISP: - case ELF::R_MIPS_GOT_PAGE: - case ELF::R_MIPS_GOT_OFST: case ELF::R_MIPS_GOT_HI16: case ELF::R_MIPS_GOT_LO16: case ELF::R_MIPS_INSERT_A: @@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MICROMIPS_PC16_S1: case ELF::R_MICROMIPS_CALL16: case ELF::R_MICROMIPS_GOT_DISP: - case ELF::R_MICROMIPS_GOT_PAGE: - case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MICROMIPS_GOT_HI16: case ELF::R_MICROMIPS_GOT_LO16: case ELF::R_MICROMIPS_SUB: diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 1622b2212665..1ce8f07092b1 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { PointerSize = CalleeSaveStackSlotSize = 8; } + // FIXME: This condition isn't quite 
right but it's the best we can do until + // this object can identify the ABI. It will misbehave when using O32 + // on a mips64*-* triple. + if ((TheTriple.getArch() == Triple::mipsel) || + (TheTriple.getArch() == Triple::mips)) { + PrivateGlobalPrefix = "$"; + PrivateLabelPrefix = "$"; + } + AlignmentIsInBytes = false; Data16bitsDirective = "\t.2byte\t"; Data32bitsDirective = "\t.4byte\t"; Data64bitsDirective = "\t.8byte\t"; - PrivateGlobalPrefix = "$"; - PrivateLabelPrefix = "$"; CommentString = "#"; ZeroDirective = "\t.space\t"; GPRel32Directive = "\t.gpword\t"; diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index c248c3a50ac8..80641ed9bd31 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, else Ret += "E"; - Ret += "-m:m"; + if (ABI.IsO32()) + Ret += "-m:m"; + else + Ret += "-m:e"; // Pointers are 32 bit on some ABIs. if (!ABI.IsN64()) diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index e54711195900..2c548384f1cb 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, @@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { + if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); @@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); + if (SVT.getVectorElementType() == MVT::i1) { + if (SVT == MVT::v2i1) + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); + } + switch (SVT.SimpleTy) { default: llvm_unreachable("Custom 
UINT_TO_FP is not supported!"); diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 1672b3855b79..5f0aab9ddc68 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const { + MachineOperand &ImplicitOp, + LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { @@ -2715,13 +2716,17 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. isKill = true; isUndef = false; + + if (LV) + LV->replaceKillInstruction(SrcReg, MI, *Copy); } // We've set all the parameters without issue. @@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); @@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) + SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = @@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, 
isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 858f35d1cbf0..a8a9f629fc1d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -230,7 +230,7 @@ public: bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const; + MachineOperand &ImplicitOp, LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 9a98f5cac2ee..f91764a67d1c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, 
Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), @@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index fff544085414..787f4342831d 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -332,6 +332,7 @@ struct ArgumentUsesTracker : public CaptureTracker { namespace llvm { template <> struct GraphTraits<ArgumentGraphNode *> { typedef ArgumentGraphNode NodeType; + typedef ArgumentGraphNode *NodeRef; typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; static inline NodeType *getEntryNode(NodeType *A) { return A; } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 310c29275faf..99b12d4db0d0 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -44,6 +44,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -779,7 +780,8 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, // Instructions could multiply use V. 
while (UI != E && *UI == I) ++UI; - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); } } diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index cf5b76dc365b..df6a48e05d42 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -134,6 +134,10 @@ static cl::opt<int> PreInlineThreshold( cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); +static cl::opt<bool> EnableGVNHoist( + "enable-gvn-hoist", cl::init(false), cl::Hidden, + cl::desc("Enable the experimental GVN Hoisting pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -232,7 +236,8 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - FPM.add(createGVNHoistPass()); + if(EnableGVNHoist) + FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index d7eed790e2ab..8f1ff8ac0e66 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -553,8 +553,11 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring + // decomposeBitTestICmp() might help. { - unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType()); + unsigned BitWidth = + DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); APInt MinSignedValue = APInt::getSignBit(BitWidth); Value *X; const APInt *Y, *C; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 51c3262b5d14..377ccb9c37f7 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2830,7 +2830,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -2851,7 +2852,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -3007,7 +3009,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; - Inst->eraseFromParent(); + if (isInstructionTriviallyDead(Inst, TLI)) + Inst->eraseFromParent(); continue; } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index dcb62d3ed1b5..41041c78db97 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -272,8 +272,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) { return false; } - // Check if the global is in a GCOV counter array. - if (GV->getName().startswith("__llvm_gcov_ctr")) + // Check if the global is private gcov data. 
+ if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) return false; } diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 88172d19fe5a..9e982194bac7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" @@ -90,11 +91,13 @@ bool ConstantPropagation::runOnFunction(Function &F) { // Remove the dead instruction. WorkList.erase(I); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) { + I->eraseFromParent(); + ++NumInstKilled; + } // We made a change to the function... Changed = true; - ++NumInstKilled; } } return Changed; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 9d0ef42e0396..0b16e2703dc4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -582,6 +582,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // its simpler value. if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + bool Killed = false; if (!Inst->use_empty()) { Inst->replaceAllUsesWith(V); Changed = true; @@ -589,11 +590,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (isInstructionTriviallyDead(Inst, &TLI)) { Inst->eraseFromParent(); Changed = true; + Killed = true; } - if (Changed) { + if (Changed) ++NumSimplify; + if (Killed) continue; - } } // If this is a simple instruction that we can value number, process it. diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 542cf38e43bb..e958563e2d10 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -815,6 +815,14 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; + // Check that `Cast` actually extends the induction variable (we rely on this + // later). This takes care of cases where `Cast` is extending a truncation of + // the narrow induction variable, and thus can end up being narrower than the + // "narrow" induction variable. + uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); + if (NarrowIVWidth >= Width) + return; + // Cast is either an sext or zext up to this point. // We should not widen an indvar if arithmetics on the wider indvar are more // expensive than those on the narrower indvar. 
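A recurring pattern across the Transforms changes in this import (GlobalOpt, InstructionCombining, and ConstantProp above; JumpThreading and LICM below) is that a constant-folded instruction is now erased only when it is trivially dead, so folded calls with side effects are kept. The following is a minimal, self-contained sketch of that guarded pattern — it is not taken verbatim from any one of these files, and the helper name is illustrative:

  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/IR/Constant.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  using namespace llvm;

  // Fold I to a constant if possible. Replace its uses either way, but only
  // erase it when no side effects remain; returns true if the IR changed.
  static bool foldAndMaybeErase(Instruction *I, const DataLayout &DL,
                                const TargetLibraryInfo *TLI) {
    Constant *C = ConstantFoldInstruction(I, DL, TLI);
    if (!C)
      return false;
    I->replaceAllUsesWith(C);            // users now see the folded constant
    if (isInstructionTriviallyDead(I, TLI))
      I->eraseFromParent();              // safe: nothing observable is lost
    return true;
  }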
We check only the cost of ADD diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b9e717cf763e..d1769fc3ebb3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -758,7 +758,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); Condition = SimpleVal; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 2c0a70e44f57..cdd17fc516a8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -377,9 +377,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, &I, I.getModule()->getDataLayout(), TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); - CurAST->deleteValue(&I); I.replaceAllUsesWith(C); - I.eraseFromParent(); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } continue; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 77c77eb7d798..70bd9d3cca95 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -4442,6 +4442,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Determine an input position which will be dominated by the operands and // which will dominate the result. IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); + Rewriter.setInsertPoint(&*IP); // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. @@ -4473,7 +4474,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); } // Expand the ScaledReg portion. @@ -4491,14 +4492,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4508,11 +4509,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. 
if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4524,7 +4525,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4534,7 +4535,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4570,7 +4571,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index c5ca56360fc8..4f1052d81433 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -552,9 +553,39 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // two PHINodes, the iteration over the old PHIs remains valid, and the // mapping will just map us to the new node (which may not even be a PHI // node). + const DataLayout &DL = NewFunc->getParent()->getDataLayout(); + SmallSetVector<const Value *, 8> Worklist; for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) - if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]])) - recursivelySimplifyInstruction(PN); + if (isa<PHINode>(VMap[PHIToResolve[Idx]])) + Worklist.insert(PHIToResolve[Idx]); + + // Note that we must test the size on each iteration, the worklist can grow. + for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { + const Value *OrigV = Worklist[Idx]; + auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV)); + if (!I) + continue; + + // See if this instruction simplifies. + Value *SimpleV = SimplifyInstruction(I, DL); + if (!SimpleV) + continue; + + // Stash away all the uses of the old instruction so we can check them for + // recursive simplifications after a RAUW. This is cheaper than checking all + // uses of To on the recursive step in most cases. + for (const User *U : OrigV->users()) + Worklist.insert(cast<Instruction>(U)); + + // Replace the instruction with its simplified value. 
+ I->replaceAllUsesWith(SimpleV); + + // If the original instruction had no side effects, remove it. + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + else + VMap[OrigV] = I; + } // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index 1fbb19d2b8ad..e82c07fd7b59 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1294,6 +1294,13 @@ updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI ) { + return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, @@ -1328,7 +1335,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast<AllocaInst>(BI)) - if (isa<Constant>(AI->getArraySize())) + if (allocaWouldBeStaticInEntry(AI)) continue; BI->setDebugLoc(TheCallDL); @@ -1626,7 +1633,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; } - if (!isa<Constant>(AI->getArraySize())) + if (!allocaWouldBeStaticInEntry(AI)) continue; // Keep track of the static allocas that we inline into the caller. @@ -1635,7 +1642,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. while (isa<AllocaInst>(I) && - isa<Constant>(cast<AllocaInst>(I)->getArraySize())) { + allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) { IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; } diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9658966779b9..0d5a25b8ebc5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -64,6 +64,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, DominatorTree &DT, LoopInfo &LI) { SmallVector<Use *, 16> UsesToRewrite; SmallVector<BasicBlock *, 8> ExitBlocks; + SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; bool Changed = false; @@ -115,7 +116,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, SmallVector<PHINode *, 16> AddedPHIs; SmallVector<PHINode *, 8> PostProcessPHIs; - SSAUpdater SSAUpdate; + SmallVector<PHINode *, 4> InsertedPHIs; + SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the @@ -184,6 +186,14 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UseToRewrite); + + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need + // to post-process them to keep LCSSA form. 
+ for (PHINode *InsertedPN : InsertedPHIs) { + if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent())) + if (!L->contains(OtherLoop)) + PostProcessPHIs.push_back(InsertedPN); + } } // Post process PHI instructions that were inserted into another disjoint @@ -196,13 +206,19 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, Worklist.push_back(PostProcessPN); } - // Remove PHI nodes that did not have any uses rewritten. + // Keep track of PHI nodes that we want to remove because they did not have + // any uses rewritten. for (PHINode *PN : AddedPHIs) if (PN->use_empty()) - PN->eraseFromParent(); + PHIsToRemove.insert(PN); Changed = true; } + // Remove PHI nodes that did not have any uses rewritten. + for (PHINode *PN : PHIsToRemove) { + assert (PN->use_empty() && "Trying to remove a phi with uses."); + PN->eraseFromParent(); + } return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index b3a928bf7753..2846e8f235b7 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -327,6 +327,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, else NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I)); + SmallVector<BasicBlock *, 8> OuterLoopBlocks; + OuterLoopBlocks.push_back(NewBB); // Now that we know which blocks are in L and which need to be moved to // OuterLoop, move any blocks that need it. for (unsigned i = 0; i != L->getBlocks().size(); ++i) { @@ -334,12 +336,53 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (!BlocksInL.count(BB)) { // Move this block to the parent, updating the exit blocks sets L->removeBlockFromLoop(BB); - if ((*LI)[BB] == L) + if ((*LI)[BB] == L) { LI->changeLoopFor(BB, NewOuter); + OuterLoopBlocks.push_back(BB); + } --i; } } + // Split edges to exit blocks from the inner loop, if they emerged in the + // process of separating the outer one. + SmallVector<BasicBlock *, 8> ExitBlocks; + L->getExitBlocks(ExitBlocks); + SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + for (BasicBlock *ExitBlock : ExitBlockSet) { + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + } + } + + if (PreserveLCSSA) { + // Fix LCSSA form for L. Some values, which previously were only used inside + // L, can now be used in NewOuter loop. We need to insert phi-nodes for them + // in corresponding exit blocks. + + // Go through all instructions in OuterLoopBlocks and check if they are + // using operands from the inner loop. In this case we'll need to fix LCSSA + // for these instructions. 
+ SmallSetVector<Instruction *, 8> WorklistSet; + for (BasicBlock *OuterBB: OuterLoopBlocks) { + for (Instruction &I : *OuterBB) { + for (Value *Op : I.operands()) { + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || !L->contains(OpI)) + continue; + WorklistSet.insert(OpI); + } + } + } + SmallVector<Instruction *, 8> Worklist(WorklistSet.begin(), + WorklistSet.end()); + formLCSSAForInstructions(Worklist, *DT, *LI); + assert(NewOuter->isRecursivelyLCSSAForm(*DT) && + "LCSSA is broken after separating nested loops!"); + } + return NewOuter; } @@ -541,17 +584,12 @@ ReprocessLoop: SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); for (BasicBlock *ExitBlock : ExitBlockSet) { - for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); - PI != PE; ++PI) - // Must be exactly this loop: no subloops, parent loops, or non-loop preds - // allowed. - if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { - ++NumInserted; - Changed = true; - } - break; - } + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + ++NumInserted; + Changed = true; + } } // If the header has more than two predecessors at this point (from the diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8b85e320d3b2..ee5733d20f4f 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -220,6 +221,81 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +// A traits type that is intended to be used in graph algorithms. The graph it +// models starts at the loop header, and traverses the BasicBlocks that are in +// the loop body, but not the loop header. Since the loop header is skipped, +// the back edges are excluded. +struct LoopBodyTraits { + using NodeRef = std::pair<const Loop *, BasicBlock *>; + + // This wraps a const Loop * into the iterator, so we know which edges to + // filter out. 
+ class WrappedSuccIterator + : public iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits<succ_iterator>::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + using BaseT = iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits<succ_iterator>::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + + const Loop *L; + + public: + WrappedSuccIterator(succ_iterator Begin, const Loop *L) + : BaseT(Begin), L(L) {} + + NodeRef operator*() const { return {L, *I}; } + }; + + struct LoopBodyFilter { + bool operator()(NodeRef N) const { + const Loop *L = N.first; + return N.second != L->getHeader() && L->contains(N.second); + } + }; + + using ChildIteratorType = + filter_iterator<WrappedSuccIterator, LoopBodyFilter>; + + static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } + + static ChildIteratorType child_begin(NodeRef Node) { + return make_filter_range(make_range<WrappedSuccIterator>( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .begin(); + } + + static ChildIteratorType child_end(NodeRef Node) { + return make_filter_range(make_range<WrappedSuccIterator>( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .end(); + } +}; + +/// Returns true if the given loop body has a cycle, excluding the loop +/// itself. +static bool hasCyclesInLoopBody(const Loop &L) { + if (!L.empty()) + return true; + + for (const auto SCC : + make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L), + scc_iterator<Loop, LoopBodyTraits>::end(L))) { + if (SCC.size() > 1) { + DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n"); + DEBUG(L.dump()); + return true; + } + } + return false; +} + /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. class VectorizationReport : public LoopAccessReport { @@ -1782,12 +1858,14 @@ private: Instruction *UnsafeAlgebraInst; }; -static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { - if (L.empty()) - return V.push_back(&L); - +static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { + if (L.empty()) { + if (!hasCyclesInLoopBody(L)) + V.push_back(&L); + return; + } for (Loop *InnerL : L) - addInnerLoop(*InnerL, V); + addAcyclicInnerLoop(*InnerL, V); } /// The LoopVectorize Pass. @@ -4395,6 +4473,9 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // FIXME: The code is currently dead, since the loop gets sent to + // LoopVectorizationLegality is already an innermost loop. + // // We can only vectorize innermost loops. if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); @@ -6639,7 +6720,7 @@ bool LoopVectorizePass::runImpl( SmallVector<Loop *, 8> Worklist; for (Loop *L : *LI) - addInnerLoop(*L, Worklist); + addAcyclicInnerLoop(*L, Worklist); LoopsAnalyzed += Worklist.size(); |
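For reference on the amdgcn_fdiv_fast lowering added earlier in this diff (lowerFDIV_FAST in SIISelLowering.cpp): the emitted DAG scales the denominator by 2^-32 whenever its magnitude exceeds the 0x6f800000-encoded threshold so that RCP stays in its accurate range, then rescales the quotient. A scalar C++ sketch of that arithmetic follows; using 1.0f/x as a stand-in for the hardware RCP instruction is an assumption made for illustration (the real instruction is an approximation that flushes denormals):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  static float bitsToFloat(uint32_t Bits) {
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  }

  // Scalar model of the 2.5 ULP sequence built by lowerFDIV_FAST: scale the
  // denominator into RCP's accurate range, take the reciprocal, multiply by
  // the numerator, then undo the scaling.
  float fdivFastModel(float LHS, float RHS) {
    const float K0 = bitsToFloat(0x6f800000u); // ~7.9e28: above this, scale down
    const float K1 = bitsToFloat(0x2f800000u); // 2^-32: the down-scale factor
    float Scale = (std::fabs(RHS) > K0) ? K1 : 1.0f;
    float Rcp = 1.0f / (RHS * Scale);          // stands in for AMDGPUISD::RCP
    return Scale * (LHS * Rcp);
  }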