62 files changed, 1772 insertions, 469 deletions
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 82c30d39afd6..df7c951743c9 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -295,7 +295,7 @@ public:    }    /// Should we be emitting segmented stack stuff for the function -  bool shouldSplitStack(); +  bool shouldSplitStack() const;    /// getNumBlockIDs - Return the number of MBB ID's allocated.    /// diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 23816bde07c0..536fc656e8e2 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -369,6 +369,18 @@ public:      (UnsafeAlgebra << 3) | (NoNaNs << 4) | (NoInfs << 5) |      (NoSignedZeros << 6) | (AllowReciprocal << 7);    } + +  /// Clear any flags in this flag set that aren't also set in Flags. +  void intersectWith(const SDNodeFlags *Flags) { +    NoUnsignedWrap &= Flags->NoUnsignedWrap; +    NoSignedWrap &= Flags->NoSignedWrap; +    Exact &= Flags->Exact; +    UnsafeAlgebra &= Flags->UnsafeAlgebra; +    NoNaNs &= Flags->NoNaNs; +    NoInfs &= Flags->NoInfs; +    NoSignedZeros &= Flags->NoSignedZeros; +    AllowReciprocal &= Flags->AllowReciprocal; +  }  };  /// Represents one node in the SelectionDAG. @@ -682,6 +694,9 @@ public:    /// and directly, but it is not to avoid creating a vtable for this class.    const SDNodeFlags *getFlags() const; +  /// Clear any flags in this node that aren't also set in Flags. +  void intersectFlagsWith(const SDNodeFlags *Flags); +    /// Return the number of values defined/returned by this operator.    unsigned getNumValues() const { return NumValues; } diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 4fa4e7daeab0..fa6469aa0ade 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -346,6 +346,10 @@ public:      return !(isDeclarationForLinker() || isWeakForLinker());    } +  // Returns true if the alignment of the value can be unilaterally +  // increased. +  bool canIncreaseAlignment() const; +    /// This method unlinks 'this' from the containing module, but does not delete    /// it.    virtual void removeFromParent() = 0; diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 911c6f14da0b..3ae01657a2ec 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -331,6 +331,25 @@ unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT,  /// during lowering by the GC infrastructure.  bool callsGCLeafFunction(ImmutableCallSite CS); +//===----------------------------------------------------------------------===// +//  Intrinsic pattern matching +// + +/// Try and match a bitreverse or bswap idiom. +/// +/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added +/// instructions are returned in \c InsertedInsts. They will all have been added +/// to a basic block. +/// +/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where +/// BW is the bitwidth of the integer type). A bswap idiom requires anywhere up +/// to BW / 4 nodes to be searched, so is significantly faster. +/// +/// This function returns true on a successful match or false otherwise. 
+bool recognizeBitReverseOrBSwapIdiom( +    Instruction *I, bool MatchBSwaps, bool MatchBitReversals, +    SmallVectorImpl<Instruction *> &InsertedInsts); +  } // End llvm namespace  #endif diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 410a075aeb98..fc34f49a1255 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -125,8 +125,6 @@ private:    Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);    // Math Library Optimizations -  Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType); -  Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B);    Value *optimizeCos(CallInst *CI, IRBuilder<> &B);    Value *optimizePow(CallInst *CI, IRBuilder<> &B);    Value *optimizeExp2(CallInst *CI, IRBuilder<> &B); diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h index bbe53249a084..b60ab9151ef2 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -93,18 +93,7 @@ public:    /// variable, merge them by appending Next's values to the current    /// list of values.    /// Return true if the merge was successful. -  bool MergeValues(const DebugLocEntry &Next) { -    if (Begin == Next.Begin) { -      auto *Expr = cast_or_null<DIExpression>(Values[0].Expression); -      auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression); -      if (Expr->isBitPiece() && NextExpr->isBitPiece()) { -        addValues(Next.Values); -        End = Next.End; -        return true; -      } -    } -    return false; -  } +  bool MergeValues(const DebugLocEntry &Next);    /// \brief Attempt to merge this DebugLocEntry with Next and return    /// true if the merge was successful. Entries can be merged if they diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index a4fb07eacb3b..ae62b6b19a42 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -805,6 +805,24 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {    return (l1 < r2) && (l2 < r1);  } +/// \brief If this and Next are describing different pieces of the same +/// variable, merge them by appending Next's values to the current +/// list of values. +/// Return true if the merge was successful. +bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) { +  if (Begin == Next.Begin) { +    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression); +    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression); +    if (Expr->isBitPiece() && NextExpr->isBitPiece() && +        !piecesOverlap(Expr, NextExpr)) { +      addValues(Next.Values); +      End = Next.End; +      return true; +    } +  } +  return false; +} +  /// Build the location list for all DBG_VALUEs in the function that  /// describe the same variable.  If the ranges of several independent  /// pieces of the same variable overlap partially, split them up and diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 03e57787307a..c8007a524e70 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1742,8 +1742,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {        // over-aligning global variables that have an explicit section is        // forbidden.        
GlobalVariable *GV; -      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() && -          !GV->hasSection() && GV->getAlignment() < PrefAlign && +      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() && +          GV->getAlignment() < PrefAlign &&            DL->getTypeAllocSize(GV->getType()->getElementType()) >=                MinSize + Offset2)          GV->setAlignment(PrefAlign); @@ -5211,6 +5211,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {    return false;  } +/// Given an OR instruction, check to see if this is a bitreverse +/// idiom. If so, insert the new intrinsic and return true. +static bool makeBitReverse(Instruction &I, const DataLayout &DL, +                           const TargetLowering &TLI) { +  if (!I.getType()->isIntegerTy() || +      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE, +                                    TLI.getValueType(DL, I.getType(), true))) +    return false; + +  SmallVector<Instruction*, 4> Insts; +  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts)) +    return false; +  Instruction *LastInst = Insts.back(); +  I.replaceAllUsesWith(LastInst); +  RecursivelyDeleteTriviallyDeadInstructions(&I); +  return true; +} +  // In this pass we look for GEP and cast instructions that are used  // across basic blocks and rewrite them to improve basic-block-at-a-time  // selection. @@ -5224,8 +5242,19 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {      if (ModifiedDT)        return true;    } -  MadeChange |= dupRetToEnableTailCallOpts(&BB); +  bool MadeBitReverse = true; +  while (TLI && MadeBitReverse) { +    MadeBitReverse = false; +    for (auto &I : reverse(BB)) { +      if (makeBitReverse(I, *DL, *TLI)) { +        MadeBitReverse = MadeChange = true; +        break; +      } +    } +  } +  MadeChange |= dupRetToEnableTailCallOpts(&BB); +      return MadeChange;  } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index ca4bb1c6ad49..f6604f38722a 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -163,7 +163,7 @@ getOrCreateJumpTableInfo(unsigned EntryKind) {  }  /// Should we be emitting segmented stack stuff for the function -bool MachineFunction::shouldSplitStack() { +bool MachineFunction::shouldSplitStack() const {    return getFunction()->hasFnAttribute("split-stack");  } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 96bf914701c6..893871f94485 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -377,22 +377,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,    }  } -/// Add logical or fast math flag values to FoldingSetNodeID value. 
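// Note on the AddNodeIDFlags removal below: with the raw flag bits no longer
// folded into the FoldingSet profile, two binary nodes that differ only in
// their wrap/fast-math flags now CSE to a single node. The safe way to merge
// them is to keep only the flags common to both, which is what the new
// SDNodeFlags::intersectWith / SDNode::intersectFlagsWith hooks do after the
// FindNodeOrInsertPos lookups. A minimal sketch with hypothetical flag sets:
//
//   SDNodeFlags A; A.setNoSignedWrap(true); A.setExact(true);
//   SDNodeFlags B; B.setNoSignedWrap(true);           // Exact left unset
//   A.intersectWith(&B);  // A keeps NoSignedWrap; Exact is cleared.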
-static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode, -                           const SDNodeFlags *Flags) { -  if (!isBinOpWithFlags(Opcode)) -    return; - -  unsigned RawFlags = 0; -  if (Flags) -    RawFlags = Flags->getRawFlags(); -  ID.AddInteger(RawFlags); -} - -static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) { -  AddNodeIDFlags(ID, N->getOpcode(), N->getFlags()); -} -  static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,                            SDVTList VTList, ArrayRef<SDValue> OpList) {    AddNodeIDOpcode(ID, OpC); @@ -528,8 +512,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {    }    } // end switch (N->getOpcode()) -  AddNodeIDFlags(ID, N); -    // Target specific memory nodes could also have address spaces to check.    if (N->isTargetMemoryOpcode())      ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace()); @@ -851,6 +833,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,    AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);    AddNodeIDCustom(ID, N);    SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); +  if (Node) +    if (const SDNodeFlags *Flags = N->getFlags()) +      Node->intersectFlagsWith(Flags);    return Node;  } @@ -869,6 +854,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,    AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);    AddNodeIDCustom(ID, N);    SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); +  if (Node) +    if (const SDNodeFlags *Flags = N->getFlags()) +      Node->intersectFlagsWith(Flags);    return Node;  } @@ -886,6 +874,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,    AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);    AddNodeIDCustom(ID, N);    SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos); +  if (Node) +    if (const SDNodeFlags *Flags = N->getFlags()) +      Node->intersectFlagsWith(Flags);    return Node;  } @@ -3892,10 +3883,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,      SDValue Ops[] = {N1, N2};      FoldingSetNodeID ID;      AddNodeIDNode(ID, Opcode, VTs, Ops); -    AddNodeIDFlags(ID, Opcode, Flags);      void *IP = nullptr; -    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) +    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) { +      if (Flags) +        E->intersectFlagsWith(Flags);        return SDValue(E, 0); +    }      N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); @@ -6249,10 +6242,12 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,    if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {      FoldingSetNodeID ID;      AddNodeIDNode(ID, Opcode, VTList, Ops); -    AddNodeIDFlags(ID, Opcode, Flags);      void *IP = nullptr; -    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) +    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) { +      if (Flags) +        E->intersectFlagsWith(Flags);        return E; +    }    }    return nullptr;  } @@ -6948,6 +6943,11 @@ const SDNodeFlags *SDNode::getFlags() const {    return nullptr;  } +void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) { +  if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) +    FlagsNode->Flags.intersectWith(Flags); +} +  SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {    assert(N->getNumValues() == 1 &&           "Can't unroll a vector with multiple results!"); diff --git a/lib/IR/Globals.cpp 
b/lib/IR/Globals.cpp index 6159f93faf89..a61b62bd9687 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -12,11 +12,12 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/IR/GlobalValue.h"  #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Triple.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/Operator.h" @@ -134,6 +135,47 @@ bool GlobalValue::isDeclaration() const {    return false;  } +bool GlobalValue::canIncreaseAlignment() const { +  // Firstly, can only increase the alignment of a global if it +  // is a strong definition. +  if (!isStrongDefinitionForLinker()) +    return false; + +  // It also has to either not have a section defined, or, not have +  // alignment specified. (If it is assigned a section, the global +  // could be densely packed with other objects in the section, and +  // increasing the alignment could cause padding issues.) +  if (hasSection() && getAlignment() > 0) +    return false; + +  // On ELF platforms, we're further restricted in that we can't +  // increase the alignment of any variable which might be emitted +  // into a shared library, and which is exported. If the main +  // executable accesses a variable found in a shared-lib, the main +  // exe actually allocates memory for and exports the symbol ITSELF, +  // overriding the symbol found in the library. That is, at link +  // time, the observed alignment of the variable is copied into the +  // executable binary. (A COPY relocation is also generated, to copy +  // the initial data from the shadowed variable in the shared-lib +  // into the location in the main binary, before running code.) +  // +  // And thus, even though you might think you are defining the +  // global, and allocating the memory for the global in your object +  // file, and thus should be able to set the alignment arbitrarily, +  // that's not actually true. Doing so can cause an ABI breakage; an +  // executable might have already been built with the previous +  // alignment of the variable, and then assuming an increased +  // alignment will be incorrect. + +  // Conservatively assume ELF if there's no parent pointer. 
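// Illustrative scenario (hypothetical symbol and file names) for the ELF
// restriction described above:
//
//   libfoo.so:  int g_state;          // strong definition, default visibility
//   app (exe):  extern int g_state;   // the executable's link step emits a
//                                     // COPY relocation and allocates its own
//                                     // copy, using the alignment recorded at
//                                     // the exe's link time
//
// If a later rebuild of libfoo.so silently assumed a larger alignment for
// g_state, already-linked executables would still place their copy with the
// old, smaller alignment, so increasing it here cannot be assumed safe.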
+  bool isELF = +      (!Parent || Triple(Parent->getTargetTriple()).isOSBinFormatELF()); +  if (isELF && hasDefaultVisibility() && !hasLocalLinkage()) +    return false; + +  return true; +} +  //===----------------------------------------------------------------------===//  // GlobalVariable Implementation  //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ecfbe9e2280..9b73c5e9d952 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10133,6 +10133,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(    const TargetInstrInfo *TII = Subtarget->getInstrInfo();    MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); +  MachineBasicBlock::iterator MBBI = Entry->begin();    for (const MCPhysReg *I = IStart; *I; ++I) {      const TargetRegisterClass *RC = nullptr;      if (AArch64::GPR64RegClass.contains(*I)) @@ -10152,13 +10153,13 @@ void AArch64TargetLowering::insertCopiesSplitCSR(                 Attribute::NoUnwind) &&             "Function should be nounwind in insertCopiesSplitCSR!");      Entry->addLiveIn(*I); -    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -            NewVR) +    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)          .addReg(*I); +    // Insert the copy-back instructions right before the terminator.      for (auto *Exit : Exits) -      BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -              *I) +      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), +              TII->get(TargetOpcode::COPY), *I)            .addReg(NewVR);    }  } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index d26604f5765d..685907a2178e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -112,9 +112,21 @@ public:      MCELFStreamer::EmitInstruction(Inst, STI);    } +  /// Emit a 32-bit value as an instruction. This is only used for the .inst +  /// directive, EmitInstruction should be used in other cases.    void emitInst(uint32_t Inst) { +    char Buffer[4]; + +    // We can't just use EmitIntValue here, as that will emit a data mapping +    // symbol, and swap the endianness on big-endian systems (instructions are +    // always little-endian). 
+    for (unsigned I = 0; I < 4; ++I) { +      Buffer[I] = uint8_t(Inst); +      Inst >>= 8; +    } +      EmitA64MappingSymbol(); -    MCELFStreamer::EmitIntValue(Inst, 4); +    MCELFStreamer::EmitBytes(StringRef(Buffer, 4));    }    /// This is one of the functions used to emit data into an ELF section, so the diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 37c0795af283..978e99cf511e 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12423,6 +12423,7 @@ void ARMTargetLowering::insertCopiesSplitCSR(    const TargetInstrInfo *TII = Subtarget->getInstrInfo();    MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); +  MachineBasicBlock::iterator MBBI = Entry->begin();    for (const MCPhysReg *I = IStart; *I; ++I) {      const TargetRegisterClass *RC = nullptr;      if (ARM::GPRRegClass.contains(*I)) @@ -12442,13 +12443,13 @@ void ARMTargetLowering::insertCopiesSplitCSR(                 Attribute::NoUnwind) &&             "Function should be nounwind in insertCopiesSplitCSR!");      Entry->addLiveIn(*I); -    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -            NewVR) +    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)          .addReg(*I); +    // Insert the copy-back instructions right before the terminator.      for (auto *Exit : Exits) -      BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -              *I) +      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), +              TII->get(TargetOpcode::COPY), *I)            .addReg(NewVR);    }  } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index e8b96e74a7af..ed2e88067168 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -832,10 +832,10 @@ def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,                                               R8, R9, R10, R11)>;  // CSRs that are handled by prologue, epilogue. -def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;  // CSRs that are handled explicitly via copies. -def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;  // All GPRs - except r11  def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8632bb8254f9..7f8ce4768c00 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -2031,6 +2031,10 @@ void X86FrameLowering::adjustForSegmentedStacks(    unsigned TlsReg, TlsOffset;    DebugLoc DL; +  // To support shrink-wrapping we would need to insert the new blocks +  // at the right place and update the branches to PrologueMBB. +  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); +    unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&           "Scratch register is live-in"); @@ -2271,6 +2275,11 @@ void X86FrameLowering::adjustForHiPEPrologue(      MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {    MachineFrameInfo *MFI = MF.getFrameInfo();    DebugLoc DL; + +  // To support shrink-wrapping we would need to insert the new blocks +  // at the right place and update the branches to PrologueMBB. 
+  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); +    // HiPE-specific values    const unsigned HipeLeafWords = 24;    const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; @@ -2584,7 +2593,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {  bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {    // If we may need to emit frameless compact unwind information, give    // up as this is currently broken: PR25614. -  return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +  return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && +         // The lowering of segmented stack and HiPE only support entry blocks +         // as prologue blocks: PR26107. +         // This limitation may be lifted if we fix: +         // - adjustForSegmentedStacks +         // - adjustForHiPEPrologue +         MF.getFunction()->getCallingConv() != CallingConv::HiPE && +         !MF.shouldSplitStack();  }  MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b723059f091d..6904714ec781 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -28908,6 +28908,7 @@ void X86TargetLowering::insertCopiesSplitCSR(    const TargetInstrInfo *TII = Subtarget->getInstrInfo();    MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); +  MachineBasicBlock::iterator MBBI = Entry->begin();    for (const MCPhysReg *I = IStart; *I; ++I) {      const TargetRegisterClass *RC = nullptr;      if (X86::GR64RegClass.contains(*I)) @@ -28925,13 +28926,13 @@ void X86TargetLowering::insertCopiesSplitCSR(                 Attribute::NoUnwind) &&             "Function should be nounwind in insertCopiesSplitCSR!");      Entry->addLiveIn(*I); -    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -            NewVR) +    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)          .addReg(*I); +    // Insert the copy-back instructions right before the terminator.      for (auto *Exit : Exits) -      BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), -              *I) +      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), +              TII->get(TargetOpcode::COPY), *I)            .addReg(NewVR);    }  } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 95c50d32c820..76cefd97cd8f 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -17,6 +17,7 @@  #include "llvm/IR/Intrinsics.h"  #include "llvm/IR/PatternMatch.h"  #include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  using namespace PatternMatch; @@ -1565,190 +1566,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {    return Changed ? &I : nullptr;  } - -/// Analyze the specified subexpression and see if it is capable of providing -/// pieces of a bswap or bitreverse. The subexpression provides a potential -/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in -/// the output of the expression came from a corresponding bit in some other -/// value. This function is recursive, and the end result is a mapping of -/// (value, bitnumber) to bitnumber. 
It is the caller's responsibility to -/// validate that all `value`s are identical and that the bitnumber to bitnumber -/// mapping is correct for a bswap or bitreverse. -/// -/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know -/// that the expression deposits the low byte of %X into the high byte of the -/// result and that all other bits are zero. This expression is accepted, -/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7]. -/// -/// This function returns true if the match was unsuccessful and false if so. -/// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bits that the subexpression is later shifted.  For -/// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -16 on entry.  This is used to specify which -/// bits of BitValues are actually being set. -/// -/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding -/// bit is masked to zero by a user.  For example, in (X & 255), X will be -/// processed with a bytemask of 255. BitMask is always in the local -/// (OverallLeftShift) coordinate space. -/// -static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, -                            SmallVectorImpl<Value *> &BitValues, -                            SmallVectorImpl<int> &BitProvenance) { -  if (Instruction *I = dyn_cast<Instruction>(V)) { -    // If this is an or instruction, it may be an inner node of the bswap. -    if (I->getOpcode() == Instruction::Or) -      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, -                             BitValues, BitProvenance) || -             CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, -                             BitValues, BitProvenance); - -    // If this is a logical shift by a constant, recurse with OverallLeftShift -    // and BitMask adjusted. -    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { -      unsigned ShAmt = -          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); -      // Ensure the shift amount is defined. -      if (ShAmt > BitValues.size()) -        return true; - -      unsigned BitShift = ShAmt; -      if (I->getOpcode() == Instruction::Shl) { -        // X << C -> collect(X, +C) -        OverallLeftShift += BitShift; -        BitMask = BitMask.lshr(BitShift); -      } else { -        // X >>u C -> collect(X, -C) -        OverallLeftShift -= BitShift; -        BitMask = BitMask.shl(BitShift); -      } - -      if (OverallLeftShift >= (int)BitValues.size()) -        return true; -      if (OverallLeftShift <= -(int)BitValues.size()) -        return true; - -      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, -                             BitValues, BitProvenance); -    } - -    // If this is a logical 'and' with a mask that clears bits, clear the -    // corresponding bits in BitMask. -    if (I->getOpcode() == Instruction::And && -        isa<ConstantInt>(I->getOperand(1))) { -      unsigned NumBits = BitValues.size(); -      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); -      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - -      for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { -        // If this bit is masked out by a later operation, we don't care what -        // the and mask is. 
-        if (BitMask[i] == 0) -          continue; - -        // If the AndMask is zero for this bit, clear the bit. -        APInt MaskB = AndMask & Bit; -        if (MaskB == 0) { -          BitMask.clearBit(i); -          continue; -        } - -        // Otherwise, this bit is kept. -      } - -      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, -                             BitValues, BitProvenance); -    } -  } - -  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be -  // the input value to the bswap/bitreverse. To be part of a bswap or -  // bitreverse we must be demanding a contiguous range of bits from it. -  unsigned InputBitLen = BitMask.countPopulation(); -  unsigned InputBitNo = BitMask.countTrailingZeros(); -  if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != -      InputBitLen) -    // Not a contiguous set range of bits! -    return true; - -  // We know we're moving a contiguous range of bits from the input to the -  // output. Record which bits in the output came from which bits in the input. -  unsigned DestBitNo = InputBitNo + OverallLeftShift; -  for (unsigned I = 0; I < InputBitLen; ++I) -    BitProvenance[DestBitNo + I] = InputBitNo + I; - -  // If the destination bit value is already defined, the values are or'd -  // together, which isn't a bswap/bitreverse (unless it's an or of the same -  // bits). -  if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) -    return true; -  for (unsigned I = 0; I < InputBitLen; ++I) -    BitValues[DestBitNo + I] = V; - -  return false; -} - -static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, -                                          unsigned BitWidth) { -  if (From % 8 != To % 8) -    return false; -  // Convert from bit indices to byte indices and check for a byte reversal. -  From >>= 3; -  To >>= 3; -  BitWidth >>= 3; -  return From == BitWidth - To - 1; -} - -static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, -                                               unsigned BitWidth) { -  return From == BitWidth - To - 1; -} -  /// Given an OR instruction, check to see if this is a bswap or bitreverse  /// idiom. If so, insert the new intrinsic and return it.  Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { -  IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); -  if (!ITy) -    return nullptr;   // Can't do vectors. -  unsigned BW = ITy->getBitWidth(); -   -  /// We keep track of which bit (BitProvenance) inside which value (BitValues) -  /// defines each bit in the result. -  SmallVector<Value *, 8> BitValues(BW, nullptr); -  SmallVector<int, 8> BitProvenance(BW, -1); -   -  // Try to find all the pieces corresponding to the bswap. -  APInt BitMask = APInt::getAllOnesValue(BitValues.size()); -  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance)) -    return nullptr; - -  // Check to see if all of the bits come from the same value. -  Value *V = BitValues[0]; -  if (!V) return nullptr;  // Didn't find a bit?  Must be zero. - -  if (!std::all_of(BitValues.begin(), BitValues.end(), -                   [&](const Value *X) { return X == V; })) -    return nullptr; - -  // Now, is the bit permutation correct for a bswap or a bitreverse? We can -  // only byteswap values with an even number of bytes. 
-  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;; -  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { -    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); -    OKForBitReverse &= -        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); -  } - -  Intrinsic::ID Intrin; -  if (OKForBSwap) -    Intrin = Intrinsic::bswap; -  else if (OKForBitReverse) -    Intrin = Intrinsic::bitreverse; -  else +  SmallVector<Instruction*, 4> Insts; +  if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))      return nullptr; +  Instruction *LastInst = Insts.pop_back_val(); +  LastInst->removeFromParent(); -  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy); -  return CallInst::Create(F, V); +  for (auto *Inst : Insts) +    Worklist.Add(Inst); +  return LastInst;  }  /// We have an expression of the form (A&C)|(B&D).  Check if A is (cond?-1:0) diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 14574119b9a8..79282a2a703b 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -179,13 +179,244 @@ void LandingPadInliningInfo::forwardResume(    RI->eraseFromParent();  } +/// Helper for getUnwindDestToken/getUnwindDestTokenHelper. +static Value *getParentPad(Value *EHPad) { +  if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad)) +    return FPI->getParentPad(); +  return cast<CatchSwitchInst>(EHPad)->getParentPad(); +} + +typedef DenseMap<Instruction *, Value *> UnwindDestMemoTy; + +/// Helper for getUnwindDestToken that does the descendant-ward part of +/// the search. +static Value *getUnwindDestTokenHelper(Instruction *EHPad, +                                       UnwindDestMemoTy &MemoMap) { +  SmallVector<Instruction *, 8> Worklist(1, EHPad); + +  while (!Worklist.empty()) { +    Instruction *CurrentPad = Worklist.pop_back_val(); +    // We only put pads on the worklist that aren't in the MemoMap.  When +    // we find an unwind dest for a pad we may update its ancestors, but +    // the queue only ever contains uncles/great-uncles/etc. of CurrentPad, +    // so they should never get updated while queued on the worklist. +    assert(!MemoMap.count(CurrentPad)); +    Value *UnwindDestToken = nullptr; +    if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) { +      if (CatchSwitch->hasUnwindDest()) { +        UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); +      } else { +        // Catchswitch doesn't have a 'nounwind' variant, and one might be +        // annotated as "unwinds to caller" when really it's nounwind (see +        // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the +        // parent's unwind dest from this.  We can check its catchpads' +        // descendants, since they might include a cleanuppad with an +        // "unwinds to caller" cleanupret, which can be trusted. 
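// Sketch of the shape being searched for here, in (hypothetical) IR:
//
//   %cs = catchswitch within none [label %handler] unwind to caller
// handler:
//   %cp = catchpad within %cs [ ... ]
//   ...
//   %cleanup = cleanuppad within %cp []
//   ...
//   cleanupret from %cleanup unwind to caller
//
// The catchswitch's "unwind to caller" might merely mean "nounwind", but the
// nested cleanupret's "unwind to caller" can be trusted; since the cleanup is
// nested inside the catch, it shows the catchswitch really does unwind to the
// caller (ConstantTokenNone).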
+        for (auto HI = CatchSwitch->handler_begin(), +                  HE = CatchSwitch->handler_end(); +             HI != HE && !UnwindDestToken; ++HI) { +          BasicBlock *HandlerBlock = *HI; +          auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI()); +          for (User *Child : CatchPad->users()) { +            // Intentionally ignore invokes here -- since the catchswitch is +            // marked "unwind to caller", it would be a verifier error if it +            // contained an invoke which unwinds out of it, so any invoke we'd +            // encounter must unwind to some child of the catch. +            if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child)) +              continue; + +            Instruction *ChildPad = cast<Instruction>(Child); +            auto Memo = MemoMap.find(ChildPad); +            if (Memo == MemoMap.end()) { +              // Haven't figure out this child pad yet; queue it. +              Worklist.push_back(ChildPad); +              continue; +            } +            // We've already checked this child, but might have found that +            // it offers no proof either way. +            Value *ChildUnwindDestToken = Memo->second; +            if (!ChildUnwindDestToken) +              continue; +            // We already know the child's unwind dest, which can either +            // be ConstantTokenNone to indicate unwind to caller, or can +            // be another child of the catchpad.  Only the former indicates +            // the unwind dest of the catchswitch. +            if (isa<ConstantTokenNone>(ChildUnwindDestToken)) { +              UnwindDestToken = ChildUnwindDestToken; +              break; +            } +            assert(getParentPad(ChildUnwindDestToken) == CatchPad); +          } +        } +      } +    } else { +      auto *CleanupPad = cast<CleanupPadInst>(CurrentPad); +      for (User *U : CleanupPad->users()) { +        if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) { +          if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) +            UnwindDestToken = RetUnwindDest->getFirstNonPHI(); +          else +            UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); +          break; +        } +        Value *ChildUnwindDestToken; +        if (auto *Invoke = dyn_cast<InvokeInst>(U)) { +          ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); +        } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) { +          Instruction *ChildPad = cast<Instruction>(U); +          auto Memo = MemoMap.find(ChildPad); +          if (Memo == MemoMap.end()) { +            // Haven't resolved this child yet; queue it and keep searching. +            Worklist.push_back(ChildPad); +            continue; +          } +          // We've checked this child, but still need to ignore it if it +          // had no proof either way. +          ChildUnwindDestToken = Memo->second; +          if (!ChildUnwindDestToken) +            continue; +        } else { +          // Not a relevant user of the cleanuppad +          continue; +        } +        // In a well-formed program, the child/invoke must either unwind to +        // an(other) child of the cleanup, or exit the cleanup.  In the +        // first case, continue searching. 
+        if (isa<Instruction>(ChildUnwindDestToken) && +            getParentPad(ChildUnwindDestToken) == CleanupPad) +          continue; +        UnwindDestToken = ChildUnwindDestToken; +        break; +      } +    } +    // If we haven't found an unwind dest for CurrentPad, we may have queued its +    // children, so move on to the next in the worklist. +    if (!UnwindDestToken) +      continue; + +    // Now we know that CurrentPad unwinds to UnwindDestToken.  It also exits +    // any ancestors of CurrentPad up to but not including UnwindDestToken's +    // parent pad.  Record this in the memo map, and check to see if the +    // original EHPad being queried is one of the ones exited. +    Value *UnwindParent; +    if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken)) +      UnwindParent = getParentPad(UnwindPad); +    else +      UnwindParent = nullptr; +    bool ExitedOriginalPad = false; +    for (Instruction *ExitedPad = CurrentPad; +         ExitedPad && ExitedPad != UnwindParent; +         ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) { +      // Skip over catchpads since they just follow their catchswitches. +      if (isa<CatchPadInst>(ExitedPad)) +        continue; +      MemoMap[ExitedPad] = UnwindDestToken; +      ExitedOriginalPad |= (ExitedPad == EHPad); +    } + +    if (ExitedOriginalPad) +      return UnwindDestToken; + +    // Continue the search. +  } + +  // No definitive information is contained within this funclet. +  return nullptr; +} + +/// Given an EH pad, find where it unwinds.  If it unwinds to an EH pad, +/// return that pad instruction.  If it unwinds to caller, return +/// ConstantTokenNone.  If it does not have a definitive unwind destination, +/// return nullptr. +/// +/// This routine gets invoked for calls in funclets in inlinees when inlining +/// an invoke.  Since many funclets don't have calls inside them, it's queried +/// on-demand rather than building a map of pads to unwind dests up front. +/// Determining a funclet's unwind dest may require recursively searching its +/// descendants, and also ancestors and cousins if the descendants don't provide +/// an answer.  Since most funclets will have their unwind dest immediately +/// available as the unwind dest of a catchswitch or cleanupret, this routine +/// searches top-down from the given pad and then up. To avoid worst-case +/// quadratic run-time given that approach, it uses a memo map to avoid +/// re-processing funclet trees.  The callers that rewrite the IR as they go +/// take advantage of this, for correctness, by checking/forcing rewritten +/// pads' entries to match the original callee view. +static Value *getUnwindDestToken(Instruction *EHPad, +                                 UnwindDestMemoTy &MemoMap) { +  // Catchpads unwind to the same place as their catchswitch; +  // redirct any queries on catchpads so the code below can +  // deal with just catchswitches and cleanuppads. +  if (auto *CPI = dyn_cast<CatchPadInst>(EHPad)) +    EHPad = CPI->getCatchSwitch(); + +  // Check if we've already determined the unwind dest for this pad. +  auto Memo = MemoMap.find(EHPad); +  if (Memo != MemoMap.end()) +    return Memo->second; + +  // Search EHPad and, if necessary, its descendants. 
+  Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap); +  assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0)); +  if (UnwindDestToken) +    return UnwindDestToken; + +  // No information is available for this EHPad from itself or any of its +  // descendants.  An unwind all the way out to a pad in the caller would +  // need also to agree with the unwind dest of the parent funclet, so +  // search up the chain to try to find a funclet with information.  Put +  // null entries in the memo map to avoid re-processing as we go up. +  MemoMap[EHPad] = nullptr; +  Instruction *LastUselessPad = EHPad; +  Value *AncestorToken; +  for (AncestorToken = getParentPad(EHPad); +       auto *AncestorPad = dyn_cast<Instruction>(AncestorToken); +       AncestorToken = getParentPad(AncestorToken)) { +    // Skip over catchpads since they just follow their catchswitches. +    if (isa<CatchPadInst>(AncestorPad)) +      continue; +    assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]); +    auto AncestorMemo = MemoMap.find(AncestorPad); +    if (AncestorMemo == MemoMap.end()) { +      UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap); +    } else { +      UnwindDestToken = AncestorMemo->second; +    } +    if (UnwindDestToken) +      break; +    LastUselessPad = AncestorPad; +  } + +  // Since the whole tree under LastUselessPad has no information, it all must +  // match UnwindDestToken; record that to avoid repeating the search. +  SmallVector<Instruction *, 8> Worklist(1, LastUselessPad); +  while (!Worklist.empty()) { +    Instruction *UselessPad = Worklist.pop_back_val(); +    assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr); +    MemoMap[UselessPad] = UnwindDestToken; +    if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) { +      for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) +        for (User *U : HandlerBlock->getFirstNonPHI()->users()) +          if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U)) +            Worklist.push_back(cast<Instruction>(U)); +    } else { +      assert(isa<CleanupPadInst>(UselessPad)); +      for (User *U : UselessPad->users()) +        if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U)) +          Worklist.push_back(cast<Instruction>(U)); +    } +  } + +  return UnwindDestToken; +} +  /// When we inline a basic block into an invoke,  /// we have to turn all of the calls that can throw into invokes.  /// This function analyze BB to see if there are any calls, and if so,  /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI  /// nodes in that block with the values specified in InvokeDestPHIValues. -static BasicBlock * -HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { +static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( +    BasicBlock *BB, BasicBlock *UnwindEdge, +    UnwindDestMemoTy *FuncletUnwindMap = nullptr) {    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {      Instruction *I = &*BBI++; @@ -196,6 +427,31 @@ HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {      if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))        continue; +    if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) { +      // This call is nested inside a funclet.  If that funclet has an unwind +      // destination within the inlinee, then unwinding out of this call would +      // be UB.  
Rewriting this call to an invoke which targets the inlined +      // invoke's unwind dest would give the call's parent funclet multiple +      // unwind destinations, which is something that subsequent EH table +      // generation can't handle and that the veirifer rejects.  So when we +      // see such a call, leave it as a call. +      auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]); +      Value *UnwindDestToken = +          getUnwindDestToken(FuncletPad, *FuncletUnwindMap); +      if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken)) +        continue; +#ifndef NDEBUG +      Instruction *MemoKey; +      if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad)) +        MemoKey = CatchPad->getCatchSwitch(); +      else +        MemoKey = FuncletPad; +      assert(FuncletUnwindMap->count(MemoKey) && +             (*FuncletUnwindMap)[MemoKey] == UnwindDestToken && +             "must get memoized to avoid confusing later searches"); +#endif // NDEBUG +    } +      // Convert this function call into an invoke instruction.  First, split the      // basic block.      BasicBlock *Split = @@ -328,13 +584,23 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,    // This connects all the instructions which 'unwind to caller' to the invoke    // destination. +  UnwindDestMemoTy FuncletUnwindMap;    for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();         BB != E; ++BB) {      if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {        if (CRI->unwindsToCaller()) { -        CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); +        auto *CleanupPad = CRI->getCleanupPad(); +        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);          CRI->eraseFromParent();          UpdatePHINodes(&*BB); +        // Finding a cleanupret with an unwind destination would confuse +        // subsequent calls to getUnwindDestToken, so map the cleanuppad +        // to short-circuit any such calls and recognize this as an "unwind +        // to caller" cleanup. +        assert(!FuncletUnwindMap.count(CleanupPad) || +               isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad])); +        FuncletUnwindMap[CleanupPad] = +            ConstantTokenNone::get(Caller->getContext());        }      } @@ -345,12 +611,41 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,      Instruction *Replacement = nullptr;      if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {        if (CatchSwitch->unwindsToCaller()) { +        Value *UnwindDestToken; +        if (auto *ParentPad = +                dyn_cast<Instruction>(CatchSwitch->getParentPad())) { +          // This catchswitch is nested inside another funclet.  If that +          // funclet has an unwind destination within the inlinee, then +          // unwinding out of this catchswitch would be UB.  Rewriting this +          // catchswitch to unwind to the inlined invoke's unwind dest would +          // give the parent funclet multiple unwind destinations, which is +          // something that subsequent EH table generation can't handle and +          // that the veirifer rejects.  So when we see such a call, leave it +          // as "unwind to caller". 
+          UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap); +          if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken)) +            continue; +        } else { +          // This catchswitch has no parent to inherit constraints from, and +          // none of its descendants can have an unwind edge that exits it and +          // targets another funclet in the inlinee.  It may or may not have a +          // descendant that definitively has an unwind to caller.  In either +          // case, we'll have to assume that any unwinds out of it may need to +          // be routed to the caller, so treat it as though it has a definitive +          // unwind to caller. +          UnwindDestToken = ConstantTokenNone::get(Caller->getContext()); +        }          auto *NewCatchSwitch = CatchSwitchInst::Create(              CatchSwitch->getParentPad(), UnwindDest,              CatchSwitch->getNumHandlers(), CatchSwitch->getName(),              CatchSwitch);          for (BasicBlock *PadBB : CatchSwitch->handlers())            NewCatchSwitch->addHandler(PadBB); +        // Propagate info for the old catchswitch over to the new one in +        // the unwind map.  This also serves to short-circuit any subsequent +        // checks for the unwind dest of this catchswitch, which would get +        // confused if they found the outer handler in the callee. +        FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;          Replacement = NewCatchSwitch;        }      } else if (!isa<FuncletPadInst>(I)) { @@ -369,8 +664,8 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,      for (Function::iterator BB = FirstNewBlock->getIterator(),                              E = Caller->end();           BB != E; ++BB) -      if (BasicBlock *NewBB = -              HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) +      if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( +              &*BB, UnwindDest, &FuncletUnwindMap))          // Update any PHI nodes in the exceptional block to indicate that there          // is now a new entry in them.          UpdatePHINodes(NewBB); @@ -1415,6 +1710,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      }    } +  // If we are inlining for an invoke instruction, we must make sure to rewrite +  // any call instructions into invoke instructions.  This is sensitive to which +  // funclet pads were top-level in the inlinee, so must be done before +  // rewriting the "parent pad" links. +  if (auto *II = dyn_cast<InvokeInst>(TheCall)) { +    BasicBlock *UnwindDest = II->getUnwindDest(); +    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); +    if (isa<LandingPadInst>(FirstNonPHI)) { +      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); +    } else { +      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); +    } +  } +    // Update the lexical scopes of the new funclets and callsites.    // Anything that had 'none' as its parent is now nested inside the callsite's    // EHPad. @@ -1472,18 +1781,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      }    } -  // If we are inlining for an invoke instruction, we must make sure to rewrite -  // any call instructions into invoke instructions. 
-  if (auto *II = dyn_cast<InvokeInst>(TheCall)) { -    BasicBlock *UnwindDest = II->getUnwindDest(); -    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); -    if (isa<LandingPadInst>(FirstNonPHI)) { -      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); -    } else { -      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); -    } -  } -    // Handle any inlined musttail call sites.  In order for a new call site to be    // musttail, the source of the clone and the inlined call site must have been    // musttail.  Therefore it's safe to return without merging control into the diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index d2793e5ecb5b..abc9b65f7a39 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -944,37 +944,44 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {  static unsigned enforceKnownAlignment(Value *V, unsigned Align,                                        unsigned PrefAlign,                                        const DataLayout &DL) { +  assert(PrefAlign > Align); +    V = V->stripPointerCasts();    if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { +    // TODO: ideally, computeKnownBits ought to have used +    // AllocaInst::getAlignment() in its computation already, making +    // the below max redundant. But, as it turns out, +    // stripPointerCasts recurses through infinite layers of bitcasts, +    // while computeKnownBits is not allowed to traverse more than 6 +    // levels. +    Align = std::max(AI->getAlignment(), Align); +    if (PrefAlign <= Align) +      return Align; +      // If the preferred alignment is greater than the natural stack alignment      // then don't round up. This avoids dynamic stack realignment.      if (DL.exceedsNaturalStackAlignment(PrefAlign))        return Align; -    // If there is a requested alignment and if this is an alloca, round up. -    if (AI->getAlignment() >= PrefAlign) -      return AI->getAlignment();      AI->setAlignment(PrefAlign);      return PrefAlign;    }    if (auto *GO = dyn_cast<GlobalObject>(V)) { +    // TODO: as above, this shouldn't be necessary. +    Align = std::max(GO->getAlignment(), Align); +    if (PrefAlign <= Align) +      return Align; +      // If there is a large requested alignment and we can, bump up the alignment      // of the global.  If the memory we set aside for the global may not be the      // memory used by the final program then it is impossible for us to reliably      // enforce the preferred alignment. -    if (!GO->isStrongDefinitionForLinker()) +    if (!GO->canIncreaseAlignment())        return Align; -    if (GO->getAlignment() >= PrefAlign) -      return GO->getAlignment(); -    // We can only increase the alignment of the global if it has no alignment -    // specified or if it is not assigned a section.  If it is assigned a -    // section, the global could be densely packed with other objects in the -    // section, increasing the alignment could cause padding issues. -    if (!GO->hasSection() || GO->getAlignment() == 0) -      GO->setAlignment(PrefAlign); -    return GO->getAlignment(); +    GO->setAlignment(PrefAlign); +    return PrefAlign;    }    return Align; @@ -1585,3 +1592,205 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {    return false;  } + +/// A potential constituent of a bitreverse or bswap expression. See +/// collectBitParts for a fuller explanation. 
+struct BitPart { +  BitPart(Value *P, unsigned BW) : Provider(P) { +    Provenance.resize(BW); +  } + +  /// The Value that this is a bitreverse/bswap of. +  Value *Provider; +  /// The "provenance" of each bit. Provenance[A] = B means that bit A +  /// in Provider becomes bit B in the result of this expression. +  SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128. + +  enum { Unset = -1 }; +}; + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// bitnumber to bitnumber. It is the caller's responsibility to validate that +/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted and a +/// BitPart is returned with Provider set to %X and Provenance[24-31] set to +/// [0-7]. +/// +/// To avoid revisiting values, the BitPart results are memoized into the +/// provided map. To avoid unnecessary copying of BitParts, BitParts are +/// constructed in-place in the \c BPS map. Because of this \c BPS needs to +/// store BitParts objects, not pointers. As we need the concept of a nullptr +/// BitParts (Value has been analyzed and the analysis failed), we an Optional +/// type instead to provide the same functionality. +/// +/// Because we pass around references into \c BPS, we must use a container that +/// does not invalidate internal references (std::map instead of DenseMap). +/// +static const Optional<BitPart> & +collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, +                std::map<Value *, Optional<BitPart>> &BPS) { +  auto I = BPS.find(V); +  if (I != BPS.end()) +    return I->second; + +  auto &Result = BPS[V] = None; +  auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth(); + +  if (Instruction *I = dyn_cast<Instruction>(V)) { +    // If this is an or instruction, it may be an inner node of the bswap. +    if (I->getOpcode() == Instruction::Or) { +      auto &A = collectBitParts(I->getOperand(0), MatchBSwaps, +                                MatchBitReversals, BPS); +      auto &B = collectBitParts(I->getOperand(1), MatchBSwaps, +                                MatchBitReversals, BPS); +      if (!A || !B) +        return Result; + +      // Try and merge the two together. +      if (!A->Provider || A->Provider != B->Provider) +        return Result; + +      Result = BitPart(A->Provider, BitWidth); +      for (unsigned i = 0; i < A->Provenance.size(); ++i) { +        if (A->Provenance[i] != BitPart::Unset && +            B->Provenance[i] != BitPart::Unset && +            A->Provenance[i] != B->Provenance[i]) +          return Result = None; + +        if (A->Provenance[i] == BitPart::Unset) +          Result->Provenance[i] = B->Provenance[i]; +        else +          Result->Provenance[i] = A->Provenance[i]; +      } + +      return Result; +    } + +    // If this is a logical shift by a constant, recurse then shift the result. 
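// Worked example of the provenance shift performed below (hypothetical
// values): for `shl i32 %X, 8`, an operand provenance of [0, 1, ..., 31]
// becomes [Unset x 8, 0, 1, ..., 23], i.e. result bit 8 now traces back to
// bit 0 of %X and the low 8 result bits are known zero; lshr performs the
// mirror-image transformation.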
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { +      unsigned BitShift = +          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); +      // Ensure the shift amount is defined. +      if (BitShift > BitWidth) +        return Result; + +      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, +                                  MatchBitReversals, BPS); +      if (!Res) +        return Result; +      Result = Res; + +      // Perform the "shift" on BitProvenance. +      auto &P = Result->Provenance; +      if (I->getOpcode() == Instruction::Shl) { +        P.erase(std::prev(P.end(), BitShift), P.end()); +        P.insert(P.begin(), BitShift, BitPart::Unset); +      } else { +        P.erase(P.begin(), std::next(P.begin(), BitShift)); +        P.insert(P.end(), BitShift, BitPart::Unset); +      } + +      return Result; +    } + +    // If this is a logical 'and' with a mask that clears bits, recurse then +    // unset the appropriate bits. +    if (I->getOpcode() == Instruction::And && +        isa<ConstantInt>(I->getOperand(1))) { +      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); +      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); + +      // Check that the mask allows a multiple of 8 bits for a bswap, for an +      // early exit. +      unsigned NumMaskedBits = AndMask.countPopulation(); +      if (!MatchBitReversals && NumMaskedBits % 8 != 0) +        return Result; +       +      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, +                                  MatchBitReversals, BPS); +      if (!Res) +        return Result; +      Result = Res; + +      for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1) +        // If the AndMask is zero for this bit, clear the bit. +        if ((AndMask & Bit) == 0) +          Result->Provenance[i] = BitPart::Unset; + +      return Result; +    } +  } + +  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be +  // the input value to the bswap/bitreverse. +  Result = BitPart(V, BitWidth); +  for (unsigned i = 0; i < BitWidth; ++i) +    Result->Provenance[i] = i; +  return Result; +} + +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, +                                          unsigned BitWidth) { +  if (From % 8 != To % 8) +    return false; +  // Convert from bit indices to byte indices and check for a byte reversal. +  From >>= 3; +  To >>= 3; +  BitWidth >>= 3; +  return From == BitWidth - To - 1; +} + +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, +                                               unsigned BitWidth) { +  return From == BitWidth - To - 1; +} + +/// Given an OR instruction, check to see if this is a bitreverse +/// idiom. If so, insert the new intrinsic and return true. +bool llvm::recognizeBitReverseOrBSwapIdiom( +    Instruction *I, bool MatchBSwaps, bool MatchBitReversals, +    SmallVectorImpl<Instruction *> &InsertedInsts) { +  if (Operator::getOpcode(I) != Instruction::Or) +    return false; +  if (!MatchBSwaps && !MatchBitReversals) +    return false; +  IntegerType *ITy = dyn_cast<IntegerType>(I->getType()); +  if (!ITy || ITy->getBitWidth() > 128) +    return false;   // Can't do vectors or integers > 128 bits. +  unsigned BW = ITy->getBitWidth(); + +  // Try to find all the pieces corresponding to the bswap. 
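+  // The BPS map memoizes the BitPart computed for every sub-value, so each node
+  // in the expression tree is analyzed at most once even when operands are
+  // shared (see the bitreverse-hang tests added below for the pathological case
+  // this guards against). For a canonical i32 bswap idiom the provenance comes
+  // back as [24..31, 16..23, 8..15, 0..7]; for an i32 bitreverse it is
+  // [31, 30, ..., 0].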
+  std::map<Value *, Optional<BitPart>> BPS; +  auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS); +  if (!Res) +    return false; +  auto &BitProvenance = Res->Provenance; + +  // Now, is the bit permutation correct for a bswap or a bitreverse? We can +  // only byteswap values with an even number of bytes. +  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true; +  for (unsigned i = 0; i < BW; ++i) { +    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); +    OKForBitReverse &= +        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); +  } + +  Intrinsic::ID Intrin; +  if (OKForBSwap && MatchBSwaps) +    Intrin = Intrinsic::bswap; +  else if (OKForBitReverse && MatchBitReversals) +    Intrin = Intrinsic::bitreverse; +  else +    return false; + +  Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy); +  InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I)); +  return true; +} diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index dc0744060142..2f3c31128cf0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -970,15 +970,34 @@ static Value *valueHasFloatPrecision(Value *Val) {    return nullptr;  } -//===----------------------------------------------------------------------===// -// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' +/// Any floating-point library function that we're trying to simplify will have +/// a signature of the form: fptype foo(fptype param1, fptype param2, ...). +/// CheckDoubleTy indicates that 'fptype' must be 'double'. +static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams, +                                          bool CheckDoubleTy) { +  FunctionType *FT = F->getFunctionType(); +  if (FT->getNumParams() != NumParams) +    return false; + +  // The return type must match what we're looking for. +  Type *RetTy = FT->getReturnType(); +  if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy()) +    return false; + +  // Each parameter must match the return type, and therefore, match every other +  // parameter too. +  for (const Type *ParamTy : FT->params()) +    if (ParamTy != RetTy) +      return false; -Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, -                                                bool CheckRetType) { +  return true; +} + +/// Shrink double -> float for unary functions like 'floor'. +static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, +                                    bool CheckRetType) {    Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || -      !FT->getParamType(0)->isDoubleTy()) +  if (!matchesFPLibFunctionSignature(Callee, 1, true))      return nullptr;    if (CheckRetType) { @@ -1013,15 +1032,10 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,    return B.CreateFPExt(V, B.getDoubleTy());  } -// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax' -Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { +/// Shrink double -> float for binary functions like 'fmin/fmax'. 
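+/// For example, 'fmin((double)f1, (double)f2)', where f1 and f2 are floats that
+/// were extended to double, can be shrunk to '(double)fminf(f1, f2)'.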
+static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  // Just make sure this has 2 arguments of the same FP type, which match the -  // result type. -  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || -      FT->getParamType(0) != FT->getParamType(1) || -      !FT->getParamType(0)->isFloatingPointTy()) +  if (!matchesFPLibFunctionSignature(Callee, 2, true))      return nullptr;    // If this is something like 'fmin((double)floatval1, (double)floatval2)', @@ -1394,12 +1408,21 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {  Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -   +    Value *Ret = nullptr;    if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||                                     Callee->getIntrinsicID() == Intrinsic::sqrt))      Ret = optimizeUnaryDoubleFP(CI, B, true); +  // FIXME: Refactor - this check is repeated all over this file and even in the +  // preceding call to shrink double -> float. + +  // Make sure this has 1 argument of FP type, which matches the result type. +  FunctionType *FT = Callee->getFunctionType(); +  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || +      !FT->getParamType(0)->isFloatingPointTy()) +    return Ret; +    if (!CI->hasUnsafeAlgebra())      return Ret; diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll index a9ae00c8d270..9996c0d3aba8 100644 --- a/test/CodeGen/AArch64/cxx-tlscc.ll +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -8,6 +8,7 @@  @sg = internal thread_local global %struct.S zeroinitializer, align 1  @__dso_handle = external global i8  @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4  declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)  declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -74,3 +75,29 @@ __tls_init.exit:  ; CHECK-NOT: ldp d27, d26  ; CHECK-NOT: ldp d29, d28  ; CHECK-NOT: ldp d31, d30 + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 +; CHECK: blr +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { +  ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/cse-flags.ll b/test/CodeGen/ARM/cse-flags.ll new file mode 100644 index 000000000000..c18e2fcb6039 --- /dev/null +++ b/test/CodeGen/ARM/cse-flags.ll @@ -0,0 +1,43 @@ +; RUN: llc -asm-verbose=false < %s | FileCheck %s +; PR26063 + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK: .LBB0_1: +; CHECK-NEXT: bl      f{{$}} +; CHECK-NEXT: ldrb    r[[T0:[0-9]+]], [r{{[0-9]+}}, #1]!{{$}} +; CHECK-NEXT: cmp     r{{[0-9]+}}, #1{{$}} +; CHECK-NEXT: cmpne   r[[T0]], #0{{$}} +; CHECK-NEXT: bne     .LBB0_1{{$}} +define i8* @h(i8* readonly %a, i32 %b, i32 %c) { +entry: +  %0 = load i8, i8* %a, align 1 +  %tobool4 = icmp ne i8 %0, 0 +  %cmp5 = 
icmp ne i32 %b, 1 +  %1 = and i1 %cmp5, %tobool4 +  br i1 %1, label %while.body.preheader, label %while.end + +while.body.preheader:                             ; preds = %entry +  br label %while.body + +while.body:                                       ; preds = %while.body.preheader, %while.body +  %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ] +  %call = tail call i32 bitcast (i32 (...)* @f to i32 ()*)() +  %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1 +  %2 = load i8, i8* %incdec.ptr, align 1 +  %tobool = icmp ne i8 %2, 0 +  %cmp = icmp ne i32 %call, 1 +  %3 = and i1 %cmp, %tobool +  br i1 %3, label %while.body, label %while.end.loopexit + +while.end.loopexit:                               ; preds = %while.body +  %incdec.ptr.lcssa = phi i8* [ %incdec.ptr, %while.body ] +  br label %while.end + +while.end:                                        ; preds = %while.end.loopexit, %entry +  %a.addr.0.lcssa = phi i8* [ %a, %entry ], [ %incdec.ptr.lcssa, %while.end.loopexit ] +  ret i8* %a.addr.0.lcssa +} + +declare i32 @f(...) diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll index 7b776d4b8e88..11173bbb1978 100644 --- a/test/CodeGen/ARM/cxx-tlscc.ll +++ b/test/CodeGen/ARM/cxx-tlscc.ll @@ -8,6 +8,7 @@  @sg = internal thread_local global %struct.S zeroinitializer, align 1  @__dso_handle = external global i8  @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4  declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)  declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -44,3 +45,13 @@ __tls_init.exit:  ; CHECK-NOT: pop {r9, r12}  ; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc}  ; CHECK: pop {lr} + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: push {r1, r2, r3, r4, r7, lr} +; CHECK-NOT: push {r9, r12} +; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} +; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7} +; CHECK: blx +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { +  ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll index 66743f3e9d5e..46fef7629cc4 100644 --- a/test/CodeGen/ARM/memfunc.ll +++ b/test/CodeGen/ARM/memfunc.ll @@ -1,10 +1,10 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN -; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI -; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI +; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS --check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI 
--check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK  define void @f1(i8* %dest, i8* %src) {  entry: @@ -402,8 +402,8 @@ entry:  ; CHECK: arr1:  ; CHECK-IOS: .align 3  ; CHECK-DARWIN: .align 2 -; CHECK-EABI: .align 2 -; CHECK-GNUEABI: .align 2 +; CHECK-EABI-NOT: .align +; CHECK-GNUEABI-NOT: .align  ; CHECK: arr2:  ; CHECK: {{\.section.+foo,bar}}  ; CHECK-NOT: .align diff --git a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll b/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll deleted file mode 100644 index 4580795880ab..000000000000 --- a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s -; CHECK: addl - -; The two additions are the same , but have different flags. -; In theory this code should never be generated by the frontend, but this  -; tries to test that two identical instructions with two different flags -; actually generate two different nodes. -; -; Normally the combiner would see this condition without the flags  -; and optimize the result of the sub into a register clear -; (the final result would be 0). With the different flags though the combiner  -; needs to keep the add + sub nodes, because the two nodes result as different -; nodes and so cannot assume that the subtraction of the two nodes -; generates 0 as result -define i32 @foo(i32 %a, i32 %b) { -  %1 = add i32 %a, %b -  %2 = add nsw i32 %a, %b -  %3 = sub i32 %1, %2 -  ret i32 %3 -} diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll index 70fe501040bf..6c8e45e42d15 100644 --- a/test/CodeGen/X86/cxx_tlscc64.ll +++ b/test/CodeGen/X86/cxx_tlscc64.ll @@ -4,11 +4,13 @@  ; tricks similar to AArch64 fast TLS calling convention (r255821).  ; Applying tricks on x86-64 similar to r255821.  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -O0 | FileCheck %s --check-prefix=CHECK-O0  %struct.S = type { i8 }  @sg = internal thread_local global %struct.S zeroinitializer, align 1  @__dso_handle = external global i8  @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4  declare void @_ZN1SC1Ev(%struct.S*)  declare void @_ZN1SD1Ev(%struct.S*) @@ -50,3 +52,28 @@ init.i:  __tls_init.exit:    ret %struct.S* @sg  } + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: pushq %r11 +; CHECK-NOT: pushq %r10 +; CHECK-NOT: pushq %r9 +; CHECK-NOT: pushq %r8 +; CHECK-NOT: pushq %rsi +; CHECK-NOT: pushq %rdx +; CHECK-NOT: pushq %rcx +; CHECK-NOT: pushq %rbx +; CHECK: callq +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { +  ret i32* @sum1 +} + +; Make sure at O0 we don't overwrite RBP. 
+; CHECK-O0-LABEL: _ZTW4sum2 +; CHECK-O0: pushq %rbp +; CHECK-O0: movq %rsp, %rbp +; CHECK-O0-NOT: movq %r{{.*}}, (%rbp)  +define cxx_fast_tlscc i32* @_ZTW4sum2() #0 { +  ret i32* @sum1 +} + +attributes #0 = { nounwind "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll index 7c00f407b1e0..eb87f7101d7c 100644 --- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll +++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll @@ -1,11 +1,5 @@  ; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK  ; -; This test checks that we do not use shrink-wrapping when -; the function does not have any frame pointer and may unwind. -; This is a workaround for a limitation in the emission of -; the CFI directives, that are not correct in such case. -; PR25614 -;  ; Note: This test cannot be merged with the shrink-wrapping tests  ; because the booleans set on the command line take precedence on  ; the target logic that disable shrink-wrapping. @@ -13,6 +7,12 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"  target triple = "x86_64-apple-macosx" +; This test checks that we do not use shrink-wrapping when +; the function does not have any frame pointer and may unwind. +; This is a workaround for a limitation in the emission of +; the CFI directives, that are not correct in such case. +; PR25614 +;  ; No shrink-wrapping should occur here, until the CFI information are fixed.  ; CHECK-LABEL: framelessUnwind:  ; @@ -151,3 +151,74 @@ false:  }  attributes #2 = { "no-frame-pointer-elim"="false" nounwind } + + +; Check that we generate correct code for segmented stack. +; We used to emit the code at the entry point of the function +; instead of just before the prologue. +; For now, shrink-wrapping is disabled on segmented stack functions: PR26107. +; +; CHECK-LABEL: segmentedStack: +; CHECK: cmpq +; CHECK-NEXT: ja [[ENTRY_LABEL:LBB[0-9_]+]] +; +; CHECK: callq ___morestack +; CHECK-NEXT: retq +; +; CHECK: [[ENTRY_LABEL]]: +; Prologue +; CHECK: push +; +; In PR26107, we use to drop these two basic blocks, because +; the segmentedStack entry block was jumping directly to +; the place where the prologue is actually needed, which is +; the call to memcmp. +; Then, those two basic blocks did not have any predecessors +; anymore and were removed. 
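+; With the fix, the entry block branches to [[ENTRY_LABEL]] (where the prologue
+; is emitted) rather than straight to the memcmp block, so both null-check
+; blocks below keep a predecessor and survive.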
+; +; Check if vk1 is null +; CHECK: testq %rdi, %rdi +; CHECK-NEXT: je [[STRINGS_EQUAL:LBB[0-9_]+]] +; +; Check if vk2 is null +; CHECK: testq %rsi, %rsi +; CHECK-NEXT:  je [[STRINGS_EQUAL]] +; +; CHECK: [[STRINGS_EQUAL]] +; CHECK-NEXT: popq +define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 { +entry: +  %cmp.i = icmp eq i8* %vk1, null +  %cmp1.i = icmp eq i8* %vk2, null +  %brmerge.i = or i1 %cmp.i, %cmp1.i +  %cmp1.mux.i = and i1 %cmp.i, %cmp1.i +  br i1 %brmerge.i, label %__go_ptr_strings_equal.exit, label %if.end4.i + +if.end4.i:                                        ; preds = %entry +  %tmp = getelementptr inbounds i8, i8* %vk1, i64 8 +  %tmp1 = bitcast i8* %tmp to i64* +  %tmp2 = load i64, i64* %tmp1, align 8 +  %tmp3 = getelementptr inbounds i8, i8* %vk2, i64 8 +  %tmp4 = bitcast i8* %tmp3 to i64* +  %tmp5 = load i64, i64* %tmp4, align 8 +  %cmp.i.i = icmp eq i64 %tmp2, %tmp5 +  br i1 %cmp.i.i, label %land.rhs.i.i, label %__go_ptr_strings_equal.exit + +land.rhs.i.i:                                     ; preds = %if.end4.i +  %tmp6 = bitcast i8* %vk2 to i8** +  %tmp7 = load i8*, i8** %tmp6, align 8 +  %tmp8 = bitcast i8* %vk1 to i8** +  %tmp9 = load i8*, i8** %tmp8, align 8 +  %call.i.i = tail call i32 @memcmp(i8* %tmp9, i8* %tmp7, i64 %tmp2) #5 +  %cmp4.i.i = icmp eq i32 %call.i.i, 0 +  br label %__go_ptr_strings_equal.exit + +__go_ptr_strings_equal.exit:                      ; preds = %land.rhs.i.i, %if.end4.i, %entry +  %retval.0.i = phi i1 [ %cmp1.mux.i, %entry ], [ false, %if.end4.i ], [ %cmp4.i.i, %land.rhs.i.i ] +  ret i1 %retval.0.i +} + +; Function Attrs: nounwind readonly +declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) #5 + +attributes #5 = { nounwind readonly ssp uwtable "split-stack" } diff --git a/test/DebugInfo/ARM/PR26163.ll b/test/DebugInfo/ARM/PR26163.ll new file mode 100644 index 000000000000..9ab0e35805c1 --- /dev/null +++ b/test/DebugInfo/ARM/PR26163.ll @@ -0,0 +1,107 @@ +; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s +; +; Checks that we're creating two ranges, one that terminates immediately +; and one that spans the rest of the function. This isn't necessarily the +; best thing to do here (and also not necessarily correct, since the first +; one has a bit_piece), but it is what is currently being emitted, any +; change here needs to be intentional, so the test is very specific. 
+; +; CHECK: .debug_loc contents: +; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004 +; CHECK:                Ending address offset: 0x0000000000000004 +; CHECK:                 Location description: 10 00 9f +; CHECK:             Beginning address offset: 0x0000000000000004 +; CHECK:                Ending address offset: 0x0000000000000014 +; CHECK:                 Location description: 10 00 9f + +; Created form the following test case (PR26163) with +; clang -cc1 -triple armv4t--freebsd11.0-gnueabi -emit-obj -debug-info-kind=standalone -O2 -x c test.c +; +; typedef	unsigned int	size_t; +; struct timeval { +; 	long long tv_sec; +; 	int tv_usec; +; }; +;  +; void *memset(void *, int, size_t); +; void foo(void); +;  +; static void +; bar(int value) +; { +; 	struct timeval lifetime; +;  +; 	memset(&lifetime, 0, sizeof(struct timeval)); +; 	lifetime.tv_sec = value; +;  +; 	foo(); +; } +;  +; int +; parse_config_file(void) +; { +; 	int value; +;  +; 	bar(value); +; 	return (0); +; } + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv4t--freebsd11.0-gnueabi" + +%struct.timeval = type { i64, i32 } + +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +declare void @foo() + +define i32 @parse_config_file() !dbg !4 { +entry: +  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !26), !dbg !27 +  tail call void @llvm.dbg.declare(metadata %struct.timeval* undef, metadata !16, metadata !26), !dbg !29 +  tail call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !16, metadata !30), !dbg !29 +  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29 +  tail call void @foo() #3, !dbg !32 +  ret i32 0, !dbg !33 +} + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!22, !23, !24} +!llvm.ident = !{!25} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3) +!1 = !DIFile(filename: "<stdin>", directory: "/home/ubuntu/bugs") +!2 = !{} +!3 = !{!4, !11} +!4 = distinct !DISubprogram(name: "parse_config_file", scope: !5, file: !5, line: 22, type: !6, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: true, variables: !9) +!5 = !DIFile(filename: "test.c", directory: "/home/ubuntu/bugs") +!6 = !DISubroutineType(types: !7) +!7 = !{!8} +!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!9 = !{!10} +!10 = !DILocalVariable(name: "value", scope: !4, file: !5, line: 24, type: !8) +!11 = distinct !DISubprogram(name: "bar", scope: !5, file: !5, line: 11, type: !12, isLocal: true, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, variables: !14) +!12 = !DISubroutineType(types: !13) +!13 = !{null, !8} +!14 = !{!15, !16} +!15 = !DILocalVariable(name: "value", arg: 1, scope: !11, file: !5, line: 11, type: !8) +!16 = !DILocalVariable(name: "lifetime", scope: !11, file: !5, line: 13, type: !17) +!17 = !DICompositeType(tag: DW_TAG_structure_type, name: "timeval", file: !5, line: 2, size: 128, align: 64, elements: !18) +!18 = !{!19, !21} +!19 = !DIDerivedType(tag: DW_TAG_member, name: "tv_sec", scope: !17, file: !5, line: 3, baseType: !20, size: 64, align: 64) +!20 = !DIBasicType(name: "long long int", size: 64, 
align: 64, encoding: DW_ATE_signed) +!21 = !DIDerivedType(tag: DW_TAG_member, name: "tv_usec", scope: !17, file: !5, line: 4, baseType: !8, size: 32, align: 32, offset: 64) +!22 = !{i32 2, !"Debug Info Version", i32 3} +!23 = !{i32 1, !"wchar_size", i32 4} +!24 = !{i32 1, !"min_enum_size", i32 4} +!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)"} +!26 = !DIExpression() +!27 = !DILocation(line: 11, scope: !11, inlinedAt: !28) +!28 = distinct !DILocation(line: 26, scope: !4) +!29 = !DILocation(line: 13, scope: !11, inlinedAt: !28) +!30 = !DIExpression(DW_OP_bit_piece, 0, 64) +!31 = !DIExpression(DW_OP_bit_piece, 0, 32) +!32 = !DILocation(line: 18, scope: !11, inlinedAt: !28) +!33 = !DILocation(line: 27, scope: !4) diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll index 7df88b1ec5e0..b91a0438a679 100644 --- a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll +++ b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll @@ -1,5 +1,5 @@  ; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  declare i32 @FB() diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll index d35418b19c7f..94938a86cba4 100644 --- a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll +++ b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll @@ -1,5 +1,5 @@  ; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  declare i32 @FB() diff --git a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll index 0d1a1ec6871a..72449f3af3ad 100644 --- a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  define i32 @bar() nounwind {  	ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll index 31ed7523db43..31271b594c02 100644 --- a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; This test should fail until remote symbol resolution is supported.  define i32 @main() nounwind { diff --git a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll index bbeab10cd788..9d1abbcf847c 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; The intention of this test is to verify that symbols mapped to COMMON in ELF  ; work as expected. 
diff --git a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll index 0aa19b244c04..afa8a95f454d 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@  ; RUN:  %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; Check that a variable is always aligned as specified. diff --git a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll index 13bac29a3628..f9961593c7b9 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  define double @test(double* %DP, double %Arg) nounwind {  	%D = load double, double* %DP		; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll index 5d5480e9d459..329dc5c83950 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll index ef74fa02e6a9..44557ea399b5 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@  ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \  ; RUN:   -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32  @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll index c2260fc2f1ff..a249c2f097e1 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1  @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll index 2a45472b25a1..281705383339 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@  ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \  ; RUN:   -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32  @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1  @ptr = global i8* 
getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll index 249aad2d4b48..6fbb2bc3c4bd 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll index 32c58ee6237b..ce094174134b 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll index aaf3ebc9bc7f..bc477c285515 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  define i32 @bar() nounwind {  	ret i32 0 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll index a0d941049c4a..001a617b97a3 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; This test should fail until remote symbol resolution is supported.  define i32 @main() nounwind { diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll index 9b4e2469665f..4c4256e45a35 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; The intention of this test is to verify that symbols mapped to COMMON in ELF  ; work as expected. diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll index 88a561b613ef..1621501a31a1 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@  ; RUN:  %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  ; Check that a variable is always aligned as specified. 
diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll index 484541ab4807..6ff8704da1f9 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  define double @test(double* %DP, double %Arg) nounwind {  	%D = load double, double* %DP		; <double> [#uses=1] diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll index adc3e944b639..a7c8bfef938f 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32  @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll index 8ab3fd591388..a028df674648 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \  ; RUN:   -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32  @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll index a47c801e799b..d369d2b38498 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32  @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1  @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll index 210ac6f6ed1c..e918dabe1772 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@  ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \  ; RUN:   -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32  @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1  @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/MC/AArch64/inst-directive.s b/test/MC/AArch64/inst-directive.s index 3bb620f689d1..7fd5200b9e57 100644 --- a/test/MC/AArch64/inst-directive.s +++ b/test/MC/AArch64/inst-directive.s @@ -1,7 +1,14 @@  // RUN: llvm-mc %s -triple=aarch64-none-linux-gnu 
-filetype=asm -o - \  // RUN:   | FileCheck %s --check-prefix=CHECK-ASM -// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=obj -o - \ -// RUN:   | llvm-readobj -s -sd | FileCheck %s  --check-prefix=CHECK-OBJ +// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s  --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s  --check-prefix=CHECK-SYMS + +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=asm -o - \ +// RUN:   | FileCheck %s --check-prefix=CHECK-ASM +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s  --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s  --check-prefix=CHECK-SYMS      .section    .inst.aarch64_inst @@ -22,3 +29,7 @@ aarch64_inst:  // CHECK-OBJ:   SectionData (  // CHECK-OBJ-NEXT: 0000: 2040105E  // CHECK-OBJ-NEXT: ) + +// CHECK-SYMS-NOT: 0000000000000000         .inst.aarch64_inst              00000000 $d +// CHECK-SYMS:     0000000000000000         .inst.aarch64_inst              00000000 $x +// CHECK-SYMS-NOT: 0000000000000000         .inst.aarch64_inst              00000000 $d diff --git a/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll new file mode 100644 index 000000000000..36440da21626 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -loop-unroll -codegenprepare < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK-LABEL: @f +define i32 @f(i32 %a) #0 { +; CHECK: call i32 @llvm.bitreverse.i32 +entry: +  br label %for.body + +for.cond.cleanup:                                 ; preds = %for.body +  ret i32 %or + +for.body:                                         ; preds = %for.body, %entry +  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] +  %b.07 = phi i32 [ 0, %entry ], [ %or, %for.body ] +  %shr = lshr i32 %a, %i.08 +  %and = and i32 %shr, 1 +  %sub = sub nuw nsw i32 31, %i.08 +  %shl = shl i32 %and, %sub +  %or = or i32 %shl, %b.07 +  %inc = add nuw nsw i32 %i.08, 1 +  %exitcond = icmp eq i32 %inc, 32 +  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3 +} + +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git b7441a0f42c43a8eea9e3e706be187252db747fa)"} +!3 = distinct !{!3, !4} +!4 = !{!"llvm.loop.unroll.full"} diff --git a/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg new file mode 100644 index 000000000000..98c6700c209d --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'ARM' in config.root.targets: +    config.unsupported = True + diff --git a/test/Transforms/CodeGenPrepare/bitreverse-hang.ll b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll new file mode 100644 index 000000000000..c81dcc15cae9 --- /dev/null +++ 
b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-unroll -codegenprepare -S | FileCheck %s + +; This test is a worst-case scenario for bitreversal/byteswap detection. +; After loop unrolling (the unrolled loop is unreadably large so it has been kept +; rolled here), we have a binary tree of OR operands (as bitreversal detection +; looks straight through shifts): +; +;  OR +;  | \ +;  |  LSHR +;  | / +;  OR +;  | \ +;  |  LSHR +;  | / +;  OR +; +; This results in exponential runtime. The loop here is 32 iterations which will +; totally hang if we don't deal with this case cleverly. + +@b = common global i32 0, align 4 + +; CHECK: define i32 @fn1 +define i32 @fn1() #0 { +entry: +  %b.promoted = load i32, i32* @b, align 4, !tbaa !2 +  br label %for.body + +for.body:                                         ; preds = %for.body, %entry +  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ] +  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] +  %shr = lshr i32 %or4, 1 +  %or = or i32 %shr, %or4 +  %inc = add nuw nsw i32 %i.03, 1 +  %exitcond = icmp eq i32 %inc, 32 +  br i1 %exitcond, label %for.end, label %for.body + +for.end:                                          ; preds = %for.body +  store i32 %or, i32* @b, align 4, !tbaa !2 +  ret i32 undef +} + +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/Inline/inline-funclets.ll b/test/Transforms/Inline/inline-funclets.ll new file mode 100644 index 000000000000..362e03d36a32 --- /dev/null +++ b/test/Transforms/Inline/inline-funclets.ll @@ -0,0 +1,455 @@ +; RUN: opt -inline -S %s | FileCheck %s + +declare void @g() + + +;;; Test with a call in a funclet that needs to remain a call +;;; when inlined because the funclet doesn't unwind to caller. +;;; CHECK-LABEL: define void @test1( +define void @test1() personality void ()* @g { +entry: +; CHECK-NEXT: entry: +  invoke void @test1_inlinee() +    to label %exit unwind label %cleanup +cleanup: +  %pad = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad) ] +  cleanupret from %pad unwind to caller +exit: +  ret void +} + +define void @test1_inlinee() alwaysinline personality void ()* @g { +entry: +  invoke void @g() +    to label %exit unwind label %cleanup.inner +; CHECK-NEXT:  invoke void @g() +; CHECK-NEXT:    unwind label %[[cleanup_inner:.+]] + +cleanup.inner: +  %pad.inner = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad.inner) ] +  cleanupret from %pad.inner unwind label %cleanup.outer +; CHECK: [[cleanup_inner]]: +; The call here needs to remain a call becuase pad.inner has a cleanupret +; that stays within the inlinee. 
+; CHECK-NEXT:  %[[pad_inner:[^ ]+]] = cleanuppad within none +; CHECK-NEXT:  call void @g() [ "funclet"(token %[[pad_inner]]) ] +; CHECK-NEXT:  cleanupret from %[[pad_inner]] unwind label %[[cleanup_outer:.+]] + +cleanup.outer: +  %pad.outer = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad.outer) ] +  cleanupret from %pad.outer unwind to caller +; CHECK: [[cleanup_outer]]: +; The call and cleanupret here need to be redirected to caller cleanup +; CHECK-NEXT: %[[pad_outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad_outer]]) ] +; CHECK-NEXT:   unwind label %cleanup +; CHECK: cleanupret from %[[pad_outer]] unwind label %cleanup{{$}} + +exit: +  ret void +} + + + +;;; Test with an "unwind to caller" catchswitch in a parent funclet +;;; that needs to remain "unwind to caller" because the parent +;;; doesn't unwind to caller. +;;; CHECK-LABEL: define void @test2( +define void @test2() personality void ()* @g { +entry: +; CHECK-NEXT: entry: +  invoke void @test2_inlinee() +    to label %exit unwind label %cleanup +cleanup: +  %pad = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad) ] +  cleanupret from %pad unwind to caller +exit: +  ret void +} + +define void @test2_inlinee() alwaysinline personality void ()* @g { +entry: +  invoke void @g() +    to label %exit unwind label %cleanup1 +; CHECK-NEXT:   invoke void @g() +; CHECK-NEXT:     unwind label %[[cleanup1:.+]] + +cleanup1: +  %outer = cleanuppad within none [] +  invoke void @g() [ "funclet"(token %outer) ] +    to label %ret1 unwind label %catchswitch +; CHECK: [[cleanup1]]: +; CHECK-NEXT: %[[outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[outer]]) ] +; CHECK-NEXT:   unwind label %[[catchswitch:.+]] + +catchswitch: +  %cs = catchswitch within %outer [label %catch] unwind to caller +; CHECK: [[catchswitch]]: +; The catchswitch here needs to remain "unwind to caller" since %outer +; has a cleanupret that remains within the inlinee. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[outer]] [label %[[catch:.+]]] unwind to caller + +catch: +  %inner = catchpad within %cs [] +  call void @g() [ "funclet"(token %inner) ] +  catchret from %inner to label %ret1 +; CHECK: [[catch]]: +; The call here needs to remain a call since it too is within %outer +; CHECK:   %[[inner:[^ ]+]] = catchpad within %[[cs]] +; CHECK-NEXT: call void @g() [ "funclet"(token %[[inner]]) ] + +ret1: +  cleanupret from %outer unwind label %cleanup2 +; CHECK: cleanupret from %[[outer]] unwind label %[[cleanup2:.+]] + +cleanup2: +  %later = cleanuppad within none [] +  cleanupret from %later unwind to caller +; CHECK: [[cleanup2]]: +; The cleanupret here needs to get redirected to the caller cleanup +; CHECK-NEXT: %[[later:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: cleanupret from %[[later]] unwind label %cleanup{{$}} + +exit: +  ret void +} + + +;;; Test with a call in a cleanup that has no definitive unwind +;;; destination, that must be rewritten to an invoke. 
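+;;; (Unlike test1, the inlinee's cleanup ends in "unreachable" rather than a
+;;; cleanupret that stays within the inlinee, so the call cannot be proven to
+;;; stay local and must be given the caller's unwind destination.)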
+;;; CHECK-LABEL: define void @test3( +define void @test3() personality void ()* @g { +entry: +; CHECK-NEXT: entry: +  invoke void @test3_inlinee() +    to label %exit unwind label %cleanup +cleanup: +  %pad = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad) ] +  cleanupret from %pad unwind to caller +exit: +  ret void +} + +define void @test3_inlinee() alwaysinline personality void ()* @g { +entry: +  invoke void @g() +    to label %exit unwind label %cleanup +; CHECK-NEXT:  invoke void @g() +; CHECK-NEXT:    unwind label %[[cleanup:.+]] + +cleanup: +  %pad = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad) ] +  unreachable +; CHECK: [[cleanup]]: +; The call must be rewritten to an invoke targeting the caller cleanup +; because it may well unwind to there. +; CHECK-NEXT: %[[pad:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad]]) ] +; CHECK-NEXT:   unwind label %cleanup{{$}} + +exit: +  ret void +} + + +;;; Test with a catchswitch in a cleanup that has no definitive +;;; unwind destination, that must be rewritten to unwind to the +;;; inlined invoke's unwind dest +;;; CHECK-LABEL: define void @test4( +define void @test4() personality void ()* @g { +entry: +; CHECK-NEXT: entry: +  invoke void @test4_inlinee() +    to label %exit unwind label %cleanup +cleanup: +  %pad = cleanuppad within none [] +  call void @g() [ "funclet"(token %pad) ] +  cleanupret from %pad unwind to caller +exit: +  ret void +} + +define void @test4_inlinee() alwaysinline personality void ()* @g { +entry: +  invoke void @g() +    to label %exit unwind label %cleanup +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT:   unwind label %[[cleanup:.+]] + +cleanup: +  %clean = cleanuppad within none [] +  invoke void @g() [ "funclet"(token %clean) ] +    to label %unreachable unwind label %dispatch +; CHECK: [[cleanup]]: +; CHECK-NEXT: %[[clean:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[clean]]) ] +; CHECK-NEXT:   unwind label %[[dispatch:.+]] + +dispatch: +  %cs = catchswitch within %clean [label %catch] unwind to caller +; CHECK: [[dispatch]]: +; The catchswitch must be rewritten to unwind to %cleanup in the caller +; because it may well unwind to there. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[clean]] [label %[[catch:.+]]] unwind label %cleanup{{$}} + +catch: +  catchpad within %cs [] +  br label %unreachable +unreachable: +  unreachable +exit: +  ret void +} + + +;;; Test with multiple levels of nesting, and unwind dests +;;; that need to be inferred from ancestors, descendants, +;;; and cousins. 
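+;;; (Pads in the "noinfo" tree below have no relative with a concrete unwind
+;;; edge, so everything there is conservatively redirected to the caller's
+;;; cleanup; in the "implicit" tree the edge to %internal supplies the
+;;; destination, and calls and "unwind to caller" catchswitches are left alone.)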
+; CHECK-LABEL: define void @test5(
+define void @test5() personality void ()* @g {
+entry:
+; CHECK-NEXT: entry:
+  invoke void @test5_inlinee()
+    to label %exit unwind label %cleanup
+cleanup:
+  %pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad) ]
+  cleanupret from %pad unwind to caller
+exit:
+  ret void
+}
+
+define void @test5_inlinee() alwaysinline personality void ()* @g {
+entry:
+  invoke void @g()
+    to label %cont unwind label %noinfo.root
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT:   to label %[[cont:[^ ]+]] unwind label %[[noinfo_root:.+]]
+
+noinfo.root:
+  %noinfo.root.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %noinfo.root.pad) ]
+  invoke void @g() [ "funclet"(token %noinfo.root.pad) ]
+    to label %noinfo.root.cont unwind label %noinfo.left
+; CHECK: [[noinfo_root]]:
+; Nothing under "noinfo.root" has a definitive unwind destination, so
+; we must assume all of it may actually unwind, and redirect unwinds
+; to the cleanup in the caller.
+; CHECK-NEXT: %[[noinfo_root_pad:[^ ]+]] = cleanuppad within none []
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ]
+; CHECK-NEXT:   to label %[[next:[^ ]+]] unwind label %cleanup{{$}}
+; CHECK: [[next]]:
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ]
+; CHECK-NEXT:   to label %[[noinfo_root_cont:[^ ]+]] unwind label %[[noinfo_left:.+]]
+
+noinfo.left:
+  %noinfo.left.pad = cleanuppad within %noinfo.root.pad []
+  invoke void @g() [ "funclet"(token %noinfo.left.pad) ]
+    to label %unreachable unwind label %noinfo.left.child
+; CHECK: [[noinfo_left]]:
+; CHECK-NEXT: %[[noinfo_left_pad:[^ ]+]] = cleanuppad within %[[noinfo_root_pad]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_pad]]) ]
+; CHECK-NEXT:   unwind label %[[noinfo_left_child:.+]]
+
+noinfo.left.child:
+  %noinfo.left.child.cs = catchswitch within %noinfo.left.pad [label %noinfo.left.child.catch] unwind to caller
+; CHECK: [[noinfo_left_child]]:
+; CHECK-NEXT: %[[noinfo_left_child_cs:[^ ]+]] = catchswitch within %[[noinfo_left_pad]] [label %[[noinfo_left_child_catch:[^ ]+]]] unwind label %cleanup{{$}}
+
+noinfo.left.child.catch:
+  %noinfo.left.child.pad = catchpad within %noinfo.left.child.cs []
+  call void @g() [ "funclet"(token %noinfo.left.child.pad) ]
+  br label %unreachable
+; CHECK: [[noinfo_left_child_catch]]:
+; CHECK-NEXT: %[[noinfo_left_child_pad:[^ ]+]] = catchpad within %[[noinfo_left_child_cs]] []
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_child_pad]]) ]
+; CHECK-NEXT:   unwind label %cleanup{{$}}
+
+noinfo.root.cont:
+  invoke void @g() [ "funclet"(token %noinfo.root.pad) ]
+    to label %unreachable unwind label %noinfo.right
+; CHECK: [[noinfo_root_cont]]:
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ]
+; CHECK-NEXT:   unwind label %[[noinfo_right:.+]]
+
+noinfo.right:
+  %noinfo.right.cs = catchswitch within %noinfo.root.pad [label %noinfo.right.catch] unwind to caller
+; CHECK: [[noinfo_right]]:
+; CHECK-NEXT: %[[noinfo_right_cs:[^ ]+]] = catchswitch within %[[noinfo_root_pad]] [label %[[noinfo_right_catch:[^ ]+]]] unwind label %cleanup{{$}}
+
+noinfo.right.catch:
+  %noinfo.right.pad = catchpad within %noinfo.right.cs []
+  invoke void @g() [ "funclet"(token %noinfo.right.pad) ]
+    to label %unreachable unwind label %noinfo.right.child
+; CHECK: [[noinfo_right_catch]]:
+; CHECK-NEXT: %[[noinfo_right_pad:[^ ]+]] = catchpad within %[[noinfo_right_cs]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_right_pad]]) ]
+; CHECK-NEXT:   unwind label %[[noinfo_right_child:.+]]
+
+noinfo.right.child:
+  %noinfo.right.child.pad = cleanuppad within %noinfo.right.pad []
+  call void @g() [ "funclet"(token %noinfo.right.child.pad) ]
+  br label %unreachable
+; CHECK: [[noinfo_right_child]]:
+; CHECK-NEXT: %[[noinfo_right_child_pad:[^ ]+]] = cleanuppad within %[[noinfo_right_pad]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_right_child_pad]]) ]
+; CHECK-NEXT:   unwind label %cleanup{{$}}
+
+cont:
+  invoke void @g()
+    to label %exit unwind label %implicit.root
+; CHECK: [[cont]]:
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT:   unwind label %[[implicit_root:.+]]
+
+implicit.root:
+  %implicit.root.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %implicit.root.pad) ]
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %implicit.root.cont unwind label %implicit.left
+; CHECK: [[implicit_root]]:
+; There's an unwind edge to %internal in implicit.right, and we need to propagate that
+; fact down to implicit.right.grandchild, up to implicit.root, and down to
+; implicit.left.child.catch, leaving all calls and "unwind to caller" catchswitches
+; alone so they don't conflict with the unwind edge in implicit.right
+; CHECK-NEXT: %[[implicit_root_pad:[^ ]+]] = cleanuppad within none
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT:   to label %[[implicit_root_cont:[^ ]+]] unwind label %[[implicit_left:.+]]
+
+implicit.left:
+  %implicit.left.pad = cleanuppad within %implicit.root.pad []
+  invoke void @g() [ "funclet"(token %implicit.left.pad) ]
+    to label %unreachable unwind label %implicit.left.child
+; CHECK: [[implicit_left]]:
+; CHECK-NEXT: %[[implicit_left_pad:[^ ]+]] = cleanuppad within %[[implicit_root_pad:[^ ]+]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_left_pad]]) ]
+; CHECK-NEXT:   unwind label %[[implicit_left_child:.+]]
+
+implicit.left.child:
+  %implicit.left.child.cs = catchswitch within %implicit.left.pad [label %implicit.left.child.catch] unwind to caller
+; CHECK: [[implicit_left_child]]:
+; CHECK-NEXT: %[[implicit_left_child_cs:[^ ]+]] = catchswitch within %[[implicit_left_pad]] [label %[[implicit_left_child_catch:[^ ]+]]] unwind to caller
+
+implicit.left.child.catch:
+  %implicit.left.child.pad = catchpad within %implicit.left.child.cs []
+  call void @g() [ "funclet"(token %implicit.left.child.pad) ]
+  br label %unreachable
+; CHECK: [[implicit_left_child_catch]]:
+; CHECK-NEXT: %[[implicit_left_child_pad:[^ ]+]] = catchpad within %[[implicit_left_child_cs]]
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_left_child_pad]]) ]
+
+implicit.root.cont:
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %unreachable unwind label %implicit.right
+; CHECK: [[implicit_root_cont]]:
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT:   unwind label %[[implicit_right:.+]]
+
+implicit.right:
+  %implicit.right.cs = catchswitch within %implicit.root.pad [label %implicit.right.catch] unwind label %internal
+; CHECK: [[implicit_right]]:
+; This is the unwind edge (to %internal) whose existence needs to get propagated around the "implicit" tree
+; CHECK-NEXT: %[[implicit_right_cs:[^ ]+]] = catchswitch within %[[implicit_root_pad]] [label %[[implicit_right_catch:[^ ]+]]] unwind label %[[internal:.+]]
+
+implicit.right.catch:
+  %implicit.right.pad = catchpad within %implicit.right.cs []
+  invoke void @g() [ "funclet"(token %implicit.right.pad) ]
+    to label %unreachable unwind label %implicit.right.child
+; CHECK: [[implicit_right_catch]]:
+; CHECK-NEXT: %[[implicit_right_pad:[^ ]+]] = catchpad within %[[implicit_right_cs]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_pad]]) ]
+; CHECK-NEXT:   unwind label %[[implicit_right_child:.+]]
+
+implicit.right.child:
+  %implicit.right.child.pad = cleanuppad within %implicit.right.pad []
+  invoke void @g() [ "funclet"(token %implicit.right.child.pad) ]
+    to label %unreachable unwind label %implicit.right.grandchild
+; CHECK: [[implicit_right_child]]:
+; CHECK-NEXT: %[[implicit_right_child_pad:[^ ]+]] = cleanuppad within %[[implicit_right_pad]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_child_pad]]) ]
+; CHECK-NEXT:   unwind label %[[implicit_right_grandchild:.+]]
+
+implicit.right.grandchild:
+  %implicit.right.grandchild.cs = catchswitch within %implicit.right.child.pad [label %implicit.right.grandchild.catch] unwind to caller
+; CHECK: [[implicit_right_grandchild]]:
+; CHECK-NEXT: %[[implicit_right_grandchild_cs:[^ ]+]] = catchswitch within %[[implicit_right_child_pad]] [label %[[implicit_right_grandchild_catch:[^ ]+]]] unwind to caller
+
+implicit.right.grandchild.catch:
+  %implicit.right.grandchild.pad = catchpad within %implicit.right.grandchild.cs []
+  call void @g() [ "funclet"(token %implicit.right.grandchild.pad) ]
+  br label %unreachable
+; CHECK: [[implicit_right_grandchild_catch]]:
+; CHECK-NEXT: %[[implicit_right_grandchild_pad:[^ ]+]] = catchpad within %[[implicit_right_grandchild_cs]]
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_right_grandchild_pad]]) ]
+
+internal:
+  %internal.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %internal.pad) ]
+  cleanupret from %internal.pad unwind to caller
+; CHECK: [[internal]]:
+; internal is a cleanup with a "return to caller" cleanuppad; that needs to get redirected
+; to %cleanup in the caller, and the call needs to get similarly rewritten to an invoke.
+; CHECK-NEXT: %[[internal_pad:[^ ]+]] = cleanuppad within none
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %internal.pad.i) ]
+; CHECK-NEXT:   to label %[[next:[^ ]+]] unwind label %cleanup{{$}}
+; CHECK: [[next]]:
+; CHECK-NEXT: cleanupret from %[[internal_pad]] unwind label %cleanup{{$}}
+
+unreachable:
+  unreachable
+exit:
+  ret void
+}
+
+
+declare void @ProcessCLRException()
+
+; Make sure the logic doesn't get tripped up when the inlined invoke is
+; itself within a funclet in the caller.
+; CHECK-LABEL: define void @test6(
+define void @test6() personality void ()* @ProcessCLRException {
+entry:
+  invoke void @g()
+    to label %exit unwind label %callsite_parent
+callsite_parent:
+  %callsite_parent.pad = cleanuppad within none []
+; CHECK: %callsite_parent.pad = cleanuppad within none
+  invoke void @test6_inlinee() [ "funclet"(token %callsite_parent.pad) ]
+    to label %ret unwind label %cleanup
+ret:
+  cleanupret from %callsite_parent.pad unwind label %cleanup
+cleanup:
+  %pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad) ]
+  cleanupret from %pad unwind to caller
+exit:
+  ret void
+}
+
+define void @test6_inlinee() alwaysinline personality void ()* @ProcessCLRException {
+entry:
+  invoke void @g()
+    to label %exit unwind label %inlinee_cleanup
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %callsite_parent.pad) ]
+; CHECK-NEXT:   unwind label %[[inlinee_cleanup:.+]]
+
+inlinee_cleanup:
+  %inlinee.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %inlinee.pad) ]
+  unreachable
+; CHECK: [[inlinee_cleanup]]:
+; CHECK-NEXT: %[[inlinee_pad:[^ ]+]] = cleanuppad within %callsite_parent.pad
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[inlinee_pad]]) ]
+; CHECK-NEXT:   unwind label %cleanup{{$}}
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/InstCombine/bitreverse-hang.ll b/test/Transforms/InstCombine/bitreverse-hang.ll
new file mode 100644
index 000000000000..6823bd0ed653
--- /dev/null
+++ b/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-unroll -instcombine -S | FileCheck %s
+
+; This test is a worst-case scenario for bitreversal/byteswap detection.
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations, which will
+; totally hang compilation if we don't deal with this case cleverly.
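+;
+; For illustration (the value names below are schematic, not the exact names
+; the unroller produces), the fully unrolled body is a straight-line chain:
+;
+;   %shr.1 = lshr i32 %b.promoted, 1
+;   %or.1  = or i32 %shr.1, %b.promoted
+;   %shr.2 = lshr i32 %or.1, 1
+;   %or.2  = or i32 %shr.2, %or.1
+;   ...
+;   %or.32 = or i32 %shr.32, %or.31
+;
+; A matcher that recurses into both operands of every OR (looking through the
+; shifts) reaches each %or.k along two paths, so the number of paths explored
+; from the root roughly doubles per level, on the order of 2^32, unless
+; already-visited values are cached or the search is otherwise bounded.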
+
+@b = common global i32 0, align 4
+
+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/InstCombine/bitreverse-recognize.ll b/test/Transforms/InstCombine/bitreverse-recognize.ll
deleted file mode 100644
index fbd5cb6d139c..000000000000
--- a/test/Transforms/InstCombine/bitreverse-recognize.ll
+++ /dev/null
@@ -1,114 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-define zeroext i8 @f_u8(i8 zeroext %a) {
-; CHECK-LABEL: @f_u8
-; CHECK-NEXT: %[[A:.*]] = call i8 @llvm.bitreverse.i8(i8 %a)
-; CHECK-NEXT: ret i8 %[[A]]
-  %1 = shl i8 %a, 7
-  %2 = shl i8 %a, 5
-  %3 = and i8 %2, 64
-  %4 = shl i8 %a, 3
-  %5 = and i8 %4, 32
-  %6 = shl i8 %a, 1
-  %7 = and i8 %6, 16
-  %8 = lshr i8 %a, 1
-  %9 = and i8 %8, 8
-  %10 = lshr i8 %a, 3
-  %11 = and i8 %10, 4
-  %12 = lshr i8 %a, 5
-  %13 = and i8 %12, 2
-  %14 = lshr i8 %a, 7
-  %15 = or i8 %14, %1
-  %16 = or i8 %15, %3
-  %17 = or i8 %16, %5
-  %18 = or i8 %17, %7
-  %19 = or i8 %18, %9
-  %20 = or i8 %19, %11
-  %21 = or i8 %20, %13
-  ret i8 %21
-}
-
-; The ANDs with 32 and 64 have been swapped here, so the sequence does not
-; completely match a bitreverse.
-define zeroext i8 @f_u8_fail(i8 zeroext %a) {
-; CHECK-LABEL: @f_u8_fail
-; CHECK-NOT: call
-; CHECK: ret i8
-  %1 = shl i8 %a, 7
-  %2 = shl i8 %a, 5
-  %3 = and i8 %2, 32
-  %4 = shl i8 %a, 3
-  %5 = and i8 %4, 64
-  %6 = shl i8 %a, 1
-  %7 = and i8 %6, 16
-  %8 = lshr i8 %a, 1
-  %9 = and i8 %8, 8
-  %10 = lshr i8 %a, 3
-  %11 = and i8 %10, 4
-  %12 = lshr i8 %a, 5
-  %13 = and i8 %12, 2
-  %14 = lshr i8 %a, 7
-  %15 = or i8 %14, %1
-  %16 = or i8 %15, %3
-  %17 = or i8 %16, %5
-  %18 = or i8 %17, %7
-  %19 = or i8 %18, %9
-  %20 = or i8 %19, %11
-  %21 = or i8 %20, %13
-  ret i8 %21
-}
-
-define zeroext i16 @f_u16(i16 zeroext %a) {
-; CHECK-LABEL: @f_u16
-; CHECK-NEXT: %[[A:.*]] = call i16 @llvm.bitreverse.i16(i16 %a)
-; CHECK-NEXT: ret i16 %[[A]]
-  %1 = shl i16 %a, 15
-  %2 = shl i16 %a, 13
-  %3 = and i16 %2, 16384
-  %4 = shl i16 %a, 11
-  %5 = and i16 %4, 8192
-  %6 = shl i16 %a, 9
-  %7 = and i16 %6, 4096
-  %8 = shl i16 %a, 7
-  %9 = and i16 %8, 2048
-  %10 = shl i16 %a, 5
-  %11 = and i16 %10, 1024
-  %12 = shl i16 %a, 3
-  %13 = and i16 %12, 512
-  %14 = shl i16 %a, 1
-  %15 = and i16 %14, 256
-  %16 = lshr i16 %a, 1
-  %17 = and i16 %16, 128
-  %18 = lshr i16 %a, 3
-  %19 = and i16 %18, 64
-  %20 = lshr i16 %a, 5
-  %21 = and i16 %20, 32
-  %22 = lshr i16 %a, 7
-  %23 = and i16 %22, 16
-  %24 = lshr i16 %a, 9
-  %25 = and i16 %24, 8
-  %26 = lshr i16 %a, 11
-  %27 = and i16 %26, 4
-  %28 = lshr i16 %a, 13
-  %29 = and i16 %28, 2
-  %30 = lshr i16 %a, 15
-  %31 = or i16 %30, %1
-  %32 = or i16 %31, %3
-  %33 = or i16 %32, %5
-  %34 = or i16 %33, %7
-  %35 = or i16 %34, %9
-  %36 = or i16 %35, %11
-  %37 = or i16 %36, %13
-  %38 = or i16 %37, %15
-  %39 = or i16 %38, %17
-  %40 = or i16 %39, %19
-  %41 = or i16 %40, %21
-  %42 = or i16 %41, %23
-  %43 = or i16 %42, %25
-  %44 = or i16 %43, %27
-  %45 = or i16 %44, %29
-  ret i16 %45
-}
\ No newline at end of file
diff --git a/test/Transforms/InstCombine/cos-2.ll b/test/Transforms/InstCombine/cos-2.ll
index c9a9c7c07712..a85cc8fa6bde 100644
--- a/test/Transforms/InstCombine/cos-2.ll
+++ b/test/Transforms/InstCombine/cos-2.ll
@@ -1,12 +1,11 @@
-; Test that the cos library call simplifier works correctly.
-;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 declare float @cos(double)
+declare signext i8 @sqrt(...)
 
-; Check that cos functions with the wrong prototype aren't simplified.
+; Check that functions with the wrong prototype aren't simplified.
 
 define float @test_no_simplify1(double %d) {
 ; CHECK-LABEL: @test_no_simplify1(
@@ -15,3 +14,14 @@ define float @test_no_simplify1(double %d) {
 ; CHECK: call float @cos(double %neg)
   ret float %cos
 }
+
+
+define i8 @bogus_sqrt() {
+  %fake_sqrt = call signext i8 (...) @sqrt()
+  ret i8 %fake_sqrt
+
+; CHECK-LABEL: bogus_sqrt(
+; CHECK-NEXT:  %fake_sqrt = call signext i8 (...) @sqrt()
+; CHECK-NEXT:  ret i8 %fake_sqrt
+}
+
diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll
index 319ea3259830..74f3ebbf5230 100644
--- a/test/Transforms/InstCombine/double-float-shrink-1.ll
+++ b/test/Transforms/InstCombine/double-float-shrink-1.ll
@@ -364,6 +364,26 @@ define float @max1(float %a, float %b) {
 ; CHECK-NEXT:  ret
 }
+; A function can have a name that matches a common libcall,
+; but with the wrong type(s). Let it be.
+
+define float @fake_fmin(float %a, float %b) {
+  %c = fpext float %a to fp128
+  %d = fpext float %b to fp128
+  %e = call fp128 @fmin(fp128 %c, fp128 %d)
+  %f = fptrunc fp128 %e to float
+  ret float %f
+
+; CHECK-LABEL: fake_fmin(
+; CHECK-NEXT:  %c = fpext float %a to fp128
+; CHECK-NEXT:  %d = fpext float %b to fp128
+; CHECK-NEXT:  %e = call fp128 @fmin(fp128 %c, fp128 %d)
+; CHECK-NEXT:  %f = fptrunc fp128 %e to float
+; CHECK-NEXT:  ret float %f
+}
+
+declare fp128 @fmin(fp128, fp128) ; This is not the 'fmin' you're looking for.
+
 
 declare double @fmax(double, double)
 
 declare double @tanh(double) #1
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 67e7cbd7686a..a76ec11fb1da 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -16,6 +16,7 @@
 #include "OrcLazyJIT.h"
 #include "RemoteJITUtils.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/CodeGen/LinkAllCodegenComponents.h"
@@ -741,11 +742,11 @@ std::unique_ptr<FDRPCChannel> launchRemote() {
       ChildPath.reset(new char[ChildExecPath.size() + 1]);
       std::copy(ChildExecPath.begin(), ChildExecPath.end(), &ChildPath[0]);
       ChildPath[ChildExecPath.size()] = '\0';
-      std::string ChildInStr = std::to_string(PipeFD[0][0]);
+      std::string ChildInStr = utostr(PipeFD[0][0]);
       ChildIn.reset(new char[ChildInStr.size() + 1]);
       std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]);
       ChildIn[ChildInStr.size()] = '\0';
-      std::string ChildOutStr = std::to_string(PipeFD[1][1]);
+      std::string ChildOutStr = utostr(PipeFD[1][1]);
       ChildOut.reset(new char[ChildOutStr.size() + 1]);
       std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]);
       ChildOut[ChildOutStr.size()] = '\0';
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index fb50160f5fed..c3884ba31756 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -65,11 +65,6 @@ function usage() {
     echo " -no-openmp           Disable check-out & build libomp"
 }
 
-if [ `uname -s` = "Darwin" ]; then
-  # compiler-rt doesn't yet build with CMake on Darwin.
-  use_autoconf="yes"
-fi
-
 while [ $# -gt 0 ]; do
     case $1 in
         -release | --release )
@@ -288,10 +283,20 @@ function export_sources() {
     if [ ! -h clang ]; then
         ln -s ../../cfe.src clang
     fi
-    cd $BuildDir/llvm.src/tools/clang/tools
-    if [ ! -h extra ]; then
-        ln -s ../../../../clang-tools-extra.src extra
+
+    # The autoconf and CMake builds want different symlinks here:
+    if [ "$use_autoconf" = "yes" ]; then
+      cd $BuildDir/llvm.src/tools/clang/tools
+      if [ ! -h extra ]; then
+          ln -s ../../../../clang-tools-extra.src extra
+      fi
+    else
+      cd $BuildDir/cfe.src/tools
+      if [ ! -h extra ]; then
+          ln -s ../../clang-tools-extra.src extra
+      fi
     fi
+
     cd $BuildDir/llvm.src/projects
     if [ -d $BuildDir/test-suite.src ] && [ ! -h test-suite ]; then
         ln -s ../../test-suite.src test-suite
