Diffstat (limited to 'lib')
84 files changed, 2741 insertions, 2023 deletions
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 1d2ffc1abe1f..6387bb36166e 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -80,10 +80,15 @@ static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount, return CalleeInfo::HotnessType::None; } -static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, - const Function &F, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, - bool HasLocalsInUsed) { +static bool isNonRenamableLocal(const GlobalValue &GV) { + return GV.hasSection() && GV.hasLocalLinkage(); +} + +static void +computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, + const Function &F, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool HasLocalsInUsed, + DenseSet<GlobalValue::GUID> &CantBePromoted) { // Summary not currently supported for anonymous functions, they should // have been named. assert(F.hasName()); @@ -178,37 +183,64 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, } } - GlobalValueSummary::GVFlags Flags(F); + bool NonRenamableLocal = isNonRenamableLocal(F); + bool NotEligibleForImport = + NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || + // Inliner doesn't handle variadic functions. + // FIXME: refactor this to use the same code that inliner is using. + F.isVarArg(); + GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, + /* LiveRoot = */ false); auto FuncSummary = llvm::make_unique<FunctionSummary>( Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(), TypeTests.takeVector()); - if (HasInlineAsmMaybeReferencingInternal) - FuncSummary->setHasInlineAsmMaybeReferencingInternal(); + if (NonRenamableLocal) + CantBePromoted.insert(F.getGUID()); Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary)); } -static void computeVariableSummary(ModuleSummaryIndex &Index, - const GlobalVariable &V) { +static void +computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, + DenseSet<GlobalValue::GUID> &CantBePromoted) { SetVector<ValueInfo> RefEdges; SmallPtrSet<const User *, 8> Visited; findRefEdges(&V, RefEdges, Visited); - GlobalValueSummary::GVFlags Flags(V); + bool NonRenamableLocal = isNonRenamableLocal(V); + GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto GVarSummary = llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector()); + if (NonRenamableLocal) + CantBePromoted.insert(V.getGUID()); Index.addGlobalValueSummary(V.getName(), std::move(GVarSummary)); } -static void computeAliasSummary(ModuleSummaryIndex &Index, - const GlobalAlias &A) { - GlobalValueSummary::GVFlags Flags(A); +static void +computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, + DenseSet<GlobalValue::GUID> &CantBePromoted) { + bool NonRenamableLocal = isNonRenamableLocal(A); + GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto AS = llvm::make_unique<AliasSummary>(Flags, ArrayRef<ValueInfo>{}); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); assert(AliaseeSummary && "Alias expects aliasee summary to be parsed"); AS->setAliasee(AliaseeSummary); + if (NonRenamableLocal) + CantBePromoted.insert(A.getGUID()); Index.addGlobalValueSummary(A.getName(), std::move(AS)); } +// Set LiveRoot flag on entries matching the given value name. 
+static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) { + auto SummaryList = + Index.findGlobalValueSummaryList(GlobalValue::getGUID(Name)); + if (SummaryList == Index.end()) + return; + for (auto &Summary : SummaryList->second) + Summary->setLiveRoot(); +} + ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback, @@ -226,9 +258,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false); // Next collect those in the llvm.compiler.used set. collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ true); + DenseSet<GlobalValue::GUID> CantBePromoted; for (auto *V : Used) { - if (V->hasLocalLinkage()) + if (V->hasLocalLinkage()) { LocalsUsed.insert(V); + CantBePromoted.insert(V->getGUID()); + } } // Compute summaries for all functions defined in module, and save in the @@ -248,7 +283,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( BFI = BFIPtr.get(); } - computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty()); + computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty(), + CantBePromoted); } // Compute summaries for all variables defined in module, and save in the @@ -256,20 +292,29 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (const GlobalVariable &G : M.globals()) { if (G.isDeclaration()) continue; - computeVariableSummary(Index, G); + computeVariableSummary(Index, G, CantBePromoted); } // Compute summaries for all aliases defined in module, and save in the // index. for (const GlobalAlias &A : M.aliases()) - computeAliasSummary(Index, A); + computeAliasSummary(Index, A, CantBePromoted); for (auto *V : LocalsUsed) { auto *Summary = Index.getGlobalValueSummary(*V); assert(Summary && "Missing summary for global value"); - Summary->setNoRename(); + Summary->setNotEligibleToImport(); } + // The linker doesn't know about these LLVM produced values, so we need + // to flag them as live in the index to ensure index-based dead value + // analysis treats them as live roots of the analysis. + setLiveRoot(Index, "llvm.used"); + setLiveRoot(Index, "llvm.compiler.used"); + setLiveRoot(Index, "llvm.global_ctors"); + setLiveRoot(Index, "llvm.global_dtors"); + setLiveRoot(Index, "llvm.global.annotations"); + if (!M.getModuleInlineAsm().empty()) { // Collect the local values defined by module level asm, and set up // summaries for these symbols so that they can be marked as NoRename, @@ -282,7 +327,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( // referenced from there. ModuleSymbolTable::CollectAsmSymbols( Triple(M.getTargetTriple()), M.getModuleInlineAsm(), - [&M, &Index](StringRef Name, object::BasicSymbolRef::Flags Flags) { + [&M, &Index, &CantBePromoted](StringRef Name, + object::BasicSymbolRef::Flags Flags) { // Symbols not marked as Weak or Global are local definitions. if (Flags & (object::BasicSymbolRef::SF_Weak | object::BasicSymbolRef::SF_Global)) @@ -291,11 +337,10 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( if (!GV) return; assert(GV->isDeclaration() && "Def in module asm already has definition"); - GlobalValueSummary::GVFlags GVFlags( - GlobalValue::InternalLinkage, - /* NoRename */ true, - /* HasInlineAsmMaybeReferencingInternal */ false, - /* IsNotViableToInline */ true); + GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, + /* NotEligibleToImport */ true, + /* LiveRoot */ true); + CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. 
if (isa<Function>(GV)) { std::unique_ptr<FunctionSummary> Summary = @@ -303,18 +348,41 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( GVFlags, 0, ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{}, ArrayRef<GlobalValue::GUID>{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } else { std::unique_ptr<GlobalVarSummary> Summary = llvm::make_unique<GlobalVarSummary>(GVFlags, ArrayRef<ValueInfo>{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } }); } + for (auto &GlobalList : Index) { + assert(GlobalList.second.size() == 1 && + "Expected module's index to have one summary per GUID"); + auto &Summary = GlobalList.second[0]; + bool AllRefsCanBeExternallyReferenced = + llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) { + return !CantBePromoted.count(VI.getValue()->getGUID()); + }); + if (!AllRefsCanBeExternallyReferenced) { + Summary->setNotEligibleToImport(); + continue; + } + + if (auto *FuncSummary = dyn_cast<FunctionSummary>(Summary.get())) { + bool AllCallsCanBeExternallyReferenced = llvm::all_of( + FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { + auto GUID = Edge.first.isGUID() ? Edge.first.getGUID() + : Edge.first.getValue()->getGUID(); + return !CantBePromoted.count(GUID); + }); + if (!AllCallsCanBeExternallyReferenced) + Summary->setNotEligibleToImport(); + } + } + return Index; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 2a15b9b264e3..cd8c24630df1 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -389,8 +389,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { } int TargetTransformInfo::getAddressComputationCost(Type *Tp, - bool IsComplex) const { - int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex); + ScalarEvolution *SE, + const SCEV *Ptr) const { + int Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 03aefcf57118..d9e249aad21d 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -801,12 +801,12 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to getDecodedLinkage() will need to be taken into account here as above. auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits RawFlags = RawFlags >> 4; - bool NoRename = RawFlags & 0x1; - bool IsNotViableToInline = RawFlags & 0x2; - bool HasInlineAsmMaybeReferencingInternal = RawFlags & 0x4; - return GlobalValueSummary::GVFlags(Linkage, NoRename, - HasInlineAsmMaybeReferencingInternal, - IsNotViableToInline); + bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3; + // The LiveRoot flag wasn't introduced until version 3. For dead stripping + // to work correctly on earlier versions, we must conservatively treat all + // values as live. 
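// --- Annotation (illustrative, not part of the patch) ---
// With summary version 3, the raw flag word decoded above carries the linkage
// in its low 4 bits, then NotEligibleToImport and LiveRoot in the next two
// bits. A minimal sketch of the packing, assuming the encoder folds the
// linkage in below the two flag bits exactly as this decoder unpacks it (the
// writer's getEncodedGVSummaryFlags later in this diff sets the same two bits):
static uint64_t encodeGVSummaryFlagsSketch(GlobalValue::LinkageTypes Linkage,
                                           bool NotEligibleToImport,
                                           bool LiveRoot) {
  uint64_t Raw = NotEligibleToImport;     // becomes bit 4 overall
  Raw |= uint64_t(LiveRoot) << 1;         // becomes bit 5 overall
  return (Raw << 4) | uint64_t(Linkage);  // low 4 bits hold the linkage
}
// Readers seeing Version < 3 force both flags on, since older producers never
// emitted them and treating every value as live and not importable is the
// conservative choice.
// --- end annotation ---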
+ bool LiveRoot = (RawFlags & 0x2) || Version < 3; + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { @@ -4838,9 +4838,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary( } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (!IsOldProfileFormat && Version != 2) + if (Version < 1 || Version > 3) return error("Invalid summary version " + Twine(Version) + - ", 1 or 2 expected"); + ", 1, 2 or 3 expected"); Record.clear(); // Keep around the last seen summary to be used when we see an optional diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp index 43c9aebd79ef..771cf3d927bc 100644 --- a/lib/Bitcode/Reader/BitstreamReader.cpp +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -93,20 +93,29 @@ static void skipAbbreviatedField(BitstreamCursor &Cursor, } /// skipRecord - Read the current record and discard it. -void BitstreamCursor::skipRecord(unsigned AbbrevID) { +unsigned BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip unabbreviated records by reading past their entries. if (AbbrevID == bitc::UNABBREV_RECORD) { unsigned Code = ReadVBR(6); - (void)Code; unsigned NumElts = ReadVBR(6); for (unsigned i = 0; i != NumElts; ++i) (void)ReadVBR64(6); - return; + return Code; } const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0); + unsigned Code; + if (CodeOp.isLiteral()) + Code = CodeOp.getLiteralValue(); + else { + if (CodeOp.getEncoding() == BitCodeAbbrevOp::Array || + CodeOp.getEncoding() == BitCodeAbbrevOp::Blob) + report_fatal_error("Abbreviation starts with an Array or a Blob"); + Code = readAbbreviatedField(*this, CodeOp); + } - for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { + for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i < e; ++i) { const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); if (Op.isLiteral()) continue; @@ -164,6 +173,7 @@ void BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip over the blob. 
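// --- Annotation (illustrative, not part of the patch) ---
// skipRecord now returns the record code, so a caller can peek at a record
// cheaply and rewind only for the ones worth decoding in full. The metadata
// lazy-loading index added later in this diff relies on exactly this pattern;
// the helper name below is hypothetical:
static void peekThenMaybeDecode(BitstreamCursor &Cursor, BitstreamEntry Entry,
                                SmallVectorImpl<uint64_t> &Record,
                                StringRef &Blob) {
  uint64_t SavedPos = Cursor.GetCurrentBitNo();
  unsigned Code = Cursor.skipRecord(Entry.ID);   // skip, but learn the code
  if (Code == bitc::METADATA_STRINGS) {
    Cursor.JumpToBit(SavedPos);                  // rewind to the record start
    Cursor.readRecord(Entry.ID, Record, &Blob);  // decode it fully this time
  }
}
// --- end annotation ---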
JumpToBit(NewEnd); } + return Code; } unsigned BitstreamCursor::readRecord(unsigned AbbrevID, @@ -273,7 +283,7 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID, } void BitstreamCursor::ReadAbbrevRecord() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); unsigned NumOpInfo = ReadVBR(5); for (unsigned i = 0; i != NumOpInfo; ++i) { bool IsLiteral = Read(1); @@ -307,7 +317,7 @@ void BitstreamCursor::ReadAbbrevRecord() { if (Abbv->getNumOperandInfos() == 0) report_fatal_error("Abbrev record with no operands"); - CurAbbrevs.push_back(Abbv); + CurAbbrevs.push_back(std::move(Abbv)); } Optional<BitstreamBlockInfo> diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 5da421a79b7b..460d39cc28d8 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -14,10 +14,12 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" @@ -86,12 +88,23 @@ using namespace llvm; +#define DEBUG_TYPE "bitcode-reader" + +STATISTIC(NumMDStringLoaded, "Number of MDStrings loaded"); +STATISTIC(NumMDNodeTemporary, "Number of MDNode::Temporary created"); +STATISTIC(NumMDRecordLoaded, "Number of Metadata records loaded"); + /// Flag whether we need to import full type definitions for ThinLTO. /// Currently needed for Darwin and LLDB. static cl::opt<bool> ImportFullTypeDefinitions( "import-full-type-definitions", cl::init(false), cl::Hidden, cl::desc("Import full type definitions for ThinLTO.")); +static cl::opt<bool> DisableLazyLoading( + "disable-ondemand-mds-loading", cl::init(false), cl::Hidden, + cl::desc("Force disable the lazy-loading on-demand of metadata when " + "loading bitcode for importing.")); + namespace { static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; } @@ -165,6 +178,10 @@ public: void assignValue(Metadata *MD, unsigned Idx); void tryToResolveCycles(); bool hasFwdRefs() const { return !ForwardReference.empty(); } + int getNextFwdRef() { + assert(hasFwdRefs()); + return *ForwardReference.begin(); + } /// Upgrade a type that had an MDString reference. void addTypeRef(MDString &UUID, DICompositeType &CT); @@ -215,6 +232,7 @@ Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) { ForwardReference.insert(Idx); // Create and return a placeholder, which will later be RAUW'd. + ++NumMDNodeTemporary; Metadata *MD = MDNode::getTemporary(Context, None).release(); MetadataPtrs[Idx].reset(MD); return MD; @@ -340,8 +358,26 @@ class PlaceholderQueue { std::deque<DistinctMDOperandPlaceholder> PHs; public: + bool empty() { return PHs.empty(); } DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID); void flush(BitcodeReaderMetadataList &MetadataList); + + /// Return the list of temporaries nodes in the queue, these need to be + /// loaded before we can flush the queue. 
+ void getTemporaries(BitcodeReaderMetadataList &MetadataList, + DenseSet<unsigned> &Temporaries) { + for (auto &PH : PHs) { + auto ID = PH.getID(); + auto *MD = MetadataList.lookup(ID); + if (!MD) { + Temporaries.insert(ID); + continue; + } + auto *N = dyn_cast_or_null<MDNode>(MD); + if (N && N->isTemporary()) + Temporaries.insert(ID); + } + } }; } // end anonymous namespace @@ -375,6 +411,30 @@ class MetadataLoader::MetadataLoaderImpl { Module &TheModule; std::function<Type *(unsigned)> getTypeByID; + /// Cursor associated with the lazy-loading of Metadata. This is the easy way + /// to keep around the right "context" (Abbrev list) to be able to jump in + /// the middle of the metadata block and load any record. + BitstreamCursor IndexCursor; + + /// Index that keeps track of MDString values. + std::vector<StringRef> MDStringRef; + + /// On-demand loading of a single MDString. Requires the index above to be + /// populated. + MDString *lazyLoadOneMDString(unsigned Idx); + + /// Index that keeps track of where to find a metadata record in the stream. + std::vector<uint64_t> GlobalMetadataBitPosIndex; + + /// Populate the index above to enable lazily loading of metadata, and load + /// the named metadata as well as the transitively referenced global + /// Metadata. + Expected<bool> lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders); + + /// On-demand loading of a single metadata. Requires the index above to be + /// populated. + void lazyLoadOneMetadata(unsigned Idx, PlaceholderQueue &Placeholders); + // Keep mapping of seens pair of old-style CU <-> SP, and update pointers to // point from SP to CU after a block is completly parsed. std::vector<std::pair<DICompileUnit *, Metadata *>> CUSubprograms; @@ -394,13 +454,25 @@ class MetadataLoader::MetadataLoaderImpl { Error parseOneMetadata(SmallVectorImpl<uint64_t> &Record, unsigned Code, PlaceholderQueue &Placeholders, StringRef Blob, - bool ModuleLevel, unsigned &NextMetadataNo); + unsigned &NextMetadataNo); Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob, - unsigned &NextMetadataNo); + std::function<void(StringRef)> CallBack); Error parseGlobalObjectAttachment(GlobalObject &GO, ArrayRef<uint64_t> Record); Error parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record); + void resolveForwardRefsAndPlaceholders(PlaceholderQueue &Placeholders); + + /// Upgrade old-style CU <-> SP pointers to point from SP to CU. + void upgradeCUSubprograms() { + for (auto CU_SP : CUSubprograms) + if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second)) + for (auto &Op : SPs->operands()) + if (auto *SP = dyn_cast_or_null<MDNode>(Op)) + SP->replaceOperandWith(7, CU_SP.first); + CUSubprograms.clear(); + } + public: MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule, BitcodeReaderValueList &ValueList, @@ -444,20 +516,217 @@ Error error(const Twine &Message) { Message, make_error_code(BitcodeError::CorruptedBitcode)); } +Expected<bool> MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock( + PlaceholderQueue &Placeholders) { + IndexCursor = Stream; + SmallVector<uint64_t, 64> Record; + // Get the abbrevs, and preload record positions to make them lazy-loadable. + while (true) { + BitstreamEntry Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. 
+ case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: { + return true; + } + case BitstreamEntry::Record: { + // The interesting case. + ++NumMDRecordLoaded; + uint64_t CurrentPos = IndexCursor.GetCurrentBitNo(); + auto Code = IndexCursor.skipRecord(Entry.ID); + switch (Code) { + case bitc::METADATA_STRINGS: { + // Rewind and parse the strings. + IndexCursor.JumpToBit(CurrentPos); + StringRef Blob; + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record, &Blob); + unsigned NumStrings = Record[0]; + MDStringRef.reserve(NumStrings); + auto IndexNextMDString = [&](StringRef Str) { + MDStringRef.push_back(Str); + }; + if (auto Err = parseMetadataStrings(Record, Blob, IndexNextMDString)) + return std::move(Err); + break; + } + case bitc::METADATA_INDEX_OFFSET: { + // This is the offset to the index, when we see this we skip all the + // records and load only an index to these. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() != 2) + return error("Invalid record"); + auto Offset = Record[0] + (Record[1] << 32); + auto BeginPos = IndexCursor.GetCurrentBitNo(); + IndexCursor.JumpToBit(BeginPos + Offset); + Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + assert(Entry.Kind == BitstreamEntry::Record && + "Corrupted bitcode: Expected `Record` when trying to find the " + "Metadata index"); + Record.clear(); + auto Code = IndexCursor.readRecord(Entry.ID, Record); + (void)Code; + assert(Code == bitc::METADATA_INDEX && "Corrupted bitcode: Expected " + "`METADATA_INDEX` when trying " + "to find the Metadata index"); + + // Delta unpack + auto CurrentValue = BeginPos; + GlobalMetadataBitPosIndex.reserve(Record.size()); + for (auto &Elt : Record) { + CurrentValue += Elt; + GlobalMetadataBitPosIndex.push_back(CurrentValue); + } + break; + } + case bitc::METADATA_INDEX: + // We don't expect to get there, the Index is loaded when we encounter + // the offset. + return error("Corrupted Metadata block"); + case bitc::METADATA_NAME: { + // Named metadata need to be materialized now and aren't deferred. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + unsigned Code = IndexCursor.readRecord(Entry.ID, Record); + assert(Code == bitc::METADATA_NAME); + + // Read name of the named metadata. + SmallString<8> Name(Record.begin(), Record.end()); + Code = IndexCursor.ReadCode(); + + // Named Metadata comes in two parts, we expect the name to be followed + // by the node + Record.clear(); + unsigned NextBitCode = IndexCursor.readRecord(Code, Record); + assert(NextBitCode == bitc::METADATA_NAMED_NODE); + (void)NextBitCode; + + // Read named metadata elements. + unsigned Size = Record.size(); + NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name); + for (unsigned i = 0; i != Size; ++i) { + // FIXME: We could use a placeholder here, however NamedMDNode are + // taking MDNode as operand and not using the Metadata infrastructure. + // It is acknowledged by 'TODO: Inherit from Metadata' in the + // NamedMDNode class definition. + MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]); + assert(MD && "Invalid record"); + NMD->addOperand(MD); + } + break; + } + case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { + // FIXME: we need to do this early because we don't materialize global + // value explicitly. 
+ IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() % 2 == 0) + return error("Invalid record"); + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size()) + return error("Invalid record"); + if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID])) + if (Error Err = parseGlobalObjectAttachment( + *GO, ArrayRef<uint64_t>(Record).slice(1))) + return std::move(Err); + break; + } + case bitc::METADATA_KIND: + case bitc::METADATA_STRING_OLD: + case bitc::METADATA_OLD_FN_NODE: + case bitc::METADATA_OLD_NODE: + case bitc::METADATA_VALUE: + case bitc::METADATA_DISTINCT_NODE: + case bitc::METADATA_NODE: + case bitc::METADATA_LOCATION: + case bitc::METADATA_GENERIC_DEBUG: + case bitc::METADATA_SUBRANGE: + case bitc::METADATA_ENUMERATOR: + case bitc::METADATA_BASIC_TYPE: + case bitc::METADATA_DERIVED_TYPE: + case bitc::METADATA_COMPOSITE_TYPE: + case bitc::METADATA_SUBROUTINE_TYPE: + case bitc::METADATA_MODULE: + case bitc::METADATA_FILE: + case bitc::METADATA_COMPILE_UNIT: + case bitc::METADATA_SUBPROGRAM: + case bitc::METADATA_LEXICAL_BLOCK: + case bitc::METADATA_LEXICAL_BLOCK_FILE: + case bitc::METADATA_NAMESPACE: + case bitc::METADATA_MACRO: + case bitc::METADATA_MACRO_FILE: + case bitc::METADATA_TEMPLATE_TYPE: + case bitc::METADATA_TEMPLATE_VALUE: + case bitc::METADATA_GLOBAL_VAR: + case bitc::METADATA_LOCAL_VAR: + case bitc::METADATA_EXPRESSION: + case bitc::METADATA_OBJC_PROPERTY: + case bitc::METADATA_IMPORTED_ENTITY: + case bitc::METADATA_GLOBAL_VAR_EXPR: + // We don't expect to see any of these, if we see one, give up on + // lazy-loading and fallback. + MDStringRef.clear(); + GlobalMetadataBitPosIndex.clear(); + return false; + } + break; + } + } + } +} + /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing /// module level metadata. Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { if (!ModuleLevel && MetadataList.hasFwdRefs()) return error("Invalid metadata: fwd refs into function blocks"); + // Record the entry position so that we can jump back here and efficiently + // skip the whole block in case we lazy-load. + auto EntryPos = Stream.GetCurrentBitNo(); + if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return error("Invalid record"); - unsigned NextMetadataNo = MetadataList.size(); SmallVector<uint64_t, 64> Record; - PlaceholderQueue Placeholders; + // We lazy-load module-level metadata: we build an index for each record, and + // then load individual record as needed, starting with the named metadata. + if (ModuleLevel && IsImporting && MetadataList.empty() && + !DisableLazyLoading) { + auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders); + if (!SuccessOrErr) + return SuccessOrErr.takeError(); + if (SuccessOrErr.get()) { + // An index was successfully created and we will be able to load metadata + // on-demand. + MetadataList.resize(MDStringRef.size() + + GlobalMetadataBitPosIndex.size()); + + // Reading the named metadata created forward references and/or + // placeholders, that we flush here. + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); + // Return at the beginning of the block, since it is easy to skip it + // entirely from there. + Stream.ReadBlockEnd(); // Pop the abbrev block context. + Stream.JumpToBit(EntryPos); + if (Stream.SkipBlock()) + return error("Invalid record"); + return Error::success(); + } + // Couldn't load an index, fallback to loading all the block "old-style". 
+ } + + unsigned NextMetadataNo = MetadataList.size(); + // Read all the records. while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -467,16 +736,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Upgrade old-style CU <-> SP pointers to point from SP to CU. - for (auto CU_SP : CUSubprograms) - if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second)) - for (auto &Op : SPs->operands()) - if (auto *SP = dyn_cast_or_null<MDNode>(Op)) - SP->replaceOperandWith(7, CU_SP.first); - CUSubprograms.clear(); - - MetadataList.tryToResolveCycles(); - Placeholders.flush(MetadataList); + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -486,20 +747,86 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { // Read a record. Record.clear(); StringRef Blob; + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob); - if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, - ModuleLevel, NextMetadataNo)) + if (Error Err = + parseOneMetadata(Record, Code, Placeholders, Blob, NextMetadataNo)) return Err; } } +MDString *MetadataLoader::MetadataLoaderImpl::lazyLoadOneMDString(unsigned ID) { + ++NumMDStringLoaded; + if (Metadata *MD = MetadataList.lookup(ID)) + return cast<MDString>(MD); + auto MDS = MDString::get(Context, MDStringRef[ID]); + MetadataList.assignValue(MDS, ID); + return MDS; +} + +void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata( + unsigned ID, PlaceholderQueue &Placeholders) { + assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size()); + assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString"); +#ifndef NDEBUG + // Lookup first if the metadata hasn't already been loaded. + if (auto *MD = MetadataList.lookup(ID)) { + auto *N = dyn_cast_or_null<MDNode>(MD); + assert(N && N->isTemporary() && "Lazy loading an already loaded metadata"); + } +#endif + SmallVector<uint64_t, 64> Record; + StringRef Blob; + IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]); + auto Entry = IndexCursor.advanceSkippingSubblocks(); + ++NumMDRecordLoaded; + unsigned Code = IndexCursor.readRecord(Entry.ID, Record, &Blob); + if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, ID)) + report_fatal_error("Can't lazyload MD"); +} + +/// Ensure that all forward-references and placeholders are resolved. +/// Iteratively lazy-loading metadata on-demand if needed. +void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders( + PlaceholderQueue &Placeholders) { + DenseSet<unsigned> Temporaries; + while (1) { + // Populate Temporaries with the placeholders that haven't been loaded yet. + Placeholders.getTemporaries(MetadataList, Temporaries); + + // If we don't have any temporary, or FwdReference, we're done! + if (Temporaries.empty() && !MetadataList.hasFwdRefs()) + break; + + // First, load all the temporaries. This can add new placeholders or + // forward references. + for (auto ID : Temporaries) + lazyLoadOneMetadata(ID, Placeholders); + Temporaries.clear(); + + // Second, load the forward-references. This can also add new placeholders + // or forward references. 
+ while (MetadataList.hasFwdRefs()) + lazyLoadOneMetadata(MetadataList.getNextFwdRef(), Placeholders); + } + // At this point we don't have any forward reference remaining, or temporary + // that haven't been loaded. We can safely drop RAUW support and mark cycles + // as resolved. + MetadataList.tryToResolveCycles(); + + // Finally, everything is in place, we can replace the placeholders operands + // with the final node they refer to. + Placeholders.flush(MetadataList); +} + Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( SmallVectorImpl<uint64_t> &Record, unsigned Code, - PlaceholderQueue &Placeholders, StringRef Blob, bool ModuleLevel, - unsigned &NextMetadataNo) { + PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) { bool IsDistinct = false; auto getMD = [&](unsigned ID) -> Metadata * { + if (ID < MDStringRef.size()) + return lazyLoadOneMDString(ID); if (!IsDistinct) return MetadataList.getMetadataFwdRef(ID); if (auto *MD = MetadataList.getMetadataIfResolved(ID)) @@ -519,7 +846,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( auto getMDString = [&](unsigned ID) -> MDString * { // This requires that the ID is not really a forward reference. In // particular, the MDString must already have been resolved. - return cast_or_null<MDString>(getMDOrNull(ID)); + auto MDS = getMDOrNull(ID); + return cast_or_null<MDString>(MDS); }; // Support for old type refs. @@ -539,6 +867,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Record.clear(); Code = Stream.ReadCode(); + ++NumMDRecordLoaded; unsigned NextBitCode = Stream.readRecord(Code, Record); if (NextBitCode != bitc::METADATA_NAMED_NODE) return error("METADATA_NAME not followed by METADATA_NAMED_NODE"); @@ -1137,15 +1466,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // Test for upgrading !llvm.loop. HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String); - + ++NumMDStringLoaded; Metadata *MD = MDString::get(Context, String); MetadataList.assignValue(MD, NextMetadataNo++); break; } - case bitc::METADATA_STRINGS: - if (Error Err = parseMetadataStrings(Record, Blob, NextMetadataNo)) + case bitc::METADATA_STRINGS: { + auto CreateNextMDString = [&](StringRef Str) { + ++NumMDStringLoaded; + MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo++); + }; + if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString)) return Err; break; + } case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { if (Record.size() % 2 == 0) return error("Invalid record"); @@ -1166,12 +1500,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } } -#undef GET_OR_DISTINCT return Error::success(); +#undef GET_OR_DISTINCT } Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( - ArrayRef<uint64_t> Record, StringRef Blob, unsigned &NextMetadataNo) { + ArrayRef<uint64_t> Record, StringRef Blob, + std::function<void(StringRef)> CallBack) { // All the MDStrings in the block are emitted together in a single // record. The strings are concatenated and stored in a blob along with // their sizes. 
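// --- Annotation (illustrative, not part of the patch) ---
// parseMetadataStrings now takes a callback instead of assigning MDStrings
// itself, so the same blob decoding serves both paths added in this diff: the
// indexing pass only records StringRefs for later on-demand loading, while the
// eager path materializes MDStrings immediately. Both callbacks appear
// verbatim earlier in this diff:
auto IndexNextMDString = [&](StringRef Str) {      // lazy / indexing path
  MDStringRef.push_back(Str);
};
auto CreateNextMDString = [&](StringRef Str) {     // eager path
  ++NumMDStringLoaded;
  MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo++);
};
// --- end annotation ---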
@@ -1197,8 +1532,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( if (Strings.size() < Size) return error("Invalid record: metadata strings truncated chars"); - MetadataList.assignValue(MDString::get(Context, Strings.slice(0, Size)), - NextMetadataNo++); + CallBack(Strings.slice(0, Size)); Strings = Strings.drop_front(Size); } while (--NumStrings); @@ -1228,6 +1562,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( SmallVector<uint64_t, 64> Record; + PlaceholderQueue Placeholders; + while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -1236,6 +1572,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: + resolveForwardRefsAndPlaceholders(Placeholders); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -1244,6 +1581,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( // Read a metadata attachment record. Record.clear(); + ++NumMDRecordLoaded; switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; @@ -1268,7 +1606,14 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( if (I->second == LLVMContext::MD_tbaa && StripTBAA) continue; - Metadata *Node = MetadataList.getMetadataFwdRef(Record[i + 1]); + auto Idx = Record[i + 1]; + if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) && + !MetadataList.lookup(Idx)) + // Load the attachment if it is in the lazy-loadable range and hasn't + // been loaded yet. + lazyLoadOneMetadata(Idx, Placeholders); + + Metadata *Node = MetadataList.getMetadataFwdRef(Idx); if (isa<LocalAsMetadata>(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. @@ -1331,6 +1676,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() { // Read a record. Record.clear(); + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record); switch (Code) { default: // Default behavior: ignore. diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c10ba2399e71..ebb2022551f7 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -784,53 +784,53 @@ void ModuleBitcodeWriter::writeTypeTable() { uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); // Abbrev for TYPE_CODE_POINTER. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 - unsigned PtrAbbrev = Stream.EmitAbbrev(Abbv); + unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_FUNCTION. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned FunctionAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_ANON. 
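// --- Annotation (illustrative, not part of the patch) ---
// Every abbreviation definition in this writer now follows the same ownership
// pattern: build the BitCodeAbbrev in a std::shared_ptr and hand it to the
// stream with std::move, rather than passing a raw owning pointer. Presumably
// this lets abbreviation objects be shared between block-info structures and
// per-block abbreviation lists (as in the reader's CurAbbrevs change above)
// without manual deletes. The recurring shape:
auto Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // stream shares ownership
// --- end annotation ---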
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructAnonAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAME. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned StructNameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAMED. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructNamedAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_ARRAY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned ArrayAbbrev = Stream.EmitAbbrev(Abbv); + unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit an entry count so the reader can reserve space. TypeVals.push_back(TypeList.size()); @@ -971,9 +971,8 @@ static unsigned getEncodedLinkage(const GlobalValue &GV) { static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { uint64_t RawFlags = 0; - RawFlags |= Flags.NoRename; // bool - RawFlags |= (Flags.IsNotViableToInline << 1); - RawFlags |= (Flags.HasInlineAsmMaybeReferencingInternal << 2); + RawFlags |= Flags.NotEligibleToImport; // bool + RawFlags |= (Flags.LiveRoot << 1); // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. @@ -1059,13 +1058,13 @@ void BitcodeWriterBase::writeValueSymbolTableForwardDecl() { // which is written after the function blocks so that it can include // the offset of each function. The placeholder offset will be // updated when the real VST is written. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_VSTOFFSET)); // Blocks are 32-bit aligned, so we can use a 32-bit word offset to // hold the real VST offset. Must use fixed instead of VBR as we don't // know how many VBR chunks to reserve ahead of time. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit the placeholder uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0}; @@ -1155,7 +1154,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { unsigned SimpleGVarAbbrev = 0; if (!M.global_empty()) { // Add an abbrev for common globals with no visibility or thread localness. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxGlobalType+1))); @@ -1177,7 +1176,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(SectionMap.size()+1))); // Don't bother emitting vis + thread local. - SimpleGVarAbbrev = Stream.EmitAbbrev(Abbv); + SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // Emit the global variable information. @@ -1285,11 +1284,11 @@ void ModuleBitcodeWriter::writeModuleInfo() { AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7); // MODULE_CODE_SOURCE_FILENAME: [namechar x N] - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(AbbrevOpToUse); - unsigned FilenameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FilenameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); for (const auto P : M.getSourceFileName()) Vals.push_back((unsigned char)P); @@ -1360,14 +1359,14 @@ void ModuleBitcodeWriter::writeMDTuple(const MDTuple *N, unsigned ModuleBitcodeWriter::createDILocationAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeDILocation(const DILocation *N, @@ -1389,7 +1388,7 @@ void ModuleBitcodeWriter::writeDILocation(const DILocation *N, unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); @@ -1397,7 +1396,7 @@ unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N, @@ -1790,11 +1789,11 @@ void ModuleBitcodeWriter::writeDIImportedEntity( } unsigned ModuleBitcodeWriter::createNamedMetadataAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeNamedMetadata( @@ -1819,12 +1818,12 @@ void ModuleBitcodeWriter::writeNamedMetadata( } unsigned ModuleBitcodeWriter::createMetadataStringsAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRINGS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of strings Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // offset to chars Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } /// Write out a record for MDString. @@ -1918,17 +1917,17 @@ void ModuleBitcodeWriter::writeModuleMetadata() { MDAbbrevs[MetadataAbbrev::GenericDINodeAbbrevID] = createGenericDINodeAbbrev(); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX_OFFSET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned OffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned OffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - unsigned IndexAbbrev = Stream.EmitAbbrev(Abbv); + unsigned IndexAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit MDStrings together upfront. writeMetadataStrings(VE.getMDStrings(), Record); @@ -2125,30 +2124,30 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, // If this is a constant pool for the module, emit module-specific abbrevs. if (isGlobal) { // Abbrev for CST_CODE_AGGREGATE. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1))); - AggregateAbbrev = Stream.EmitAbbrev(Abbv); + AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_STRING. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - String8Abbrev = Stream.EmitAbbrev(Abbv); + String8Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - CString7Abbrev = Stream.EmitAbbrev(Abbv); + CString7Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - CString6Abbrev = Stream.EmitAbbrev(Abbv); + CString6Abbrev = Stream.EmitAbbrev(std::move(Abbv)); } SmallVector<uint64_t, 64> Record; @@ -2858,39 +2857,39 @@ void ModuleBitcodeWriter::writeValueSymbolTable( unsigned GUIDEntryAbbrev; if (IsModuleLevel && hasVSTOffsetPlaceholder()) { // 8-bit fixed-width VST_CODE_FNENTRY function strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry8BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry7BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry6BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // FIXME: Change the name of this record as it is now used by // the per-module index as well. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv); + GUIDEntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // FIXME: Set up the abbrev, we know how many values there are! 
@@ -2984,11 +2983,11 @@ void IndexBitcodeWriter::writeCombinedValueSymbolTable() { Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - unsigned EntryAbbrev = Stream.EmitAbbrev(Abbv); + unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector<uint64_t, 64> NameVals; for (const auto &GVI : valueIds()) { @@ -3121,7 +3120,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Stream.EnterBlockInfoBlock(); { // 8-bit fixed-width VST_CODE_ENTRY/VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3132,7 +3131,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // 7-bit fixed width VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3142,7 +3141,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3152,7 +3151,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3165,7 +3164,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // SETTYPE abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, VE.computeBitsRequiredForTypeIndicies())); @@ -3175,7 +3174,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INTEGER abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != @@ -3184,7 +3183,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // CE_CAST abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid @@ -3196,7 +3195,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // NULL abbrev for CONSTANTS_BLOCK. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != CONSTANTS_NULL_Abbrev) @@ -3206,7 +3205,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { // FIXME: This should only use space for first class types! { // INST_LOAD abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3218,7 +3217,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3228,7 +3227,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3239,7 +3238,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_CAST abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3251,14 +3250,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_RET_VOID_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3266,14 +3265,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_GEP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3296,38 +3295,38 @@ void IndexBitcodeWriter::writeModStrings() { // TODO: See which abbrev sizes we actually need to emit // 8-bit fixed-width MST_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - unsigned Abbrev8Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev8Bit = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - unsigned Abbrev7Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev7Bit = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev6Bit = Stream.EmitAbbrev(std::move(Abbv)); // Module Hash, 160 bits SHA1. Optionally, emitted after each MST_CODE_ENTRY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_HASH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned AbbrevHash = Stream.EmitAbbrev(Abbv); + unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv)); SmallVector<unsigned, 64> Vals; for (const auto &MPSE : Index.modulePaths()) { @@ -3435,7 +3434,7 @@ void ModuleBitcodeWriter::writeModuleLevelReferences( // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 2; +static const uint64_t INDEX_VERSION = 3; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3450,7 +3449,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { } // Abbrev for FS_PERMODULE. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3459,10 +3458,10 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_PROFILE. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3471,24 +3470,24 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_GLOBALVAR_INIT_REFS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector<uint64_t, 64> NameVals; // Iterate over the list of functions instead of the Index to @@ -3542,7 +3541,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION}); // Abbrev for FS_COMBINED. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3552,10 +3551,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_PROFILE. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3565,26 +3564,26 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. 
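The BitcodeWriter hunks above all apply one mechanical change: abbreviations are no longer allocated with a raw new and passed to the stream as bare pointers; they are built as std::shared_ptr<BitCodeAbbrev> and ownership is handed over with std::move when calling EmitAbbrev. A minimal sketch of the resulting idiom, using record codes that appear above:

  auto Abbv = std::make_shared<BitCodeAbbrev>();
  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
  unsigned FSAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // stream takes ownership

FSAbbrev is only an illustrative local name; the patch itself uses FSCallsAbbrev, FSModRefsAbbrev, and so on.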
@@ -3702,19 +3701,19 @@ void writeIdentificationBlock(BitstreamWriter &Stream) { Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5); // Write the "user readable" string identifying the bitcode producer - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - auto StringAbbrev = Stream.EmitAbbrev(Abbv); + auto StringAbbrev = Stream.EmitAbbrev(std::move(Abbv)); writeStringRecord(Stream, bitc::IDENTIFICATION_CODE_STRING, "LLVM" LLVM_VERSION_STRING, StringAbbrev); // Write the epoch version - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - auto EpochAbbrev = Stream.EmitAbbrev(Abbv); + auto EpochAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector<unsigned, 1> Vals = {bitc::BITCODE_CURRENT_EPOCH}; Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev); Stream.ExitBlock(); diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 0c79def87933..61149d9229b7 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -53,7 +53,8 @@ void ARMException::beginFunction(const MachineFunction *MF) { if (MoveType == AsmPrinter::CFI_M_Debug) { if (!hasEmittedCFISections) { - Asm->OutStreamer->EmitCFISections(false, true); + if (Asm->needsOnlyDebugCFIMoves()) + Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 5f15ac1d503b..9f6caa95a9ed 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -108,7 +108,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer) : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), - LastMI(nullptr), LastFn(0), Counter(~0U) { + isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { DD = nullptr; MMI = nullptr; LI = nullptr; @@ -264,6 +264,28 @@ bool AsmPrinter::doInitialization(Module &M) { } } + switch (MAI->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + isCFIMoveForDebugging = true; + if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI) + break; + for (auto &F: M.getFunctionList()) { + // If the module contains any function with unwind data, + // .eh_frame has to be emitted. + // Ignore functions that won't get emitted. 
+ if (!F.isDeclarationForLinker() && F.needsUnwindTableEntry()) { + isCFIMoveForDebugging = false; + break; + } + } + break; + default: + isCFIMoveForDebugging = false; + break; + } + EHStreamer *ES = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 20075e41977f..57864e4e4d4f 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -100,6 +100,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, } SourceMgr SrcMgr; + SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + SrcMgrDiagInfo DiagInfo; // If the current LLVMContext has an inline asm handler, set it in SourceMgr. diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index ef30e279aed2..e08306b001fb 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -137,7 +137,7 @@ void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB, return; if (!hasEmittedCFISections) { - if (Asm->needsCFIMoves() == AsmPrinter::CFI_M_Debug) + if (Asm->needsOnlyDebugCFIMoves()) Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index cf35afbc6e5f..89a042ffc477 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -125,8 +125,11 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; if (!MBB) { - MBB = MF->CreateMachineBasicBlock(); + MBB = MF->CreateMachineBasicBlock(&BB); MF->push_back(MBB); + + if (BB.hasAddressTaken()) + MBB->setHasAddressTaken(); } return *MBB; } @@ -195,6 +198,45 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { return true; } +bool IRTranslator::translateSwitch(const User &U, + MachineIRBuilder &MIRBuilder) { + // For now, just translate as a chain of conditional branches. + // FIXME: could we share most of the logic/code in + // SelectionDAGBuilder::visitSwitch between SelectionDAG and GlobalISel? + // At first sight, it seems most of the logic in there is independent of + // SelectionDAG-specifics and a lot of work went in to optimize switch + // lowering in there. 
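// For illustration only (not part of the patch): with this lowering, a switch such as
//   switch i32 %v, label %default [ i32 1, label %bb1
//                                   i32 2, label %bb2 ]
// is emitted as a chain of equality tests, with a fresh fall-through block per case:
//   %t1 = icmp eq %v, 1 ; brcond %t1 -> %bb1 ; br -> %ft1
//   ft1: %t2 = icmp eq %v, 2 ; brcond %t2 -> %bb2 ; br -> %ft2
//   ft2: br -> %default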
+ + const SwitchInst &SwInst = cast<SwitchInst>(U); + const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + + LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + for (auto &CaseIt : SwInst.cases()) { + const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); + const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); + MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + + MIRBuilder.buildBrCond(Tst, TrueBB); + CurBB.addSuccessor(&TrueBB); + + MachineBasicBlock *FalseBB = + MF->CreateMachineBasicBlock(SwInst.getParent()); + MF->push_back(FalseBB); + MIRBuilder.buildBr(*FalseBB); + CurBB.addSuccessor(FalseBB); + + MIRBuilder.setMBB(*FalseBB); + } + // handle default case + MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); + MIRBuilder.buildBr(DefaultBB); + MIRBuilder.getMBB().addSuccessor(&DefaultBB); + + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast<LoadInst>(U); diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index a6c93bc0f3d7..7d405dd92ac3 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -55,11 +55,10 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); - assert(!RegBanks[Idx]->isValid() && - "RegisterBank should be invalid before initialization"); - }); +#endif // NDEBUG } RegisterBankInfo::~RegisterBankInfo() { @@ -70,13 +69,15 @@ RegisterBankInfo::~RegisterBankInfo() { } bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { const RegisterBank &RegBank = getRegBank(Idx); assert(Idx == RegBank.getID() && "ID does not match the index in the array"); dbgs() << "Verify " << RegBank << '\n'; assert(RegBank.verify(TRI) && "RegBank is invalid"); - }); + } +#endif // NDEBUG return true; } diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0cac7b71e241..b9f3d86eabd8 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1495,16 +1495,18 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { if (TII->reverseBranchCondition(Cond)) llvm_unreachable("Unable to reverse branch condition!"); - // Initialize liveins to the first BB. These are potentiall redefined by - // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); - - // Compute a set of registers which must not be killed by instructions in - // BB1: This is everything live-in to BB2. DontKill.init(*TRI); - DontKill.addLiveIns(NextMBB); + + if (MRI->tracksLiveness()) { + // Initialize liveins to the first BB. These are potentiall redefined by + // predicated instructions. 
+ Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + // Compute a set of registers which must not be killed by instructions in + // BB1: This is everything live-in to BB2. + DontKill.addLiveIns(NextMBB); + } if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); @@ -1602,8 +1604,10 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // Initialize liveins to the first BB. These are potentially redefined by // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + } DontKill.clear(); @@ -1766,8 +1770,10 @@ bool IfConverter::IfConvertDiamondCommon( // instructions. We start with BB1 live-ins so we have the live-out regs // after tracking the BB1 instructions. Redefs.init(*TRI); - Redefs.addLiveIns(MBB1); - Redefs.addLiveIns(MBB2); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(MBB1); + Redefs.addLiveIns(MBB2); + } // Remove the duplicated instructions at the beginnings of both paths. // Skip dbg_value instructions @@ -1792,12 +1798,14 @@ bool IfConverter::IfConvertDiamondCommon( // This is everything used+live in BB2 after the duplicated instructions. We // can compute this set by simulating liveness backwards from the end of BB2. DontKill.init(*TRI); - for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) - DontKill.stepBackward(MI); + if (MRI->tracksLiveness()) { + for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) + DontKill.stepBackward(MI); - for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { - SmallVector<std::pair<unsigned, const MachineOperand*>, 4> IgnoredClobbers; - Redefs.stepForward(MI, IgnoredClobbers); + for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { + SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Dummy; + Redefs.stepForward(MI, Dummy); + } } BBI.BB->splice(BBI.BB->end(), &MBB1, MBB1.begin(), DI1); MBB2.erase(MBB2.begin(), DI2); diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index eb13d2d3ec0c..db87092177ca 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -488,16 +488,16 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { } // Print the live in registers. 
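The IfConverter hunks above and the MIRPrinter and MachineVerifier hunks below add the same guard: live-in lists are only consulted when MachineRegisterInfo reports that liveness is tracked. A rough sketch of the pattern, with use() standing in for whatever consumes the live-in data:

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (MRI.tracksLiveness()) {
    for (const auto &LI : MBB.liveins())
      use(LI.PhysReg, LI.LaneMask); // hypothetical consumer
  }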
- const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(TRI && "Expected target register info"); - if (!MBB.livein_empty()) { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + if (MRI.tracksLiveness() && !MBB.livein_empty()) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); OS.indent(2) << "liveins: "; bool First = true; for (const auto &LI : MBB.liveins()) { if (!First) OS << ", "; First = false; - printReg(LI.PhysReg, OS, TRI); + printReg(LI.PhysReg, OS, &TRI); if (!LI.LaneMask.all()) OS << ":0x" << PrintLaneMask(LI.LaneMask); } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 549424d257fe..3869f976854d 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -286,7 +286,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; - for (const auto &LI : make_range(livein_begin(), livein_end())) { + for (const auto &LI : LiveIns) { OS << ' ' << PrintReg(LI.PhysReg, TRI); if (!LI.LaneMask.all()) OS << ':' << PrintLaneMask(LI.LaneMask); @@ -1292,3 +1292,10 @@ MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const { void MachineBasicBlock::clearLiveIns() { LiveIns.clear(); } + +MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const { + assert(getParent()->getProperties().hasProperty( + MachineFunctionProperties::Property::TracksLiveness) && + "Liveness information is accurate"); + return LiveIns.begin(); +} diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 426a4666c649..a98139f9e5af 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -566,7 +566,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { FirstTerminator = nullptr; if (!MF->getProperties().hasProperty( - MachineFunctionProperties::Property::NoPHIs)) { + MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that // it is an entry block or landing pad. for (const auto &LI : MBB->liveins()) { @@ -741,14 +741,16 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } regsLive.clear(); - for (const auto &LI : MBB->liveins()) { - if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { - report("MBB live-in list contains non-physical register", MBB); - continue; + if (MRI->tracksLiveness()) { + for (const auto &LI : MBB->liveins()) { + if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { + report("MBB live-in list contains non-physical register", MBB); + continue; + } + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - regsLive.insert(*SubRegs); } regsLiveInButUnused = regsLive; diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index de1c35caa1a0..fdf741fd58f7 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -48,11 +48,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) { assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); - // It is not possible to use the register scavenger after late optimization - // passes that don't preserve accurate liveness information. 
- assert(MRI->tracksLiveness() && - "Cannot use register scavenger with inaccurate liveness"); - // Self-initialize. if (!this->MBB) { NumRegUnits = TRI->getNumRegUnits(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb803585282f..a07bd8f83546 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -7339,19 +7340,23 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, if (!Range) return Op; - Constant *Lo = cast<ConstantAsMetadata>(Range->getOperand(0))->getValue(); - if (!Lo->isNullValue()) + ConstantRange CR = getConstantRangeFromMetadata(*Range); + if (CR.isFullSet() || CR.isEmptySet() || CR.isWrappedSet()) return Op; - Constant *Hi = cast<ConstantAsMetadata>(Range->getOperand(1))->getValue(); - unsigned Bits = cast<ConstantInt>(Hi)->getValue().logBase2(); + APInt Lo = CR.getUnsignedMin(); + if (!Lo.isMinValue()) + return Op; + + APInt Hi = CR.getUnsignedMax(); + unsigned Bits = Hi.getActiveBits(); EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits); SDLoc SL = getCurSDLoc(); - SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), - Op, DAG.getValueType(SmallVT)); + SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op, + DAG.getValueType(SmallVT)); unsigned NumVals = Op.getNode()->getNumValues(); if (NumVals == 1) return ZExt; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index deec16330224..2aac3474654f 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -299,11 +299,8 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const { Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end()); } - DWARFDie Child = getFirstChild(); - while (Child) { + for (auto Child: children()) Child.collectChildrenAddressRanges(Ranges); - Child = Child.getSibling(); - } } bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index e6c9764f1133..2bbcb25275e4 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -468,6 +468,7 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.HandleInt = Flags.handle_int; Options.HandleSegv = Flags.handle_segv; Options.HandleTerm = Flags.handle_term; + Options.HandleXfsz = Flags.handle_xfsz; SetSignalHandler(Options); if (Flags.minimize_crash_internal_step) diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 08eaad9856be..22aad353acec 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -91,6 +91,7 @@ FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.") FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.") FUZZER_FLAG_INT(handle_int, 1, "If 1, try to intercept SIGINT.") FUZZER_FLAG_INT(handle_term, 1, "If 1, try to intercept SIGTERM.") +FUZZER_FLAG_INT(handle_xfsz, 1, "If 1, try to intercept SIGXFSZ.") FUZZER_FLAG_INT(close_fd_mask, 0, "If 1, close stdout at startup; " "if 2, close stderr; if 3, close both. " "Be careful, this will also close e.g. 
asan's stderr/stdout.") diff --git a/lib/Fuzzer/FuzzerIO.h b/lib/Fuzzer/FuzzerIO.h index 741fecf415b0..15bfd3d34727 100644 --- a/lib/Fuzzer/FuzzerIO.h +++ b/lib/Fuzzer/FuzzerIO.h @@ -37,6 +37,9 @@ std::string DirPlusFile(const std::string &DirPath, // Returns the name of the dir, similar to the 'dirname' utility. std::string DirName(const std::string &FileName); +// Returns path to a TmpDir. +std::string TmpDir(); + void DupAndCloseStderr(); void CloseStdout(); diff --git a/lib/Fuzzer/FuzzerIOPosix.cpp b/lib/Fuzzer/FuzzerIOPosix.cpp index 720bc1304594..6d8edf6ff538 100644 --- a/lib/Fuzzer/FuzzerIOPosix.cpp +++ b/lib/Fuzzer/FuzzerIOPosix.cpp @@ -83,6 +83,12 @@ std::string DirName(const std::string &FileName) { return Res; } +std::string TmpDir() { + if (auto Env = getenv("TMPDIR")) + return Env; + return "/tmp"; +} + } // namespace fuzzer #endif // LIBFUZZER_POSIX diff --git a/lib/Fuzzer/FuzzerIOWindows.cpp b/lib/Fuzzer/FuzzerIOWindows.cpp index a4738eb9dfe5..056f0721a336 100644 --- a/lib/Fuzzer/FuzzerIOWindows.cpp +++ b/lib/Fuzzer/FuzzerIOWindows.cpp @@ -277,6 +277,8 @@ std::string DirName(const std::string &FileName) { return FileName.substr(0, LocationLen + DirLen); } +std::string TmpDir() { return "TODO: implement TmpDir"; } + } // namespace fuzzer #endif // LIBFUZZER_WINDOWS diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index c041706092db..0d2c7a78aca8 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -82,6 +82,7 @@ public: static void StaticAlarmCallback(); static void StaticCrashSignalCallback(); static void StaticInterruptCallback(); + static void StaticFileSizeExceedCallback(); void ExecuteCallback(const uint8_t *Data, size_t Size); size_t RunOne(const uint8_t *Data, size_t Size); diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 1336f5e4aeeb..9f49d1557990 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -266,6 +266,11 @@ void Fuzzer::StaticInterruptCallback() { F->InterruptCallback(); } +void Fuzzer::StaticFileSizeExceedCallback() { + Printf("==%lu== ERROR: libFuzzer: file size exceeded\n", GetPid()); + exit(1); +} + void Fuzzer::CrashCallback() { Printf("==%lu== ERROR: libFuzzer: deadly signal\n", GetPid()); if (EF->__sanitizer_print_stack_trace) diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 84660e0fe53f..9e559115680c 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -220,8 +220,8 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args, ListFilesInDirRecursive(Corpora[i], nullptr, &AllFiles, /*TopDir*/true); Printf("MERGE-OUTER: %zd files, %zd in the initial corpus\n", AllFiles.size(), NumFilesInFirstCorpus); - std::string CFPath = - "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"; + auto CFPath = DirPlusFile(TmpDir(), + "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"); // Write the control file. RemoveFile(CFPath); std::ofstream ControlFile(CFPath); @@ -229,6 +229,11 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args, ControlFile << NumFilesInFirstCorpus << "\n"; for (auto &Path: AllFiles) ControlFile << Path << "\n"; + if (!ControlFile) { + Printf("MERGE-OUTER: failed to write to the control file: %s\n", + CFPath.c_str()); + exit(1); + } ControlFile.close(); // Execute the inner process untill it passes. @@ -246,6 +251,9 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args, // Read the control file and do the merge. 
Merger M; std::ifstream IF(CFPath); + IF.seekg(0, IF.end); + Printf("MERGE-OUTER: the control file has %zd bytes\n", (size_t)IF.tellg()); + IF.seekg(0, IF.beg); M.ParseOrExit(IF, true); IF.close(); std::vector<std::string> NewFiles; diff --git a/lib/Fuzzer/FuzzerOptions.h b/lib/Fuzzer/FuzzerOptions.h index cb702d285200..6f72205600b9 100644 --- a/lib/Fuzzer/FuzzerOptions.h +++ b/lib/Fuzzer/FuzzerOptions.h @@ -62,6 +62,7 @@ struct FuzzingOptions { bool HandleInt = false; bool HandleSegv = false; bool HandleTerm = false; + bool HandleXfsz = false; }; } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp index be62a6624b27..2ad9702fab0e 100644 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ b/lib/Fuzzer/FuzzerTraceState.cpp @@ -46,10 +46,6 @@ public: void TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, const uint8_t *Data2); - void TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, uint64_t Val, - size_t NumCases, uint64_t *Cases); - int TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize); int TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize); @@ -147,29 +143,6 @@ public: size_t AutoDictAdds = 0; }; -int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize) { - if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0; - ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem; - const uint8_t *UnitData; - auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData); - int Res = 0; - const uint8_t *Beg = UnitData; - const uint8_t *End = Beg + UnitSize; - for (const uint8_t *Cur = Beg; Cur < End; Cur++) { - Cur = (uint8_t *)SearchMemory(Cur, End - Cur, &PresentData, DataSize); - if (!Cur) - break; - size_t Pos = Cur - Beg; - assert(Pos < UnitSize); - AddMutation(Pos, DataSize, DesiredData); - AddMutation(Pos, DataSize, DesiredData + 1); - AddMutation(Pos, DataSize, DesiredData - 1); - Res++; - } - return Res; -} - int TraceState::TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize) { @@ -206,26 +179,6 @@ void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, } } -void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, - uint64_t Val, size_t NumCases, - uint64_t *Cases) { - if (F->InFuzzingThread()) return; - size_t ValSize = ValSizeInBits / 8; - bool TryShort = IsTwoByteData(Val); - for (size_t i = 0; i < NumCases; i++) - TryShort &= IsTwoByteData(Cases[i]); - - if (Options.Verbosity >= 3) - Printf("TraceSwitch: %p %zd # %zd; TryShort %d\n", PC, Val, NumCases, - TryShort); - - for (size_t i = 0; i < NumCases; i++) { - TryToAddDesiredData(Val, Cases[i], ValSize); - if (TryShort) - TryToAddDesiredData(Val, Cases[i], 2); - } -} - static TraceState *TS; void Fuzzer::StartTraceRecording() { diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp index 8b484b8effa4..e8d48dc81a3b 100644 --- a/lib/Fuzzer/FuzzerUtilPosix.cpp +++ b/lib/Fuzzer/FuzzerUtilPosix.cpp @@ -41,6 +41,10 @@ static void InterruptHandler(int, siginfo_t *, void *) { Fuzzer::StaticInterruptCallback(); } +static void FileSizeExceedHandler(int, siginfo_t *, void *) { + Fuzzer::StaticFileSizeExceedCallback(); +} + static void SetSigaction(int signum, void (*callback)(int, siginfo_t *, void *)) { struct sigaction sigact; @@ -80,6 +84,8 @@ void SetSignalHandler(const FuzzingOptions& Options) { SetSigaction(SIGILL, CrashHandler); if (Options.HandleFpe) 
SetSigaction(SIGFPE, CrashHandler); + if (Options.HandleXfsz) + SetSigaction(SIGXFSZ, FileSizeExceedHandler); } void SleepSeconds(int Seconds) { diff --git a/lib/Fuzzer/FuzzerUtilWindows.cpp b/lib/Fuzzer/FuzzerUtilWindows.cpp index 64adb7cd1380..3ca1f2c8f562 100644 --- a/lib/Fuzzer/FuzzerUtilWindows.cpp +++ b/lib/Fuzzer/FuzzerUtilWindows.cpp @@ -58,6 +58,7 @@ LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) { if (HandlerOpt->HandleFpe) Fuzzer::StaticCrashSignalCallback(); break; + // TODO: handle (Options.HandleXfsz) } return EXCEPTION_CONTINUE_SEARCH; } diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test index 1f1810eb0195..5c7d30e41caa 100644 --- a/lib/Fuzzer/test/merge.test +++ b/lib/Fuzzer/test/merge.test @@ -44,3 +44,11 @@ MERGE_WITH_CRASH: MERGE-OUTER: 3 new files # Check that we actually limit the size with max_len RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 -max_len=5 2>&1 | FileCheck %s --check-prefix=MERGE_LEN5 MERGE_LEN5: MERGE-OUTER: succesfull in 1 attempt(s) + +# Check that we honor TMPDIR +RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR +TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp + +# Check that we can report an error if file size exceeded +RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) +SIGXFSZ: ERROR: libFuzzer: file size exceeded diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 42b3a344352b..e3e2f9f806c8 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -337,12 +337,21 @@ void LTO::addSymbolToGlobalRes(SmallPtrSet<GlobalValue *, 8> &Used, if (Res.Prevailing) GlobalRes.IRName = GV->getName(); } + // Set the partition to external if we know it is used elsewhere, e.g. + // it is visible to a regular object, is referenced from llvm.compiler_used, + // or was already recorded as being referenced from a different partition. if (Res.VisibleToRegularObj || (GV && Used.count(GV)) || (GlobalRes.Partition != GlobalResolution::Unknown && - GlobalRes.Partition != Partition)) + GlobalRes.Partition != Partition)) { GlobalRes.Partition = GlobalResolution::External; - else + } else + // First recorded reference, save the current partition. GlobalRes.Partition = Partition; + + // Flag as visible outside of ThinLTO if visible from a regular object or + // if this is a reference in the regular LTO partition. + GlobalRes.VisibleOutsideThinLTO |= + (Res.VisibleToRegularObj || (Partition == GlobalResolution::RegularLTO)); } static void writeToResolutionFile(raw_ostream &OS, InputFile *Input, @@ -848,6 +857,19 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (!ModuleToDefinedGVSummaries.count(Mod.first)) ModuleToDefinedGVSummaries.try_emplace(Mod.first); + // Compute "dead" symbols, we don't want to import/export these! + DenseSet<GlobalValue::GUID> GUIDPreservedSymbols; + for (auto &Res : GlobalResolutions) { + if (Res.second.VisibleOutsideThinLTO && + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to preserve any ThinLTO copies. 
+ !Res.second.IRName.empty()) + GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName)); + } + + auto DeadSymbols = + computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols); + StringMap<FunctionImporter::ImportMapTy> ImportLists( ThinLTO.ModuleMap.size()); StringMap<FunctionImporter::ExportSetTy> ExportLists( @@ -856,12 +878,21 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (Conf.OptLevel > 0) { ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - ImportLists, ExportLists); + ImportLists, ExportLists, &DeadSymbols); std::set<GlobalValue::GUID> ExportedGUIDs; for (auto &Res : GlobalResolutions) { - if (!Res.second.IRName.empty() && - Res.second.Partition == GlobalResolution::External) + // First check if the symbol was flagged as having external references. + if (Res.second.Partition != GlobalResolution::External) + continue; + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to mark as exported from a ThinLTO + // partition (and we can't get the GUID). + if (Res.second.IRName.empty()) + continue; + auto GUID = GlobalValue::getGUID(Res.second.IRName); + // Mark exported unless index-based analysis determined it to be dead. + if (!DeadSymbols.count(GUID)) ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName)); } diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 880dc3dfae98..66ffe6db29d6 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -581,11 +581,18 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries; Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); // Resolve LinkOnce/Weak symbols. StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR; @@ -594,10 +601,6 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, thinLTOResolveWeakForLinkerModule( TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]); - // Convert the preserved symbols set from string to GUID - auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); - // Promote the exported values in the index, so that they are promoted // in the module. auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { @@ -623,11 +626,18 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! 
+ auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ImportList = ImportLists[TheModule.getModuleIdentifier()]; crossImportIntoModule(TheModule, Index, ModuleMap, ImportList); @@ -697,11 +707,14 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ExportList = ExportLists[ModuleIdentifier]; // Be friendly and don't nuke totally the module when the client didn't @@ -836,17 +849,20 @@ void ThinLTOCodeGenerator::run() { StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount); Index->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID, this is needed for + // computing the caching hash and the internalization. + auto GUIDPreservedSymbols = + computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols); + // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); - - // Convert the preserved symbols set from string to GUID, this is needed for - // computing the caching hash and the internalization. - auto GUIDPreservedSymbols = - computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + ExportLists, &DeadSymbols); // We use a std::map here to be able to have a defined ordering when // producing a hash for the cache entry. diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 0c0b498f1375..fb8b45166a41 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -205,7 +205,7 @@ APInt& APInt::operator++() { /// This function subtracts a single "digit" (64-bit word), y, from /// the multi-digit integer array, x[], propagating the borrowed 1 value until -/// no further borrowing is neeeded or it runs out of "digits" in x. The result +/// no further borrowing is needed or it runs out of "digits" in x. The result /// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. /// In other words, if y > x then this function returns 1, otherwise 0. 
/// @returns the borrow out of the subtraction diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index ca344b1dc058..15418ad2fd06 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -90,6 +90,7 @@ add_llvm_library(LLVMSupport StringSaver.cpp StringRef.cpp SystemUtils.cpp + TarWriter.cpp TargetParser.cpp ThreadPool.cpp Timer.cpp diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 49d0ed55a716..8a09589aa884 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -474,15 +474,25 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model, break; // Skylake: - case 0x4e: - *Type = INTEL_COREI7; // "skylake-avx512" - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; - break; - case 0x5e: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop *Type = INTEL_COREI7; // "skylake" *Subtype = INTEL_COREI7_SKYLAKE; break; + // Skylake Xeon: + case 0x55: + *Type = INTEL_COREI7; + // Check that we really have AVX512 + if (Features & (1 << FEATURE_AVX512)) { + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512" + } else { + *Subtype = INTEL_COREI7_SKYLAKE; // "skylake" + } + break; + case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft case 0x27: // 32 nm Atom Medfield diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp new file mode 100644 index 000000000000..5fc17d276377 --- /dev/null +++ b/lib/Support/TarWriter.cpp @@ -0,0 +1,166 @@ +//===-- TarWriter.cpp - Tar archive file creator --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TarWriter class provides a feature to create a tar archive file. +// +// I put emphasis on simplicity over comprehensiveness when implementing this +// class because we don't need a full-fledged archive file generator in LLVM +// at the moment. +// +// The filename field in the Unix V7 tar header is 100 bytes. Longer filenames +// are stored using the PAX extension. The PAX header is standardized in +// POSIX.1-2001. +// +// The struct definition of UstarHeader is copied from +// https://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5 +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/TarWriter.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +// Each file in an archive must be aligned to this block size. +static const int BlockSize = 512; + +struct UstarHeader { + char Name[100]; + char Mode[8]; + char Uid[8]; + char Gid[8]; + char Size[12]; + char Mtime[12]; + char Checksum[8]; + char TypeFlag; + char Linkname[100]; + char Magic[6]; + char Version[2]; + char Uname[32]; + char Gname[32]; + char DevMajor[8]; + char DevMinor[8]; + char Prefix[155]; + char Pad[12]; +}; +static_assert(sizeof(UstarHeader) == BlockSize, "invalid Ustar header"); + +// A PAX attribute is in the form of "<length> <key>=<value>\n" +// where <length> is the length of the entire string including +// the length field itself. An example string is this. +// +// 25 ctime=1084839148.1212\n +// +// This function create such string. 
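// Illustrative worked example (not part of the patch): formatPax("path", "foo/bar.o")
//   Len   = 4 + 9 + 3         = 16   (key + value + " ", "=", "\n")
//   Total = 16 + strlen("16") = 18   (first pass)
//   Total = 16 + strlen("18") = 18   (second pass; the value is now stable)
// and the returned string "18 path=foo/bar.o\n" is exactly 18 bytes long.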
+static std::string formatPax(StringRef Key, StringRef Val) { + int Len = Key.size() + Val.size() + 3; // +3 for " ", "=" and "\n" + + // We need to compute total size twice because appending + // a length field could change total size by one. + int Total = Len + Twine(Len).str().size(); + Total = Len + Twine(Total).str().size(); + return (Twine(Total) + " " + Key + "=" + Val + "\n").str(); +} + +// Headers in tar files must be aligned to 512 byte boundaries. +// This function forwards the current file position to the next boundary. +static void pad(raw_fd_ostream &OS) { + uint64_t Pos = OS.tell(); + OS.seek(alignTo(Pos, BlockSize)); +} + +// Computes a checksum for a tar header. +static void computeChecksum(UstarHeader &Hdr) { + // Before computing a checksum, checksum field must be + // filled with space characters. + memset(Hdr.Checksum, ' ', sizeof(Hdr.Checksum)); + + // Compute a checksum and set it to the checksum field. + unsigned Chksum = 0; + for (size_t I = 0; I < sizeof(Hdr); ++I) + Chksum += reinterpret_cast<uint8_t *>(&Hdr)[I]; + snprintf(Hdr.Checksum, sizeof(Hdr.Checksum), "%06o", Chksum); +} + +// Create a tar header and write it to a given output stream. +static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { + // A PAX header consists of a 512-byte header followed + // by key-value strings. First, create key-value strings. + std::string PaxAttr = formatPax("path", Path); + + // Create a 512-byte header. + UstarHeader Hdr = {}; + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", PaxAttr.size()); + Hdr.TypeFlag = 'x'; // PAX magic + memcpy(Hdr.Magic, "ustar", 6); // Ustar magic + computeChecksum(Hdr); + + // Write them down. + OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr)); + OS << PaxAttr; + pad(OS); +} + +// The PAX header is an extended format, so a PAX header needs +// to be followed by a "real" header. +static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { + UstarHeader Hdr = {}; + memcpy(Hdr.Name, Path.data(), Path.size()); + memcpy(Hdr.Mode, "0000664", 8); + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); + memcpy(Hdr.Magic, "ustar", 6); + computeChecksum(Hdr); + OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr)); +} + +// We want to use '/' as a path separator even on Windows. +// This function canonicalizes a given path. +static std::string canonicalize(std::string S) { +#ifdef LLVM_ON_WIN32 + std::replace(S.begin(), S.end(), '\\', '/'); +#endif + return S; +} + +// Creates a TarWriter instance and returns it. +Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath, + StringRef BaseDir) { + int FD; + if (std::error_code EC = openFileForWrite(OutputPath, FD, sys::fs::F_None)) + return make_error<StringError>("cannot open " + OutputPath, EC); + return std::unique_ptr<TarWriter>(new TarWriter(FD, BaseDir)); +} + +TarWriter::TarWriter(int FD, StringRef BaseDir) + : OS(FD, /*shouldClose=*/true, /*unbuffered=*/false), BaseDir(BaseDir) {} + +// Append a given file to an archive. +void TarWriter::append(StringRef Path, StringRef Data) { + // Write Path and Data. + std::string S = BaseDir + "/" + canonicalize(Path) + "\0"; + if (S.size() <= sizeof(UstarHeader::Name)) { + writeUstarHeader(OS, S, Data.size()); + } else { + writePaxHeader(OS, S); + writeUstarHeader(OS, "", Data.size()); + } + + OS << Data; + pad(OS); + + // POSIX requires tar archives end with two null blocks. 
+ // Here, we write the terminator and then seek back, so that + // the file being output is terminated correctly at any moment. + uint64_t Pos = OS.tell(); + OS << std::string(BlockSize * 2, '\0'); + OS.seek(Pos); + OS.flush(); +} diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index 3750d7f4c09d..9752b70644c6 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -48,7 +48,7 @@ // _Unwind_Backtrace function, but on FreeBSD the configure test passes // despite the function not existing, and on Android, <unwind.h> conflicts // with <link.h>. -#if defined(__GLIBC__) || defined(__APPLE__) +#ifdef __GLIBC__ #include <unwind.h> #else #undef HAVE__UNWIND_BACKTRACE diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 7666011f75b6..17aafa0c3d6e 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -110,72 +110,34 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-collect-loh" -static cl::opt<bool> -PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden, - cl::desc("Restrict analysis to registers invovled" - " in LOHs"), - cl::init(true)); - -static cl::opt<bool> -BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden, - cl::desc("Restrict analysis at basic block scope"), - cl::init(true)); - STATISTIC(NumADRPSimpleCandidate, "Number of simplifiable ADRP dominate by another"); -#ifndef NDEBUG -STATISTIC(NumADRPComplexCandidate2, - "Number of simplifiable ADRP reachable by 2 defs"); -STATISTIC(NumADRPComplexCandidate3, - "Number of simplifiable ADRP reachable by 3 defs"); -STATISTIC(NumADRPComplexCandidateOther, - "Number of simplifiable ADRP reachable by 4 or more defs"); -STATISTIC(NumADDToSTRWithImm, - "Number of simplifiable STR with imm reachable by ADD"); -STATISTIC(NumLDRToSTRWithImm, - "Number of simplifiable STR with imm reachable by LDR"); STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); -STATISTIC(NumADDToLDRWithImm, - "Number of simplifiable LDR with imm reachable by ADD"); -STATISTIC(NumLDRToLDRWithImm, - "Number of simplifiable LDR with imm reachable by LDR"); STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); -#endif // NDEBUG STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); -#ifndef NDEBUG -STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); -STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); -STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); -STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); -#endif // NDEBUG STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); -STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); #define AARCH64_COLLECT_LOH_NAME "AArch64 Collect 
Linker Optimization Hint (LOH)" namespace { + struct AArch64CollectLOH : public MachineFunctionPass { static char ID; - AArch64CollectLOH() : MachineFunctionPass(ID) { - initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); - } + AArch64CollectLOH() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -187,351 +149,57 @@ struct AArch64CollectLOH : public MachineFunctionPass { StringRef getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineDominatorTree>(); + AU.setPreservesAll(); } - -private: }; -/// A set of MachineInstruction. -typedef SetVector<const MachineInstr *> SetOfMachineInstr; -/// Map a basic block to a set of instructions per register. -/// This is used to represent the exposed uses of a basic block -/// per register. -typedef MapVector<const MachineBasicBlock *, - std::unique_ptr<SetOfMachineInstr[]>> -BlockToSetOfInstrsPerColor; -/// Map a basic block to an instruction per register. -/// This is used to represent the live-out definitions of a basic block -/// per register. -typedef MapVector<const MachineBasicBlock *, - std::unique_ptr<const MachineInstr *[]>> -BlockToInstrPerColor; -/// Map an instruction to a set of instructions. Used to represent the -/// mapping def to reachable uses or use to definitions. -typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs; -/// Map a basic block to a BitVector. -/// This is used to record the kill registers per basic block. -typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet; - -/// Map a register to a dense id. -typedef DenseMap<unsigned, unsigned> MapRegToId; -/// Map a dense id to a register. Used for debug purposes. -typedef SmallVector<unsigned, 32> MapIdToReg; -} // end anonymous namespace. - char AArch64CollectLOH::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) - -/// Given a couple (MBB, reg) get the corresponding set of instruction from -/// the given "sets". -/// If this couple does not reference any set, an empty set is added to "sets" -/// for this couple and returned. -/// \param nbRegs is used internally allocate some memory. It must be consistent -/// with the way sets is used. -static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, - const MachineBasicBlock &MBB, unsigned reg, - unsigned nbRegs) { - SetOfMachineInstr *result; - BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); - if (it != sets.end()) - result = it->second.get(); - else - result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get(); - - return result[reg]; -} - -/// Given a couple (reg, MI) get the corresponding set of instructions from the -/// the given "sets". -/// This is used to get the uses record in sets of a definition identified by -/// MI and reg, i.e., MI defines reg. -/// If the couple does not reference anything, an empty set is added to -/// "sets[reg]". -/// \pre set[reg] is valid. -static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - return sets[reg][&MI]; -} - -/// Same as getUses but does not modify the input map: sets. -/// \return NULL if the couple (reg, MI) is not in sets. 
-static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - InstrToInstrs::const_iterator Res = sets[reg].find(&MI); - if (Res != sets[reg].end()) - return &(Res->second); - return nullptr; -} - -/// Initialize the reaching definition algorithm: -/// For each basic block BB in MF, record: -/// - its kill set. -/// - its reachable uses (uses that are exposed to BB's predecessors). -/// - its the generated definitions. -/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to -/// the list of uses of exposed defintions. -/// \param ADRPMode specifies to only consider ADRP instructions for generated -/// definition. It also consider definitions of ADRP instructions as uses and -/// ignore other uses. The ADRPMode is used to collect the information for LHO -/// that involve ADRP operation only. -static void initReachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned NbReg = RegToId.size(); - - for (const MachineBasicBlock &MBB : MF) { - auto &BBGen = Gen[&MBB]; - BBGen = make_unique<const MachineInstr *[]>(NbReg); - std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); - - BitVector &BBKillSet = Kill[&MBB]; - BBKillSet.resize(NbReg); - for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == AArch64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (const MachineOperand &MO : MI.operands()) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || - (ADRPMode && (!IsADRP || !MO.isDef()))) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); - } - - // Process clobbers. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = MO.getRegMask(); - - // Set generated regs. - for (const auto &Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? &MI : nullptr; - BBKillSet.set(Reg); - } - } - } - - // Process register defs. 
- for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - // If this alias has not been recorded, then it is not interesting - // for the current analysis. - // We can end up in this situation because of tuple registers. - // E.g., Let say we are interested in S1. When we register - // S1, we will also register its aliases and in particular - // the tuple Q1_Q2. - // Now, when we encounter Q1_Q2, we will look through its aliases - // and will find that S2 is not registered. - if (ItRegId == RegToId.end()) - continue; - - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &MI; - } - BBGen[ItCurRegId->second] = &MI; - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. - if (!ADRPMode && DummyOp && !MBB.succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (const MachineBasicBlock &MBB : MF) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI : BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, *MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[&MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[&MBB][CurReg]) - BBOutSet.insert(Gen[&MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Reaching definition algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed definition a use to -/// @p DummyOp. 
-/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. -static void reachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = nullptr) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. - // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. - BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); -} +} // end anonymous namespace. -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); +INITIALIZE_PASS(AArch64CollectLOH, "aarch64-collect-loh", + AARCH64_COLLECT_LOH_NAME, false, false) - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt.first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (const MachineInstr *MI : DefsIt.second) { - DEBUG(MI->print(dbgs())); - } - } +static bool canAddBePartOfLOH(const MachineInstr &MI) { + // Check immediate to see if the immediate is an address. + switch (MI.getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; } } -#endif // NDEBUG /// Answer the following question: Can Def be one of the definition /// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); +static bool canDefBePartOfLOH(const MachineInstr &MI) { // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { + switch (MI.getOpcode()) { default: return false; case AArch64::ADRP: return true; case AArch64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } + return canAddBePartOfLOH(MI); case AArch64::LDRXui: // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { + switch (MI.getOperand(2).getType()) { default: return false; case MachineOperand::MO_GlobalAddress: - return true; + return MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT; } } - // Unreachable. - return false; } /// Check whether the given instruction can the end of a LOH chain involving a /// store. 
-static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) { + switch (MI.getOpcode()) { default: return false; case AArch64::STRBBui: @@ -543,109 +211,19 @@ static bool isCandidateStore(const MachineInstr *Instr) { case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: + // We can only optimize the index operand. // In case we have str xA, [xA, #imm], this is two different uses // of xA and we cannot fold, otherwise the xA stored may be wrong, // even if #imm == 0. - if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) - return true; - } - return false; -} - -/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, -/// Build the Use to Defs information and filter out obvious non-LOH candidates. -/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. -/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, -/// i.e., no simple chain. -/// \param ADRPMode -- \see initReachingDef. -static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, - const InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, - bool ADRPMode = false) { - - SetOfMachineInstr NotCandidate; - unsigned NbReg = RegToId.size(); - MapRegToId::const_iterator EndIt = RegToId.end(); - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { - // If this color is never defined, continue. - if (ColorOpToReachedUses[CurReg].empty()) - continue; - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - for (const MachineInstr *MI : DefsIt.second) { - const MachineInstr *Def = DefsIt.first; - MapRegToId::const_iterator It; - // if all the reaching defs are not adrp, this use will not be - // simplifiable. - if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || - (!ADRPMode && !canDefBePartOfLOH(Def)) || - (!ADRPMode && isCandidateStore(MI) && - // store are LOH candidate iff the end of the chain is used as - // base. - ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || - It->second != CurReg))) { - NotCandidate.insert(MI); - continue; - } - // Do not consider self reaching as a simplifiable case for ADRP. - if (!ADRPMode || MI != DefsIt.first) { - UseToReachingDefs[MI].insert(DefsIt.first); - // If UsesIt has several reaching definitions, it is not - // candidate for simplificaton in non-ADRPMode. - if (!ADRPMode && UseToReachingDefs[MI].size() > 1) - NotCandidate.insert(MI); - } - } - } - } - for (const MachineInstr *Elem : NotCandidate) { - DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); - // It would have been better if we could just remove the entry - // from the map. Because of that, we have to filter the garbage - // (second.empty) in the subsequence analysis. - UseToReachingDefs[Elem].clear(); - } -} - -/// Based on the use to defs information (in ADRPMode), compute the -/// opportunities of LOH ADRP-related. 
-static void computeADRP(const InstrToInstrs &UseToDefs, - AArch64FunctionInfo &AArch64FI, - const MachineDominatorTree *MDT) { - DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); - for (const auto &Entry : UseToDefs) { - unsigned Size = Entry.second.size(); - if (Size == 0) - continue; - if (Size == 1) { - const MachineInstr *L2 = *Entry.second.begin(); - const MachineInstr *L1 = Entry.first; - if (!MDT->dominates(L2, L1)) { - DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 - << '\n'); - continue; - } - DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); - ++NumADRPSimpleCandidate; - } -#ifndef NDEBUG - else if (Size == 2) - ++NumADRPComplexCandidate2; - else if (Size == 3) - ++NumADRPComplexCandidate3; - else - ++NumADRPComplexCandidateOther; -#endif - // if Size < 1, the use should have been removed from the candidates - assert(Size >= 1 && "No reaching defs for that use!"); + return MI.getOperandNo(&MO) == 1 && + MI.getOperand(0).getReg() != MI.getOperand(1).getReg(); } } /// Check whether the given instruction can be the end of a LOH chain /// involving a load. -static bool isCandidateLoad(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateLoad(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSBWui: @@ -660,17 +238,13 @@ static bool isCandidateLoad(const MachineInstr *Instr) { case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) - return false; - return true; + return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT); } - // Unreachable. - return false; } /// Check whether the given instruction can load a literal. -static bool supportLoadFromLiteral(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool supportLoadFromLiteral(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSWui: @@ -681,353 +255,233 @@ static bool supportLoadFromLiteral(const MachineInstr *Instr) { case AArch64::LDRQui: return true; } - // Unreachable. - return false; } -/// Check whether the given instruction is a LOH candidate. -/// \param UseToDefs is used to check that Instr is at the end of LOH supported -/// chain. -/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are -/// already been filtered out. -static bool isCandidate(const MachineInstr *Instr, - const InstrToInstrs &UseToDefs, - const MachineDominatorTree *MDT) { - if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) - return false; +/// Number of GPR registers tracked by mapRegToGPRIndex() +static const unsigned N_GPR_REGS = 31; +/// Map register number to index from 0-30. +static int mapRegToGPRIndex(MCPhysReg Reg) { + static_assert(AArch64::X28 - AArch64::X0 + 3 == N_GPR_REGS, "Number of GPRs"); + static_assert(AArch64::W30 - AArch64::W0 + 1 == N_GPR_REGS, "Number of GPRs"); + if (AArch64::X0 <= Reg && Reg <= AArch64::X28) + return Reg - AArch64::X0; + if (AArch64::W0 <= Reg && Reg <= AArch64::W30) + return Reg - AArch64::W0; + // TableGen gives "FP" and "LR" an index not adjacent to X28 so we have to + // handle them as special cases.
+ if (Reg == AArch64::FP) + return 29; + if (Reg == AArch64::LR) + return 30; + return -1; +} - const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != AArch64::ADRP) { - // At this point, Def is ADDXri or LDRXui of the right type of - // symbol, because we filtered out the uses that were not defined - // by these kind of instructions (+ ADRP). +/// State tracked per register. +/// The main algorithm walks backwards over a basic block maintaining this +/// datastructure for each tracked general purpose register. +struct LOHInfo { + MCLOHType Type : 8; ///< "Best" type of LOH possible. + bool IsCandidate : 1; ///< Possible LOH candidate. + bool OneUser : 1; ///< Found exactly one user (yet). + bool MultiUsers : 1; ///< Found multiple users. + const MachineInstr *MI0; ///< First instruction involved in the LOH. + const MachineInstr *MI1; ///< Second instruction involved in the LOH + /// (if any). + const MachineInstr *LastADRP; ///< Last ADRP in same register. +}; - // Check if this forms a simple chain: each intermediate node must - // dominates the next one. - if (!MDT->dominates(Def, Instr)) - return false; - // Move one node up in the simple chain. - if (UseToDefs.find(Def) == - UseToDefs.end() - // The map may contain garbage we have to ignore. - || - UseToDefs.find(Def)->second.empty()) - return false; - Instr = Def; - Def = *UseToDefs.find(Def)->second.begin(); +/// Update state \p Info given \p MI uses the tracked register. +static void handleUse(const MachineInstr &MI, const MachineOperand &MO, + LOHInfo &Info) { + // We have multiple uses if we already found one before. + if (Info.MultiUsers || Info.OneUser) { + Info.IsCandidate = false; + Info.MultiUsers = true; + return; } - // Check if we reached the top of the simple chain: - // - top is ADRP. - // - check the simple chain property: each intermediate node must - // dominates the next one. - if (Def->getOpcode() == AArch64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr &Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use.getOpcode() != AArch64::ADDXri && - (Use.getOpcode() != AArch64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(&Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != AArch64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def.getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; + Info.OneUser = true; + + // Start new LOHInfo if applicable. + if (isCandidateLoad(MI)) { + Info.Type = MCLOH_AdrpLdr; + Info.IsCandidate = true; + Info.MI0 = &MI; + // Note that even this is AdrpLdr now, we can switch to a Ldr variant + // later. 
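  // For illustration (register and symbol names made up): since the block is
  // walked backwards, a chain is first seen at its final use, e.g.
  //   adrp x8, _g@PAGE
  //   ldr  x0, [x8, _g@PAGEOFF]      ; load use         -> MCLOH_AdrpLdr
  // or, for a GOT access,
  //   adrp x8, _g@GOTPAGE
  //   ldr  x8, [x8, _g@GOTPAGEOFF]   ; LDRXui + MO_GOT  -> MCLOH_AdrpLdrGot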
+ } else if (isCandidateStore(MI, MO)) { + Info.Type = MCLOH_AdrpAddStr; + Info.IsCandidate = true; + Info.MI0 = &MI; + Info.MI1 = nullptr; + } else if (MI.getOpcode() == AArch64::ADDXri) { + Info.Type = MCLOH_AdrpAdd; + Info.IsCandidate = true; + Info.MI0 = &MI; + } else if (MI.getOpcode() == AArch64::LDRXui && + MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { + Info.Type = MCLOH_AdrpLdrGot; + Info.IsCandidate = true; + Info.MI0 = &MI; } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - - AArch64FI.addLOHDirective( - Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, - {&Def, &Use}); - return true; } -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = nullptr; -#ifndef NDEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // NDEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. - // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. +/// Update state \p Info given the tracked register is clobbered. +static void handleClobber(LOHInfo &Info) { + Info.IsCandidate = false; + Info.OneUser = false; + Info.MultiUsers = false; + Info.LastADRP = nullptr; +} - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (auto &Use : UseToDefs) { - // If no definition is available, this is a non candidate. - if (Use.second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(Use.first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(Use.first); - continue; +/// Update state \p Info given that \p MI is possibly the middle instruction +/// of an LOH involving 3 instructions. +static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, + LOHInfo &OpInfo) { + if (!DefInfo.IsCandidate || (&DefInfo != &OpInfo && OpInfo.OneUser)) + return false; + // Copy LOHInfo for dest register to LOHInfo for source register. + if (&DefInfo != &OpInfo) { + OpInfo = DefInfo; + // Invalidate \p DefInfo because we track it in \p OpInfo now. + handleClobber(DefInfo); + } else + DefInfo.LastADRP = nullptr; + + // Advance state machine. 
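  // For example (names made up): with the bottom-up walk, the ADDXri below is
  // the middle instruction that promotes an AdrpLdr candidate recorded by
  // handleUse() into AdrpAddLdr; the leading ADRP is handled afterwards by
  // handleADRP():
  //   adrp x8, _g@PAGE
  //   add  x9, x8, _g@PAGEOFF
  //   ldr  x0, [x9, #8]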
+ assert(OpInfo.IsCandidate && "Expect valid state"); + if (MI.getOpcode() == AArch64::ADDXri && canAddBePartOfLOH(MI)) { + if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpAddLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpAddStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } - PotentialCandidates.insert(Use.first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. - // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user -#ifndef NDEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (const MachineInstr *Candidate : PotentialCandidates) { - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = nullptr; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != AArch64::ADRP) { - // Check the number of users of this node. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - bool IsLevel2 = true; - for (const MachineInstr *MI : *Users) { - if (!PotentialCandidates.count(MI)) { - ++NumTooCplxLvl2; - IsLevel2 = false; - break; - } - } - if (IsLevel2) - ++NumCplxLvl2; -#endif // NDEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. 
- - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use : *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // NDEBUG - continue; + } else { + assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && + "Expected GOT relocation"); + if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpLdrGotStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpLdrGotLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } + } + return false; +} - bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && - !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) - continue; - SmallVector<const MachineInstr *, 3> Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (!L2) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. - if (!supportLoadFromLiteral(Candidate)) - continue; +/// Update state when seeing and ADRP instruction. +static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, + LOHInfo &Info) { + if (Info.LastADRP != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t' + << *Info.LastADRP); + AFI.addLOHDirective(MCLOH_AdrpAdrp, {&MI, Info.LastADRP}); + ++NumADRPSimpleCandidate; + } - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); + // Produce LOH directive if possible. + if (Info.IsCandidate) { + switch (Info.Type) { + case MCLOH_AdrpAdd: + DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0}); + ++NumADRSimpleCandidate; + break; + case MCLOH_AdrpLdr: + if (supportLoadFromLiteral(*Info.MI0)) { + DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdr, {&MI, Info.MI0}); ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? 
MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // NDEBUG } - } else { - if (ImmediateDefOpc == AArch64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG + break; + case MCLOH_AdrpAddLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToLDR; + break; + case MCLOH_AdrpAddStr: + if (Info.MI1 != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToSTR; } + break; + case MCLOH_AdrpLdrGotLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotLdr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToLDR; + break; + case MCLOH_AdrpLdrGotStr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotStr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToSTR; + break; + case MCLOH_AdrpLdrGot: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGot, {&MI, Info.MI0}); + break; + case MCLOH_AdrpAdrp: + llvm_unreachable("MCLOH_AdrpAdrp not used in state machine"); } - AArch64FI.addLOHDirective(Kind, Args); } - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, - InvolvedInLOHs, RegToId); + handleClobber(Info); + Info.LastADRP = &MI; } -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. 
-static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } +static void handleRegMaskClobber(const uint32_t *RegMask, MCPhysReg Reg, + LOHInfo *LOHInfos) { + if (!MachineOperand::clobbersPhysReg(RegMask, Reg)) return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (const auto &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI) && - !isCandidateLoad(&MI) && !isCandidateStore(&MI)) - continue; + int Idx = mapRegToGPRIndex(Reg); + if (Idx >= 0) + handleClobber(LOHInfos[Idx]); +} - // Process defs - for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), - IOEnd = MI.operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } +static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { + // Handle defs and regmasks. + for (const MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) { + const uint32_t *RegMask = MO.getRegMask(); + for (MCPhysReg Reg : AArch64::GPR32RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + for (MCPhysReg Reg : AArch64::GPR64RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + continue; } + if (!MO.isReg() || !MO.isDef()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleClobber(LOHInfos[Idx]); + } + // Handle uses. + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleUse(MI, MO, LOHInfos[Idx]); } } @@ -1035,74 +489,59 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>(); - assert(AArch64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n" + << "Looking in function " << MF.getName() << '\n'); - collectInvolvedReg(MF, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; + LOHInfo LOHInfos[N_GPR_REGS]; + AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + for (const MachineBasicBlock &MBB : MF) { + // Reset register tracking state. + memset(LOHInfos, 0, sizeof(LOHInfos)); + // Live-out registers are used. 
+ for (const MachineBasicBlock *Succ : MBB.successors()) { + for (const auto &LI : Succ->liveins()) { + int RegIdx = mapRegToGPRIndex(LI.PhysReg); + if (RegIdx >= 0) + LOHInfos[RegIdx].OneUser = true; + } + } - MachineInstr *DummyOp = nullptr; - if (BasicBlockScopeOnly) { - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + // Walk the basic block backwards and update the per register state machine + // in the process. + for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AArch64::ADDXri: + case AArch64::LDRXui: + if (canDefBePartOfLOH(MI)) { + const MachineOperand &Def = MI.getOperand(0); + const MachineOperand &Op = MI.getOperand(1); + assert(Def.isReg() && Def.isDef() && "Expected reg def"); + assert(Op.isReg() && Op.isUse() && "Expected reg use"); + int DefIdx = mapRegToGPRIndex(Def.getReg()); + int OpIdx = mapRegToGPRIndex(Op.getReg()); + if (DefIdx >= 0 && OpIdx >= 0 && + handleMiddleInst(MI, LOHInfos[DefIdx], LOHInfos[OpIdx])) + continue; + } + break; + case AArch64::ADRP: + const MachineOperand &Op0 = MI.getOperand(0); + int Idx = mapRegToGPRIndex(Op0.getReg()); + if (Idx >= 0) { + handleADRP(MI, AFI, LOHInfos[Idx]); + continue; + } + break; + } + handleNormalInst(MI, LOHInfos); + } } - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. - reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. - reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - MF.DeleteMachineInstr(DummyOp); - - return Modified; + // Return "no change": The pass only collects information. + return false; } -/// createAArch64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. 
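For reference, the hints collected through addLOHDirective() above surface as .loh directives in the Mach-O assembly output, roughly like the following hand-written sketch (labels and symbol invented):

  Lloh0: adrp x8, _g@PAGE
  Lloh1: add  x9, x8, _g@PAGEOFF
  Lloh2: ldr  x0, [x9]
         .loh AdrpAddLdr Lloh0, Lloh1, Lloh2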
FunctionPass *llvm::createAArch64CollectLOHPass() { return new AArch64CollectLOH(); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4c98253878e4..74a01835171b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,28 +11,79 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" -#include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include <algorithm> +#include <bitset> +#include <cassert> +#include <cctype> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <limits> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "aarch64-lower" @@ -59,7 +110,6 @@ static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. 
setBooleanContents(ZeroOrOneBooleanContent); @@ -218,7 +268,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -3632,6 +3681,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, llvm_unreachable("Unexpected platform trying to use TLS"); } + SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); @@ -4549,7 +4599,6 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } - /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, @@ -5074,10 +5123,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int WindowBase; int WindowScale; - bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) - : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), - WindowScale(1) {} + : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), + ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR @@ -7028,7 +7078,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::aarch64_ldaxp: - case Intrinsic::aarch64_ldxp: { + case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); @@ -7038,9 +7088,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = true; Info.writeMem = false; return true; - } case Intrinsic::aarch64_stlxp: - case Intrinsic::aarch64_stxp: { + case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); @@ -7050,7 +7099,6 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = false; Info.writeMem = true; return true; - } default: break; } @@ -8044,13 +8092,13 @@ static SDValue tryCombineToEXTR(SDNode *N, SDValue LHS; uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; + bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; + bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); @@ -9732,52 +9780,51 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width, switch(CC) { case AArch64CC::LE: - case AArch64CC::GT: { + case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; - } break; + break; case AArch64CC::LT: - case AArch64CC::GE: { + case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::HI: - case AArch64CC::LS: { + case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant 
>= -1 && CompConstant < AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::PL: - case AArch64CC::MI: { + case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::LO: - case AArch64CC::HS: { + case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::EQ: - case AArch64CC::NE: { + case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) - return true; - } break; + break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: @@ -10501,7 +10548,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -10517,7 +10564,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), @@ -10527,8 +10574,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 626c934f236e..5c8acba26aab 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -14,16 +14,37 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include 
"llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" -#include <algorithm> +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> using namespace llvm; @@ -529,19 +550,19 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, default: llvm_unreachable("Unknown branch opcode in Cond"); case AArch64::CBZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::EQ; break; case AArch64::CBZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::EQ; break; case AArch64::CBNZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::NE; break; case AArch64::CBNZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::NE; break; } @@ -1044,7 +1065,7 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - return Instr.getOpcode();; + return Instr.getOpcode(); case AArch64::ADDWrr: return AArch64::ADDSWrr; case AArch64::ADDWri: return AArch64::ADDSWri; @@ -1072,12 +1093,15 @@ static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { } namespace { + struct UsedNZCV { - bool N; - bool Z; - bool C; - bool V; - UsedNZCV(): N(false), Z(false), C(false), V(false) {} + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = default; + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { this->N |= UsedFlags.N; this->Z |= UsedFlags.Z; @@ -1086,6 +1110,7 @@ struct UsedNZCV { return *this; } }; + } // end anonymous namespace /// Find a condition code used by the instruction. @@ -1561,7 +1586,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { /// Check all MachineMemOperands for a hint to suppress pairing. bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { - return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { return MMO->getFlags() & MOSuppressPair; }); } @@ -1994,7 +2019,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, void AArch64InstrInfo::copyPhysRegTuple( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, - llvm::ArrayRef<unsigned> Indices) const { + ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -2583,7 +2608,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // <rdar://problem/11522048> // - if (MI.isCopy()) { + if (MI.isFullCopy()) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && @@ -2598,7 +2623,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } } - // Handle the case where a copy is being spilled or refilled but the source + // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. 
For example: // // %vreg0<def> = COPY %XZR; GPR64common:%vreg0 @@ -2613,7 +2638,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // %vreg0<def> = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1 // - // will be refilled as + // will be filled as // // LDRDui %vreg0, fi<#0> // @@ -2622,9 +2647,11 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRXui %vregTemp, fi<#0> // %vreg0 = FMOV %vregTemp // - if (MI.isFullCopy() && Ops.size() == 1 && + if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. (Ops[0] == 0 || Ops[0] == 1)) { + bool IsSpill = Ops[0] == 0; + bool IsFill = !IsSpill; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); @@ -2632,21 +2659,112 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( const MachineOperand &SrcMO = MI.getOperand(1); unsigned DstReg = DstMO.getReg(); unsigned SrcReg = SrcMO.getReg(); + // This is slightly expensive to compute for physical regs since + // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : TRI.getMinimalPhysRegClass(Reg); }; - const TargetRegisterClass &DstRC = *getRegClass(DstReg); - const TargetRegisterClass &SrcRC = *getRegClass(SrcReg); - if (DstRC.getSize() == SrcRC.getSize()) { - if (Ops[0] == 0) + + if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { + assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + "Mismatched register size in non subreg COPY"); + if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - &SrcRC, &TRI); + getRegClass(SrcReg), &TRI); else - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, + getRegClass(DstReg), &TRI); return &*--InsertPt; } + + // Handle cases like spilling def of: + // + // %vreg0:sub_32<def,read-undef> = COPY %WZR; GPR64common:%vreg0 + // + // where the physical register source can be widened and stored to the full + // virtual reg destination stack slot, in this case producing: + // + // STRXui %XZR, <fi#0> + // + if (IsSpill && DstMO.isUndef() && + TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + assert(SrcMO.getSubReg() == 0 && + "Unexpected subreg on physical register"); + const TargetRegisterClass *SpillRC; + unsigned SpillSubreg; + switch (DstMO.getSubReg()) { + default: + SpillRC = nullptr; + break; + case AArch64::sub_32: + case AArch64::ssub: + if (AArch64::GPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::GPR64RegClass; + SpillSubreg = AArch64::sub_32; + } else if (AArch64::FPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR64RegClass; + SpillSubreg = AArch64::ssub; + } else + SpillRC = nullptr; + break; + case AArch64::dsub: + if (AArch64::FPR64RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR128RegClass; + SpillSubreg = AArch64::dsub; + } else + SpillRC = nullptr; + break; + } + + if (SpillRC) + if (unsigned WidenedSrcReg = + TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { + storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), + FrameIndex, SpillRC, &TRI); + return &*--InsertPt; + } + } + + // Handle cases like filling use of: + // + // %vreg0:sub_32<def,read-undef> = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1 + // + // where we can load the full virtual reg source stack slot, into the subreg + // destination, in this case producing: + // + 
// LDRWui %vreg0:sub_32<def,read-undef>, <fi#0> + // + if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { + const TargetRegisterClass *FillRC; + switch (DstMO.getSubReg()) { + default: + FillRC = nullptr; + break; + case AArch64::sub_32: + FillRC = &AArch64::GPR32RegClass; + break; + case AArch64::ssub: + FillRC = &AArch64::FPR32RegClass; + break; + case AArch64::dsub: + FillRC = &AArch64::FPR64RegClass; + break; + } + + if (FillRC) { + assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + "Mismatched regclass size on folded subreg COPY"); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); + MachineInstr &LoadMI = *--InsertPt; + MachineOperand &LoadDst = LoadMI.getOperand(0); + assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); + LoadDst.setSubReg(DstMO.getSubReg()); + LoadDst.setIsUndef(); + return &LoadMI; + } + } } // Cannot fold. @@ -2936,7 +3054,7 @@ bool AArch64InstrInfo::useMachineCombiner() const { return true; } -// + // True when Opc sets flag static bool isCombineInstrSettingFlag(unsigned Opc) { switch (Opc) { @@ -2955,7 +3073,7 @@ static bool isCombineInstrSettingFlag(unsigned Opc) { } return false; } -// + // 32b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate32(unsigned Opc) { switch (Opc) { @@ -2974,7 +3092,7 @@ static bool isCombineInstrCandidate32(unsigned Opc) { } return false; } -// + // 64b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate64(unsigned Opc) { switch (Opc) { @@ -2993,7 +3111,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) { } return false; } -// + // FP Opcodes that can be combined with a FMUL static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { @@ -3009,13 +3127,13 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: - TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; - return (Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast); + TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; + return (Options.UnsafeFPMath || + Options.AllowFPOpFusion == FPOpFusion::Fast); } return false; } -// + // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); @@ -3205,7 +3323,7 @@ static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) { if (!isCombineInstrCandidateFP(Root)) - return 0; + return false; MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -3971,8 +4089,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); - - return; } /// \brief Replace csincr-branch sequence by simple conditional branch @@ -4148,6 +4264,7 @@ AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef<std::pair<unsigned, const char *>> AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, @@ -4162,6 +4279,7 @@ AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { ArrayRef<std::pair<unsigned, const char *>> AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { using namespace 
AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 90b2c0896872..5037866925d3 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -162,6 +162,10 @@ public: int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + // This tells target independent code that it is okay to pass instructions + // with subreg operands to foldMemoryOperandImpl. + bool isSubregFoldable() const override { return true; } + using TargetInstrInfo::foldMemoryOperandImpl; MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 20de07424c53..b51473524c72 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -1071,8 +1071,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - (CmpInst::Predicate)I.getOperand(1).getPredicate()); + // CSINC increments the result by one when the condition code is false. + // Therefore, we have to invert the predicate to get an increment by 1 when + // the predicate is true. + const AArch64CC::CondCode invCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate( + (CmpInst::Predicate)I.getOperand(1).getPredicate())); MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) .addDef(ZReg) @@ -1084,7 +1088,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { .addDef(I.getOperand(0).getReg()) .addUse(AArch64::WZR) .addUse(AArch64::WZR) - .addImm(CC); + .addImm(invCC); constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); diff --git a/lib/Target/AArch64/AArch64InstructionSelector.h b/lib/Target/AArch64/AArch64InstructionSelector.h index 0d44e696ac20..2c6e5a912fb7 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.h +++ b/lib/Target/AArch64/AArch64InstructionSelector.h @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" namespace llvm { + class AArch64InstrInfo; class AArch64RegisterBankInfo; class AArch64RegisterInfo; @@ -29,7 +30,7 @@ public: const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - virtual bool select(MachineInstr &I) const override; + bool select(MachineInstr &I) const override; private: /// tblgen-erated 'select' implementation, used as the initial selector for @@ -43,5 +44,6 @@ private: const AArch64RegisterBankInfo &RBI; }; -} // End llvm namespace. 
-#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index ca2860afe13d..f0bffe544158 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -14,17 +14,18 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include <cassert> namespace llvm { /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { - /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, /// all usable during a tail call. @@ -34,16 +35,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// space to a function with 16-bytes then misalignment of this value would /// make a stack adjustment necessary, which could not be undone by the /// callee. - unsigned BytesInStackArgArea; + unsigned BytesInStackArgArea = 0; /// The number of bytes to restore to deallocate space for incoming /// arguments. Canonically 0 in the C calling convention, but non-zero when /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; + unsigned ArgumentStackToRestore = 0; /// HasStackFrame - True if this function has a stack frame. Set by /// determineCalleeSaves(). - bool HasStackFrame; + bool HasStackFrame = false; /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; @@ -53,54 +54,44 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; + unsigned NumLocalDynamicTLSAccesses = 0; /// \brief FrameIndex for start of varargs area for arguments passed on the /// stack. - int VarArgsStackIndex; + int VarArgsStackIndex = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// general purpose registers. - int VarArgsGPRIndex; + int VarArgsGPRIndex = 0; /// \brief Size of the varargs area for arguments passed in general purpose /// registers. - unsigned VarArgsGPRSize; + unsigned VarArgsGPRSize = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// floating-point registers. - int VarArgsFPRIndex; + int VarArgsFPRIndex = 0; /// \brief Size of the varargs area for arguments passed in floating-point /// registers. - unsigned VarArgsFPRSize; + unsigned VarArgsFPRSize = 0; /// True if this function has a subset of CSRs that is handled explicitly via /// copies. - bool IsSplitCSR; + bool IsSplitCSR = false; /// True when the stack gets realigned dynamically because the size of stack /// frame is unknown at compile time. e.g., in case of VLAs. - bool StackRealigned; + bool StackRealigned = false; /// True when the callee-save stack area has unused gaps that may be used for /// other stack allocations. 
- bool CalleeSaveStackHasFreeSpace; + bool CalleeSaveStackHasFreeSpace = false; public: - AArch64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) {} - - explicit AArch64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) { + AArch64FunctionInfo() = default; + + explicit AArch64FunctionInfo(MachineFunction &MF) { (void)MF; } @@ -193,6 +184,7 @@ private: MILOHContainer LOHContainerSet; SetOfInstructions LOHRelated; }; -} // End llvm namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f58bbbd26132..03e01329e036 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,6 +71,7 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; break; case Kryo: MaxInterleaveFactor = 4; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index e4ef0d4bb8db..d2883941e2c4 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -15,24 +15,35 @@ #include "AArch64InstructionSelector.h" #include "AArch64LegalizerInfo.h" #include "AArch64RegisterBankInfo.h" +#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/InitializePasses.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" +#include <memory> +#include <string> + using namespace llvm; static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp", @@ -154,9 +165,9 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return make_unique<AArch64_MachoTargetObjectFile>(); + return llvm::make_unique<AArch64_MachoTargetObjectFile>(); - return 
make_unique<AArch64_ELFTargetObjectFile>(); + return llvm::make_unique<AArch64_ELFTargetObjectFile>(); } // Helper function to build a DataLayout string @@ -202,29 +213,35 @@ AArch64TargetMachine::AArch64TargetMachine( initAsmInfo(); } -AArch64TargetMachine::~AArch64TargetMachine() {} +AArch64TargetMachine::~AArch64TargetMachine() = default; #ifdef LLVM_BUILD_GLOBAL_ISEL namespace { + struct AArch64GISelActualAccessor : public GISelAccessor { std::unique_ptr<CallLowering> CallLoweringInfo; std::unique_ptr<InstructionSelector> InstSelector; std::unique_ptr<LegalizerInfo> Legalizer; std::unique_ptr<RegisterBankInfo> RegBankInfo; + const CallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } + const LegalizerInfo *getLegalizerInfo() const override { return Legalizer.get(); } + const RegisterBankInfo *getRegBankInfo() const override { return RegBankInfo.get(); } }; -} // End anonymous namespace. + +} // end anonymous namespace #endif const AArch64Subtarget * @@ -287,6 +304,7 @@ AArch64beTargetMachine::AArch64beTargetMachine( : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { + /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: @@ -324,7 +342,8 @@ public: void addPreSched2() override; void addPreEmitPass() override; }; -} // namespace + +} // end anonymous namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { @@ -414,14 +433,17 @@ bool AArch64PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; } + bool AArch64PassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } + bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; } + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 88c98865bbc6..1a17691fc584 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -417,14 +417,17 @@ int AArch64TTIImpl::getArithmeticInstrCost( } } -int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. 
unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 24642cb1698e..849fd3d9b44a 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -104,7 +104,7 @@ public: TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - int getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index db84afacf30e..b86a283b40d4 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -9,45 +9,62 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cctype> +#include <cstdint> #include <cstdio> +#include <string> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; namespace { -class AArch64Operand; - class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive. 
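The getAddressComputationCost change in the AArch64TargetTransformInfo hunk above (and the identical ARM change later in this diff) replaces the coarse IsComplex flag with a SCEV-based query: the ten-instruction penalty is only charged for vector accesses whose stride is not a known small constant. A rough scalar model of that decision, assuming a predicate like isConstantStridedAccessLessThan is available (the names and the simplified stride argument are illustrative, not the actual TTI interface):

#include <cstdint>
#include <optional>

// Stand-in for BaseT::isConstantStridedAccessLessThan(SE, Ptr, Limit):
// true when the access stride is a known constant smaller than Limit bytes.
static bool isConstantStridedAccessLessThan(std::optional<int64_t> StrideBytes,
                                            int64_t Limit) {
  return StrideBytes && *StrideBytes < Limit;
}

static int getAddressComputationCost(bool IsVectorTy,
                                     std::optional<int64_t> StrideBytes) {
  const unsigned NumVectorInstToHideOverhead = 10;
  const int MaxMergeDistance = 64;

  // Only non-consecutive vector accesses (unknown or large stride) pay the
  // extra micro-op cost; small constant strides fold into the index mode.
  if (IsVectorTy &&
      !isConstantStridedAccessLessThan(StrideBytes, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  return 1;
}

int main() {
  // Known 4-byte stride: cheap.  Unknown stride: pay the penalty.
  return (getAddressComputationCost(true, 4) == 1 &&
          getAddressComputationCost(true, std::nullopt) == 10) ? 0 : 1;
}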
- StringMap<std::pair<bool, unsigned> > RegisterReqs; + StringMap<std::pair<bool, unsigned>> RegisterReqs; AArch64TargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); @@ -118,6 +135,7 @@ public: #include "AArch64GenAsmMatcher.inc" }; bool IsILP32; + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI) { @@ -143,9 +161,6 @@ public: MCSymbolRefExpr::VariantKind &DarwinRefKind, int64_t &Addend); }; -} // end anonymous namespace - -namespace { /// AArch64Operand - Instances of this class represent a parsed AArch64 machine /// instruction. @@ -531,6 +546,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 2); } + bool isImm0_7() const { if (!isImm()) return false; @@ -540,6 +556,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 8); } + bool isImm1_8() const { if (!isImm()) return false; @@ -549,6 +566,7 @@ public: int64_t Val = MCE->getValue(); return (Val > 0 && Val < 9); } + bool isImm0_15() const { if (!isImm()) return false; @@ -558,6 +576,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 16); } + bool isImm1_16() const { if (!isImm()) return false; @@ -567,6 +586,7 @@ public: int64_t Val = MCE->getValue(); return (Val > 0 && Val < 17); } + bool isImm0_31() const { if (!isImm()) return false; @@ -576,6 +596,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 32); } + bool isImm1_31() const { if (!isImm()) return false; @@ -585,6 +606,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 32); } + bool isImm1_32() const { if (!isImm()) return false; @@ -594,6 +616,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 33); } + bool isImm0_63() const { if (!isImm()) return false; @@ -603,6 +626,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 64); } + bool isImm1_63() const { if (!isImm()) return false; @@ -612,6 +636,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 64); } + bool isImm1_64() const { if (!isImm()) return false; @@ -621,6 +646,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 65); } + bool isImm0_127() const { if (!isImm()) return false; @@ -630,6 +656,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 128); } + bool isImm0_255() const { if (!isImm()) return false; @@ -639,6 +666,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 256); } + bool isImm0_65535() const { if (!isImm()) return false; @@ -648,6 +676,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 65536); } + bool isImm32_63() const { if (!isImm()) return false; @@ -657,6 +686,7 @@ public: int64_t Val = MCE->getValue(); return (Val >= 32 && Val < 64); } + bool isLogicalImm32() const { if (!isImm()) return false; @@ -669,6 +699,7 @@ public: Val &= 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64() const { if (!isImm()) return false; @@ -677,6 +708,7 @@ public: return false; return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); } + bool isLogicalImm32Not() const { if (!isImm()) return false; @@ -686,6 +718,7 @@ public: int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64Not() const { if (!isImm()) return false; @@ -694,7 +727,9 @@ public: return false; return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64); } + bool isShiftedImm() 
const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { if (!isShiftedImm() && !isImm()) return false; @@ -737,6 +772,7 @@ public: // code deal with it. return true; } + bool isAddSubImmNeg() const { if (!isShiftedImm() && !isImm()) return false; @@ -756,7 +792,9 @@ public: const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr); return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff; } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { if (!isImm()) return false; @@ -765,6 +803,7 @@ public: return false; return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } + bool isBranchTarget26() const { if (!isImm()) return false; @@ -776,6 +815,7 @@ public: return false; return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); } + bool isPCRelLabel19() const { if (!isImm()) return false; @@ -787,6 +827,7 @@ public: return false; return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); } + bool isBranchTarget14() const { if (!isImm()) return false; @@ -891,40 +932,49 @@ public: bool isFPImm() const { return Kind == k_FPImm; } bool isBarrier() const { return Kind == k_Barrier; } bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { if (!isSysReg()) return false; return SysReg.MRSReg != -1U; } + bool isMSRSystemRegister() const { if (!isSysReg()) return false; return SysReg.MSRReg != -1U; } + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; return (SysReg.PStateField == AArch64PState::PAN || SysReg.PStateField == AArch64PState::UAO); } + bool isSystemPStateFieldWithImm0_15() const { if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { return Kind == k_Register && Reg.isVector && AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( Reg.RegNum); } + bool isGPR32as64() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } + bool isWSeqPair() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( Reg.RegNum); } + bool isXSeqPair() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( @@ -957,19 +1007,25 @@ public: bool isVectorIndex1() const { return Kind == k_VectorIndex && VectorIndex.Val == 1; } + bool isVectorIndexB() const { return Kind == k_VectorIndex && VectorIndex.Val < 16; } + bool isVectorIndexH() const { return Kind == k_VectorIndex && VectorIndex.Val < 8; } + bool isVectorIndexS() const { return Kind == k_VectorIndex && VectorIndex.Val < 4; } + bool isVectorIndexD() const { return Kind == k_VectorIndex && VectorIndex.Val < 2; } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { return Kind == k_Token && getToken() == Str; } @@ -1006,6 +1062,7 @@ public: AArch64_AM::ShiftExtendType ET = getShiftExtendType(); return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; } + bool isExtendLSL64() const { if (!isExtend()) return false; @@ -1836,11 +1893,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "<prfop invalid #" << getPrefetch() << ">"; break; } - case k_PSBHint: { + case k_PSBHint: OS << getPSBHintName(); break; - } - case k_ShiftExtend: { + 
case k_ShiftExtend: OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); if (!hasShiftExtendAmount()) @@ -1848,7 +1904,6 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << '>'; break; } - } } /// @name Auto-generated Match Functions @@ -2469,7 +2524,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, Expr = MCConstantExpr::create(op2, getContext()); \ Operands.push_back( \ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) + } while (false) if (Mnemonic == "ic") { if (!Op.compare_lower("ialluis")) { @@ -3979,7 +4034,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } - switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -4550,7 +4604,6 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_InvalidOperand; } - OperandMatchResultTy AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { @@ -4601,7 +4654,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { return MatchOperand_ParseFail; } - if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || + if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || (isXReg && !XRegClass.contains(SecondReg)) || (isWReg && !WRegClass.contains(SecondReg))) { Error(E,"expected second odd register of a " @@ -4610,7 +4663,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { } unsigned Pair = 0; - if(isXReg) { + if (isXReg) { Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64, &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]); } else { diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 24e353cf4b96..bc2f7f181699 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -17,15 +17,12 @@ namespace llvm { -class MCInst; -class raw_ostream; - class AArch64Disassembler : public MCDisassembler { public: AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - ~AArch64Disassembler() {} + ~AArch64Disassembler() override = default; MCDisassembler::DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, @@ -33,6 +30,6 @@ public: raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a1edb3cef46a..c954c0eb2c6b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -17,25 +17,30 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> using namespace llvm; namespace { + class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { public: AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32); - ~AArch64ELFObjectWriter() override; + ~AArch64ELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const 
override; bool IsILP32; -private: }; -} + +} // end anonymous namespace AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, @@ -44,8 +49,6 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} - #define R_CLS(rtype) \ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype #define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index f7058cdf2373..62dfa59483eb 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -15,15 +15,23 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" @@ -37,13 +45,12 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; const MCInstrInfo &MCII; - AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx), MCII(mcii) {} - - ~AArch64MCCodeEmitter() override {} + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete; + void operator=(const AArch64MCCodeEmitter &) = delete; + ~AArch64MCCodeEmitter() override = default; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. @@ -181,12 +188,6 @@ private: } // end anonymous namespace -MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new AArch64MCCodeEmitter(MCII, Ctx); -} - /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. 
unsigned @@ -601,3 +602,9 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( #define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenMCCodeEmitter.inc" + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, Ctx); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 3e86a42d5be6..1b949b54590c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AArch64TargetStreamer.h" #include "llvm/MC/ConstantPools.h" + using namespace llvm; // @@ -21,7 +22,7 @@ using namespace llvm; AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S) : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} -AArch64TargetStreamer::~AArch64TargetStreamer() {} +AArch64TargetStreamer::~AArch64TargetStreamer() = default; // The constant pool handling is shared by all AArch64TargetStreamer // implementations. diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index a8e6902c252b..4acd55eb6120 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -176,12 +176,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + if (!STM.isAmdHsaOS()) { + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + } - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 85cbadf0a570..5f651d4da5d2 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -269,7 +269,7 @@ unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { unsigned encodeWaitcnt(IsaVersion Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { - unsigned Waitcnt = getWaitcntBitMask(Version);; + unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt); Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 10e6297ef1ed..cc001b596785 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -338,14 +338,17 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index d83228afb0ab..731a5adf3d73 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -104,7 +104,8 @@ public: int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, ScalarEvolution *SE, + const SCEV *Ptr); int getFPOpCost(Type *Ty); diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 903f92a04431..57ead973b56e 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -8,23 +8,41 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <memory> namespace llvm { + +// Auto-generated by TableGen +static unsigned MatchRegisterName(StringRef Name); + namespace { + struct LanaiOperand; class LanaiAsmParser : public MCTargetAsmParser { @@ -80,9 +98,6 @@ private: const MCSubtargetInfo &SubtargetInfo; }; -// Auto-generated by TableGen -static unsigned MatchRegisterName(llvm::StringRef Name); - // LanaiOperand - Instances of this class represented a parsed machine // instruction struct LanaiOperand : public MCParsedAsmOperand { @@ -627,6 +642,8 @@ public: } }; +} // end anonymous namespace + bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) { return true; } bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, @@ -680,11 +697,11 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseRegister() { if (Lexer.getKind() == AsmToken::Identifier) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) - return 0; + return nullptr; Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } - return 0; + return nullptr; } bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, @@ -701,15 +718,15 @@ bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() { SMLoc Start = Parser.getTok().getLoc(); SMLoc End = 
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *Res, *RHS = 0; + const MCExpr *Res, *RHS = nullptr; LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None; if (Lexer.getKind() != AsmToken::Identifier) - return 0; + return nullptr; StringRef Identifier; if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; // Check if identifier has a modifier if (Identifier.equals_lower("hi")) @@ -722,24 +739,24 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() { if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::LParen) { Error(Lexer.getLoc(), "Expected '('"); - return 0; + return nullptr; } Lexer.Lex(); // lex '(' // Parse identifier if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; } // If addition parse the RHS. if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS)) - return 0; + return nullptr; // For variants parse the final ')' if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::RParen) { Error(Lexer.getLoc(), "Expected ')'"); - return 0; + return nullptr; } Lexer.Lex(); // lex ')' } @@ -771,7 +788,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() { if (!Parser.parseExpression(ExprVal)) return LanaiOperand::createImm(ExprVal, Start, End); default: - return 0; + return nullptr; } } @@ -1204,10 +1221,9 @@ bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/, #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #include "LanaiGenAsmMatcher.inc" -} // namespace extern "C" void LLVMInitializeLanaiAsmParser() { RegisterMCAsmParser<LanaiAsmParser> x(getTheLanaiTarget()); } -} // namespace llvm +} // end namespace llvm diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h index a317cd88ad63..e0c19e8ea644 100644 --- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h +++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h @@ -20,14 +20,11 @@ namespace llvm { -class MCInst; -class raw_ostream; - class LanaiDisassembler : public MCDisassembler { public: LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx); - ~LanaiDisassembler() override {} + ~LanaiDisassembler() override = default; // getInstruction - See MCDisassembler. 
MCDisassembler::DecodeStatus @@ -36,6 +33,6 @@ public: raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h index 1c9d186ad819..59904fbaa318 100644 --- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h +++ b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h @@ -14,10 +14,10 @@ #ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H #define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { -class MCOperand; class LanaiInstPrinter : public MCInstPrinter { public: @@ -28,14 +28,14 @@ public: void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -60,6 +60,7 @@ private: bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream, StringRef Opcode, int AddOffset); }; -} // namespace llvm + +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index ae7870e07d42..d156294a0b0c 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,31 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "LanaiISelLowering.h" - #include "Lanai.h" +#include "LanaiCondCode.h" +#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" -#include "LanaiTargetMachine.h" #include "LanaiTargetObjectFile.h" +#include "MCTargetDesc/LanaiBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/GlobalValue.h" +#include 
"llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cmath> +#include <cstdint> +#include <cstdlib> +#include <utility> #define DEBUG_TYPE "lanai-lower" @@ -195,6 +210,7 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op, llvm_unreachable("unimplemented operand"); } } + //===----------------------------------------------------------------------===// // Lanai Inline Assembly Support //===----------------------------------------------------------------------===// @@ -244,7 +260,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( Value *CallOperandVal = Info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (CallOperandVal == nullptr) return CW_Default; // Look at the constraint type. switch (*Constraint) { @@ -270,7 +286,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( void LanaiTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result(nullptr, 0); // Only support length 1 constraints for now. if (Constraint.length() > 1) @@ -676,7 +692,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (StackPtr.getNode() == nullptr) StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP, getPointerTy(DAG.getDataLayout())); @@ -1120,7 +1136,7 @@ const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const { case LanaiISD::SMALL: return "LanaiISD::SMALL"; default: - return NULL; + return nullptr; } } diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h index 8b84bbc460e8..c6e459076ebc 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.h +++ b/lib/Target/Lanai/LanaiRegisterInfo.h @@ -21,9 +21,6 @@ namespace llvm { -class TargetInstrInfo; -class Type; - struct LanaiRegisterInfo : public LanaiGenRegisterInfo { LanaiRegisterInfo(); @@ -32,7 +29,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { // Code Generation virtual methods. 
const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -42,7 +39,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; bool canRealignStack(const MachineFunction &MF) const override; @@ -58,6 +55,6 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { int getDwarfRegNum(unsigned RegNum, bool IsEH) const; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index e30d5e9a18eb..e02bba529bd5 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -9,20 +9,19 @@ #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; namespace { + class LanaiELFObjectWriter : public MCELFObjectTargetWriter { public: explicit LanaiELFObjectWriter(uint8_t OSABI); - ~LanaiELFObjectWriter() override; + ~LanaiELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -30,14 +29,13 @@ protected: bool needsRelocateWithSymbol(const MCSymbol &SD, unsigned Type) const override; }; -} // namespace + +} // end anonymous namespace LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI, /*HasRelocationAddend=*/true) {} -LanaiELFObjectWriter::~LanaiELFObjectWriter() {} - unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/, const MCValue & /*Target*/, const MCFixup &Fixup, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp index ce68b7e24dba..f5b5335bb989 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp @@ -12,37 +12,38 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> #define DEBUG_TYPE "mccodeemitter" STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace llvm { + namespace { -class LanaiMCCodeEmitter : public MCCodeEmitter { - LanaiMCCodeEmitter(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT - const MCInstrInfo 
&InstrInfo; - MCContext &Context; +class LanaiMCCodeEmitter : public MCCodeEmitter { public: - LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) - : InstrInfo(MCII), Context(C) {} - - ~LanaiMCCodeEmitter() override {} + LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) {} + LanaiMCCodeEmitter(const LanaiMCCodeEmitter &) = delete; + void operator=(const LanaiMCCodeEmitter &) = delete; + ~LanaiMCCodeEmitter() override = default; // The functions below are called by TableGen generated functions for getting // the binary encoding of instructions/opereands. @@ -86,6 +87,8 @@ public: const MCSubtargetInfo &STI) const; }; +} // end anonymous namespace + Lanai::Fixups FixupKind(const MCExpr *Expr) { if (isa<MCSymbolRefExpr>(Expr)) return Lanai::FIXUP_LANAI_21; @@ -298,8 +301,8 @@ unsigned LanaiMCCodeEmitter::getBranchTargetOpValue( } #include "LanaiGenMCCodeEmitter.inc" -} // namespace -} // namespace llvm + +} // end namespace llvm llvm::MCCodeEmitter * llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index c2f8c0f7ad50..a47ff9ff3d61 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -11,16 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "LanaiMCAsmInfo.h" #include "LanaiMCTargetDesc.h" - #include "InstPrinter/LanaiInstPrinter.h" -#include "LanaiMCAsmInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <cstdint> +#include <string> #define GET_INSTRINFO_MC_DESC #include "LanaiGenInstrInfo.inc" @@ -70,7 +75,7 @@ static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) return new LanaiInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, @@ -79,6 +84,7 @@ static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, } namespace { + class LanaiMCInstrAnalysis : public MCInstrAnalysis { public: explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info) @@ -107,6 +113,7 @@ public: } } }; + } // end anonymous namespace static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) { @@ -131,7 +138,7 @@ extern "C" void LLVMInitializeLanaiTargetMC() { // Register the MC code emitter TargetRegistry::RegisterMCCodeEmitter(getTheLanaiTarget(), - llvm::createLanaiMCCodeEmitter); + createLanaiMCCodeEmitter); // Register the ASM Backend TargetRegistry::RegisterMCAsmBackend(getTheLanaiTarget(), diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index d3c88482f092..05acd25ae5fc 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -47,7 +47,7 @@ namespace llvm { FCTIDZ, FCTIWZ, /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers. + /// unsigned integers with round toward zero. 
FCTIDUZ, FCTIWUZ, /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 03b2257a88a8..fbec8787ef8d 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1154,6 +1154,9 @@ defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), "fctid", "$frD, $frB", IIC_FPGeneral, []>, isPPC64; +defm FCTIDU : XForm_26r<63, 942, (outs f8rc:$frD), (ins f8rc:$frB), + "fctidu", "$frD, $frB", IIC_FPGeneral, + []>, isPPC64; defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), "fctidz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 99689f656c2d..ef7d2012a233 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -603,6 +603,12 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } +class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XForm_17<opcode, xo, OOL, IOL, asmstr, itin > { + let FRA = 0; +} + // Used for QPX class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index a7231bd2e2c0..90111bbea07d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -2172,11 +2172,19 @@ let isCompare = 1, hasSideEffects = 0 in { "fcmpu $crD, $fA, $fB", IIC_FPCompare>; } +def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "ftdiv $crD, $fA, $fB", IIC_FPCompare>; +def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), + "ftsqrt $crD, $fB", IIC_FPCompare>; + let Uses = [RM] in { let hasSideEffects = 0 in { defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), "fctiw", "$frD, $frB", IIC_FPGeneral, []>; + defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwu", "$frD, $frB", IIC_FPGeneral, + []>; defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), "fctiwz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctiwz f64:$frB))]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fd2189397279..7f72ab17f619 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16985,10 +16985,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC) if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; - } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y @@ -18289,6 +18288,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// constant. Takes immediate version of shift as input. 
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); @@ -18306,27 +18306,32 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - const X86Subtarget &Subtarget = - static_cast<const X86Subtarget &>(DAG.getSubtarget()); - if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - // Let the shuffle legalizer expand this shift amount node. + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + // +=================+============+=======================================+ + // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | + // +=================+============+=======================================+ + // | i64 | Yes, No | Use ShAmt as lowest elt | + // | i32 | Yes | zero-extend in-reg | + // | (i32 zext(i16)) | Yes | zero-extend in-reg | + // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | + // +=================+============+=======================================+ + + if (SVT == MVT::i64) + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); + else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); + ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + } else if (Subtarget.hasSSE41() && + ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - SmallVector<SDValue, 4> ShOps; - ShOps.push_back(ShAmt); - if (SVT == MVT::i32) { - ShOps.push_back(DAG.getConstant(0, dl, SVT)); - ShOps.push_back(DAG.getUNDEF(SVT)); - } - ShOps.push_back(DAG.getUNDEF(SVT)); - - MVT BVT = SVT == MVT::i32 ? 
MVT::v4i32 : MVT::v2i64; - ShAmt = DAG.getBuildVector(BVT, dl, ShOps); + SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), + DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; + ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element @@ -19014,7 +19019,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); + Op.getOperand(1), Op.getOperand(2), Subtarget, + DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -21276,7 +21282,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } } @@ -25951,12 +25957,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - bool FloatDomain = MaskVT.isFloatingPoint() || - (!Subtarget.hasAVX2() && MaskVT.is256BitVector()); // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && @@ -26067,11 +26072,11 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); bool ContainsZeros = false; SmallBitVector Zeroable(NumMaskElts, false); @@ -26211,11 +26216,10 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - SDValue &V1, SDValue &V2, + bool FloatDomain, SDValue &V1, SDValue &V2, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { - bool FloatDomain = MaskVT.isFloatingPoint(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -26310,13 +26314,13 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); // Attempt to match against PALIGNR byte rotate. 
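The shift-amount table added to getTargetVShiftNode above exists because SSE/AVX packed shifts (PSRLD and friends) consume the entire low 64 bits of the count operand as one unsigned count. A scalar sketch of why the amount must be zero-extended rather than left with undefined upper bits (illustrative only, not LLVM code):

#include <cstdint>

// Model of one lane of PSRLD xmm, xmm: the count is the low 64 bits of the
// second operand; counts of 32 or more zero the lane.
static uint32_t psrldLane(uint32_t Lane, uint64_t CountLow64) {
  return CountLow64 >= 32 ? 0 : Lane >> CountLow64;
}

// If a 16-bit shift amount lands in the count register without zeroing the
// bits above it, those stale bits become part of the 64-bit count and can
// turn a small shift into "shift out everything"; hence the
// zero-extend-in-reg and build_vector(ShAmt, 0, ...) forms in the table.
static uint32_t withStaleHighBits(uint32_t Lane, uint16_t ShAmt,
                                  uint64_t StaleHighBits) {
  const uint64_t BadCount = (StaleHighBits << 16) | ShAmt; // not zero-extended
  return psrldLane(Lane, BadCount);
}

int main() {
  const uint32_t Lane = 0x80u;
  return (psrldLane(Lane, 4) == 0x8u &&          // intended shift by 4
          withStaleHighBits(Lane, 4, 0x1) == 0u) // stale high bits ruin it
             ? 0 : 1;
}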
if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || @@ -26594,8 +26598,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26609,8 +26613,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, - ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, + Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26626,8 +26630,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle, - ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, + Shuffle, ShuffleVT, UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26643,8 +26647,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, + DAG, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -28742,6 +28747,27 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DAG.getConstant(Imm, DL, MVT::i8))); return true; } + case ISD::EXTRACT_SUBVECTOR: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + // Only change element size, not type. + if (VT.isInteger() != OpEltVT.isInteger()) + return false; + uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; + // Op0 needs to be bitcasted to a larger vector with the same element type. + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, + DAG.getConstant(Imm, DL, MVT::i8))); + return true; + } } return false; @@ -30921,6 +30947,59 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. 
+static bool
+isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX512())
+    return false;
+  EVT SrcElVT = SrcVT.getScalarType();
+  EVT DstElVT = DstVT.getScalarType();
+  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
+    return false;
+  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+    return false;
+  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
+    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
+  return false;
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value to be truncated, or SDValue() if the pattern was
+/// not matched or is unsupported on the current target.
+static SDValue
+detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
+  if (In.getOpcode() != ISD::UMIN)
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  // FIXME: Scalar type may be supported if we move it to vector register.
+  if (!InVT.isVector() || !InVT.isSimple())
+    return SDValue();
+
+  if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
+    return SDValue();
+
+  // Saturation with truncation. We truncate from InVT to VT.
+  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+         "Unexpected types for truncate operation");
+
+  SDValue SrcVal;
+  APInt C;
+  if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
+    SrcVal = In.getOperand(1);
+  else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
+    SrcVal = In.getOperand(0);
+  else
+    return SDValue();
+
+  // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+  // the element size of the destination type.
+  return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
+         SrcVal : SDValue();
+}
+
 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
 /// X86ISD::AVG instruction.
@@ -31487,6 +31566,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                             St->getPointerInfo(), St->getAlignment(),
                             St->getMemOperand()->getFlags());
 
+    if (SDValue Val =
+        detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+                             dl, Val, St->getBasePtr(),
+                             St->getMemoryVT(), St->getMemOperand(), DAG);
+
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
@@ -31967,7 +32052,8 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
 
 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
 static SDValue
-combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
+                                  SelectionDAG &DAG,
                                   SmallVector<SDValue, 8> &Regs) {
   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
   EVT OutVT = N->getValueType(0);
@@ -31976,8 +32062,10 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
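As a scalar illustration of the pattern detectUSatPattern recognizes above: truncation with unsigned saturation clamps the value to the destination type's maximum before truncating, and a vector of such operations is what the saturating-truncate instructions compute. A minimal standalone sketch (the function name and test values are illustrative, not taken from the patch):

#include <cstdint>
#include <cassert>

static uint8_t truncUSat32to8(uint32_t X) {
  // umin(x, UINT8_MAX) followed by a plain truncate.
  uint32_t Clamped = X < 255u ? X : 255u;
  return static_cast<uint8_t>(Clamped);
}

int main() {
  assert(truncUSat32to8(42) == 42);    // in range: value preserved
  assert(truncUSat32to8(1000) == 255); // out of range: saturates to 0xFF
  return 0;
}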
SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { - Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); - Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) @@ -32046,7 +32134,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) - return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); else return SDValue(); } @@ -32104,6 +32192,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try the truncation with unsigned saturation. + if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index d7792e296a58..de4839432b9a 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -80,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { - if (ST->hasAVX512()) return 512; - if (ST->hasAVX()) return 256; - if (ST->hasSSE1()) return 128; + if (ST->hasAVX512()) + return 512; + if (ST->hasAVX()) + return 256; + if (ST->hasSSE1()) + return 128; return 0; } @@ -211,11 +214,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX512DQ lowering tricks for custom cases. - if (ST->hasDQI()) { - if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, - LT.second)) + if (ST->hasDQI()) + if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512BWCostTable[] = { { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. @@ -225,37 +226,38 @@ int X86TTIImpl::getArithmeticInstrCost( // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v64i8, 64*20 }, { ISD::SDIV, MVT::v32i16, 32*20 }, - { ISD::SDIV, MVT::v16i32, 16*20 }, - { ISD::SDIV, MVT::v8i64, 8*20 }, { ISD::UDIV, MVT::v64i8, 64*20 }, - { ISD::UDIV, MVT::v32i16, 32*20 }, - { ISD::UDIV, MVT::v16i32, 16*20 }, - { ISD::UDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v32i16, 32*20 } }; // Look for AVX512BW lowering tricks for custom cases. - if (ST->hasBWI()) { - if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, - LT.second)) + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512CostTable[] = { - { ISD::SHL, MVT::v16i32, 1 }, - { ISD::SRL, MVT::v16i32, 1 }, - { ISD::SRA, MVT::v16i32, 1 }, - { ISD::SHL, MVT::v8i64, 1 }, - { ISD::SRL, MVT::v8i64, 1 }, - { ISD::SRA, MVT::v8i64, 1 }, - - { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. 
+ { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i32, 1 }, // pmulld + { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v16i32, 16*20 }, + { ISD::SDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v16i32, 16*20 }, + { ISD::UDIV, MVT::v8i64, 8*20 } }; - if (ST->hasAVX512()) { + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to @@ -315,10 +317,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for XOP lowering tricks. - if (ST->hasXOP()) { + if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. @@ -334,6 +335,8 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ @@ -344,11 +347,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) { + if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVXCustomCostTable[] = { { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. @@ -372,24 +374,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX()) { + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - - static const CostTblEntry SSE42FloatCostTable[] = { - { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ - }; - - if (ST->hasSSE42()) { - if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - } static const CostTblEntry SSE2UniformCostTable[] = { @@ -452,6 +440,17 @@ int X86TTIImpl::getArithmeticInstrCost( ISD = ISD::MUL; } + static const CostTblEntry SSE42CostTable[] = { + { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ + }; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE41CostTable[] = { { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. 
{ ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. @@ -471,44 +470,39 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; - if (ST->hasSSE41()) { + if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ @@ -531,10 +525,9 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v2i64, 2*20 }, }; - if (ST->hasSSE2()) { + if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized @@ -553,307 +546,278 @@ int X86TTIImpl::getArithmeticInstrCost( // A v4i64 multiply is custom lowered as two split v2i64 vectors that then // are lowered as a series of long multiplies(3), shifts(3) and adds(2) // Because we believe v4i64 to be a legal type, we must also include the - // split factor of two in the cost table. Therefore, the cost here is 16 + // extract+insert in the cost table. Therefore, the cost here is 18 // instead of 8. - { ISD::MUL, MVT::v4i64, 16 }, + { ISD::MUL, MVT::v4i64, 18 }, }; // Look for AVX1 lowering tricks. - if (ST->hasAVX() && !ST->hasAVX2()) { - MVT VT = LT.second; - - if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + if (ST->hasAVX() && !ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - // Custom lowering of vectors. 
- static const CostTblEntry CustomLowered[] = { - // A v2i64/v4i64 and multiply is custom lowered as a series of long - // multiplies(3), shifts(3) and adds(2). - { ISD::MUL, MVT::v2i64, 8 }, - { ISD::MUL, MVT::v4i64, 8 }, - { ISD::MUL, MVT::v8i64, 8 } - }; - if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) - return LT.first * Entry->Cost; - - // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, - // 2x pmuludq, 2x shuffle. - if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && - !ST->hasSSE41()) - return LT.first * 6; - - static const CostTblEntry SSE1FloatCostTable[] = { + static const CostTblEntry SSE1CostTable[] = { { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ }; if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD, - LT.second)) + if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; + // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) { - // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb - { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = - CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. + if (Kind == TTI::SK_Broadcast) + LT.first = 1; + + // We are going to permute multiple sources and the result will be in multiple + // destinations. Providing an accurate cost only for splits where the element + // type remains the same. 
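To make the split-cost model in the comment above concrete, consider a hypothetical single-source permute of v16i32 on a target whose widest legal integer vector is v8i32: legalization produces two source and two destination registers, and the model charges (NumOfSrcs - 1) * NumOfDests two-input shuffles of the legal type. A small sketch of that arithmetic (the concrete types are an assumed example, not from the patch):

#include <cassert>

int main() {
  const unsigned VecTySize   = 16 * 4; // v16i32: 64 bytes
  const unsigned LegalVTSize = 8 * 4;  // v8i32: 32 bytes (assumed legal width)
  const unsigned NumOfSrcs   = (VecTySize + LegalVTSize - 1) / LegalVTSize;
  const unsigned NumOfDests  = 2;      // LT.first after splitting v16i32
  // Each destination is assembled by chaining two-input shuffles over the
  // remaining sources, giving (NumOfSrcs - 1) * NumOfDests shuffles in total.
  assert((NumOfSrcs - 1) * NumOfDests == 2);
  return 0;
}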
+ if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } - static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 - }; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } - if (ST->hasBWI()) - if (const auto *Entry = - CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // For 2-input shuffles, we must account for splitting the 2 inputs into many. + if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { + // We assume that source and destination have the same vector type. + int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + LT.first = NumOfDests * NumOfShufflesPerDest; + } - static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd - }; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb + { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb - static const CostTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b + }; - { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw - { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb - }; + if (ST->hasVBMI()) + if (const auto *Entry = + CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512BWShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 + + { 
TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc + + { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc + }; - static const CostTblEntry AVX1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - - { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor - }; + if (ST->hasBWI()) + if (const auto *Entry = + CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + + { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // 
vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d + }; - static const CostTblEntry SSE41ShuffleTbl[] = { - { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps - { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb - }; - - if (ST->hasSSE41()) - if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSSE3ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb - { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd + { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + + { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw + { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + }; - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por - }; + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasSSSE3()) - if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + + { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + }; - static const CostTblEntry SSE2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd - { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, 
MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd - { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw - // + 2*pshufd + 2*unpck + packus - - { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por - }; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSE1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps - { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps - }; + static const CostTblEntry SSE41ShuffleTbl[] = { + { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps + { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb + }; - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - } else if (Kind == TTI::SK_PermuteTwoSrc) { - // We assume that source and destination have the same vector type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int NumOfDests = LT.first; - int NumOfShufflesPerDest = LT.first * 2 - 1; - int NumOfShuffles = NumOfDests * NumOfShufflesPerDest; - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1 - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d - }; + static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // 
pshufb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - } else if (Kind == TTI::SK_PermuteSingleSrc) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - if (LT.first == 1) { - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16 - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb - }; - - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - } else { - // We are going to permute multiple sources and the result will be in - // multiple destinations. Providing an accurate cost only for splits where - // the element type remains the same. 
- - MVT LegalVT = LT.second; - if (LegalVT.getVectorElementType().getSizeInBits() == - Tp->getVectorElementType()->getPrimitiveSizeInBits() && - LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { - - unsigned VecTySize = DL.getTypeStoreSize(Tp); - unsigned LegalVTSize = LegalVT.getStoreSize(); - // Number of source vectors after legalization: - unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; - // Number of destination vectors after legalization: - unsigned NumOfDests = LT.first; - - Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), - LegalVT.getVectorNumElements()); - - unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; - return NumOfShuffles * - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); - } - } - } + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + + { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + }; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + }; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } @@ -1623,17 +1587,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; + // Cost modeling of Strided Access Computation is hidden by the indexing + // modes of X86 regardless of the stride value. 
We dont believe that there + // is a difference between constant strided access in gerenal and constant + // strided value which is less than or equal to 64. + // Even in the case of (loop invariant) stride whose value is not known at + // compile time, the address computation will not incur more than one extra + // ADD instruction. + if (Ty->isVectorTy() && SE) { + if (!BaseT::isStridedAccess(Ptr)) + return NumVectorInstToHideOverhead; + if (!BaseT::getConstantStrideStep(SE, Ptr)) + return 1; + } - return BaseT::getAddressComputationCost(Ty, IsComplex); + return BaseT::getAddressComputationCost(Ty, SE, Ptr); } int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index f6bcb9f569e4..c013805f4321 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -71,7 +71,8 @@ public: unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment); - int getAddressComputationCost(Type *PtrTy, bool IsComplex); + int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF); diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 6dd95f8dcd55..6b32f6c31f72 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -36,7 +36,10 @@ using namespace llvm; -STATISTIC(NumImported, "Number of functions imported"); +STATISTIC(NumImportedFunctions, "Number of functions imported"); +STATISTIC(NumImportedModules, "Number of modules imported from"); +STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); +STATISTIC(NumLiveSymbols, "Number of live symbols in index"); /// Limit on instruction count of imported functions. static cl::opt<unsigned> ImportInstrLimit( @@ -69,6 +72,9 @@ static cl::opt<float> ImportColdMultiplier( static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden, cl::desc("Print imported functions")); +static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden, + cl::desc("Compute dead symbols")); + // Temporary allows the function import pass to disable always linking // referenced discardable symbols. static cl::opt<bool> @@ -105,78 +111,6 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName, namespace { -// Return true if the Summary describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or renaming -// is possible (does not have a section for instance). -static bool canBeExternallyReferenced(const GlobalValueSummary &Summary) { - if (!Summary.needsRenaming()) - return true; - - if (Summary.noRename()) - // Can't externally reference a global that needs renaming if has a section - // or is referenced from inline assembly, for example. - return false; - - return true; -} - -// Return true if \p GUID describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or -// renaming is possible (does not have a section for instance). 
-static bool canBeExternallyReferenced(const ModuleSummaryIndex &Index, - GlobalValue::GUID GUID) { - auto Summaries = Index.findGlobalValueSummaryList(GUID); - if (Summaries == Index.end()) - return true; - if (Summaries->second.size() != 1) - // If there are multiple globals with this GUID, then we know it is - // not a local symbol, and it is necessarily externally referenced. - return true; - - // We don't need to check for the module path, because if it can't be - // externally referenced and we call it, it is necessarilly in the same - // module - return canBeExternallyReferenced(**Summaries->second.begin()); -} - -// Return true if the global described by \p Summary can be imported in another -// module. -static bool eligibleForImport(const ModuleSummaryIndex &Index, - const GlobalValueSummary &Summary) { - if (!canBeExternallyReferenced(Summary)) - // Can't import a global that needs renaming if has a section for instance. - // FIXME: we may be able to import it by copying it without promotion. - return false; - - // Don't import functions that are not viable to inline. - if (Summary.isNotViableToInline()) - return false; - - // Check references (and potential calls) in the same module. If the current - // value references a global that can't be externally referenced it is not - // eligible for import. First check the flag set when we have possible - // opaque references (e.g. inline asm calls), then check the call and - // reference sets. - if (Summary.hasInlineAsmMaybeReferencingInternal()) - return false; - bool AllRefsCanBeExternallyReferenced = - llvm::all_of(Summary.refs(), [&](const ValueInfo &VI) { - return canBeExternallyReferenced(Index, VI.getGUID()); - }); - if (!AllRefsCanBeExternallyReferenced) - return false; - - if (auto *FuncSummary = dyn_cast<FunctionSummary>(&Summary)) { - bool AllCallsCanBeExternallyReferenced = llvm::all_of( - FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { - return canBeExternallyReferenced(Index, Edge.first.getGUID()); - }); - if (!AllCallsCanBeExternallyReferenced) - return false; - } - return true; -} - /// Given a list of possible callee implementation for a call site, select one /// that fits the \p Threshold. /// @@ -214,7 +148,7 @@ selectCallee(const ModuleSummaryIndex &Index, if (Summary->instCount() > Threshold) return false; - if (!eligibleForImport(Index, *Summary)) + if (Summary->notEligibleToImport()) return false; return true; @@ -346,7 +280,8 @@ static void computeImportForFunction( static void ComputeImportForModule( const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList, - StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) { + StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr, + const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) { // Worklist contains the list of function imported in this module, for which // we will analyse the callees and may import further down the callgraph. 
SmallVector<EdgeInfo, 128> Worklist; @@ -354,6 +289,10 @@ static void ComputeImportForModule( // Populate the worklist with the import for the functions in the current // module for (auto &GVSummary : DefinedGVSummaries) { + if (DeadSymbols && DeadSymbols->count(GVSummary.first)) { + DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); + continue; + } auto *Summary = GVSummary.second; if (auto *AS = dyn_cast<AliasSummary>(Summary)) Summary = &AS->getAliasee(); @@ -393,14 +332,15 @@ void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, StringMap<FunctionImporter::ImportMapTy> &ImportLists, - StringMap<FunctionImporter::ExportSetTy> &ExportLists) { + StringMap<FunctionImporter::ExportSetTy> &ExportLists, + const DenseSet<GlobalValue::GUID> *DeadSymbols) { // For each module that has function defined, compute the import/export lists. for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) { auto &ImportList = ImportLists[DefinedGVSummaries.first()]; DEBUG(dbgs() << "Computing import for Module '" << DefinedGVSummaries.first() << "'\n"); ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList, - &ExportLists); + &ExportLists, DeadSymbols); } // When computing imports we added all GUIDs referenced by anything @@ -462,6 +402,86 @@ void llvm::ComputeCrossModuleImportForModule( #endif } +DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols( + const ModuleSummaryIndex &Index, + const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) { + if (!ComputeDead) + return DenseSet<GlobalValue::GUID>(); + if (GUIDPreservedSymbols.empty()) + // Don't do anything when nothing is live, this is friendly with tests. + return DenseSet<GlobalValue::GUID>(); + DenseSet<GlobalValue::GUID> LiveSymbols = GUIDPreservedSymbols; + SmallVector<GlobalValue::GUID, 128> Worklist; + Worklist.reserve(LiveSymbols.size() * 2); + for (auto GUID : LiveSymbols) { + DEBUG(dbgs() << "Live root: " << GUID << "\n"); + Worklist.push_back(GUID); + } + // Add values flagged in the index as live roots to the worklist. 
+ for (const auto &Entry : Index) { + bool IsLiveRoot = llvm::any_of( + Entry.second, + [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) { + return Summary->liveRoot(); + }); + if (!IsLiveRoot) + continue; + DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n"); + Worklist.push_back(Entry.first); + } + + while (!Worklist.empty()) { + auto GUID = Worklist.pop_back_val(); + auto It = Index.findGlobalValueSummaryList(GUID); + if (It == Index.end()) { + DEBUG(dbgs() << "Not in index: " << GUID << "\n"); + continue; + } + + // FIXME: we should only make the prevailing copy live here + for (auto &Summary : It->second) { + for (auto Ref : Summary->refs()) { + auto RefGUID = Ref.getGUID(); + if (LiveSymbols.insert(RefGUID).second) { + DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n"); + Worklist.push_back(RefGUID); + } + } + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { + for (auto Call : FS->calls()) { + auto CallGUID = Call.first.getGUID(); + if (LiveSymbols.insert(CallGUID).second) { + DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n"); + Worklist.push_back(CallGUID); + } + } + } + if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) { + auto AliaseeGUID = AS->getAliasee().getOriginalName(); + if (LiveSymbols.insert(AliaseeGUID).second) { + DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n"); + Worklist.push_back(AliaseeGUID); + } + } + } + } + DenseSet<GlobalValue::GUID> DeadSymbols; + DeadSymbols.reserve( + std::min(Index.size(), Index.size() - LiveSymbols.size())); + for (auto &Entry : Index) { + auto GUID = Entry.first; + if (!LiveSymbols.count(GUID)) { + DEBUG(dbgs() << "Marking dead: " << GUID << "\n"); + DeadSymbols.insert(GUID); + } + } + DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and " + << DeadSymbols.size() << " symbols Dead \n"); + NumDeadSymbols += DeadSymbols.size(); + NumLiveSymbols += LiveSymbols.size(); + return DeadSymbols; +} + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. void llvm::gatherImportedSummariesForModule( @@ -625,7 +645,6 @@ Expected<bool> FunctionImporter::importFunctions( // now, before linking it (otherwise this will be a noop). if (Error Err = SrcModule->materializeMetadata()) return std::move(Err); - UpgradeDebugInfo(*SrcModule); auto &ImportGUIDs = FunctionsToImportPerModule->second; // Find the globals to import @@ -698,6 +717,10 @@ Expected<bool> FunctionImporter::importFunctions( } } + // Upgrade debug info after we're done materializing all the globals and we + // have loaded all the required metadata! + UpgradeDebugInfo(*SrcModule); + // Link in the specified functions. 
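The computeDeadSymbols routine above is essentially a reachability walk over the summary graph: seed a worklist with the preserved GUIDs and the summaries flagged as live roots, follow reference, call, and aliasee edges, and declare everything never reached dead. A minimal generic sketch of the same walk (the adjacency map stands in for the index edges and is an illustration, not the pass's actual data structure):

#include <cstdint>
#include <map>
#include <set>
#include <vector>

using GUID = uint64_t;

std::set<GUID> computeDead(const std::map<GUID, std::vector<GUID>> &Edges,
                           const std::set<GUID> &Roots) {
  std::set<GUID> Live(Roots.begin(), Roots.end());
  std::vector<GUID> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    GUID G = Worklist.back();
    Worklist.pop_back();
    auto It = Edges.find(G);
    if (It == Edges.end())
      continue;                       // not in the index: nothing to follow
    for (GUID Succ : It->second)
      if (Live.insert(Succ).second)   // first visit: keep walking from here
        Worklist.push_back(Succ);
  }
  std::set<GUID> Dead;
  for (const auto &Entry : Edges)
    if (!Live.count(Entry.first))
      Dead.insert(Entry.first);
  return Dead;
}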
if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport)) return true; @@ -717,9 +740,10 @@ Expected<bool> FunctionImporter::importFunctions( report_fatal_error("Function Import: link error"); ImportedCount += GlobalsToImport.size(); + NumImportedModules++; } - NumImported += ImportedCount; + NumImportedFunctions += ImportedCount; DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " << DestModule.getModuleIdentifier() << "\n"); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 2948878cffc4..f4742aaf748f 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -27,9 +27,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/TrailingObjects.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" @@ -52,6 +55,20 @@ static cl::opt<bool> AvoidReuse( cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); +static cl::opt<std::string> ClSummaryAction( + "lowertypetests-summary-action", + cl::desc("What to do with the summary when running this pass"), cl::Hidden); + +static cl::opt<std::string> ClReadSummary( + "lowertypetests-read-summary", + cl::desc("Read summary from given YAML file before running pass"), + cl::Hidden); + +static cl::opt<std::string> ClWriteSummary( + "lowertypetests-write-summary", + cl::desc("Write summary to given YAML file after running pass"), + cl::Hidden); + bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { if (Offset < ByteOffset) return false; @@ -66,38 +83,6 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { return Bits.count(BitOffset); } -bool BitSetInfo::containsValue( - const DataLayout &DL, - const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V, - uint64_t COffset) const { - if (auto GV = dyn_cast<GlobalObject>(V)) { - auto I = GlobalLayout.find(GV); - if (I == GlobalLayout.end()) - return false; - return containsGlobalOffset(I->second + COffset); - } - - if (auto GEP = dyn_cast<GEPOperator>(V)) { - APInt APOffset(DL.getPointerSizeInBits(0), 0); - bool Result = GEP->accumulateConstantOffset(DL, APOffset); - if (!Result) - return false; - COffset += APOffset.getZExtValue(); - return containsValue(DL, GlobalLayout, GEP->getPointerOperand(), COffset); - } - - if (auto Op = dyn_cast<Operator>(V)) { - if (Op->getOpcode() == Instruction::BitCast) - return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset); - - if (Op->getOpcode() == Instruction::Select) - return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) && - containsValue(DL, GlobalLayout, Op->getOperand(2), COffset); - } - - return false; -} - void BitSetInfo::print(raw_ostream &OS) const { OS << "offset " << ByteOffset << " size " << BitSize << " align " << (1 << AlignLog2); @@ -204,7 +189,7 @@ struct ByteArrayInfo { std::set<uint64_t> Bits; uint64_t BitSize; GlobalVariable *ByteArray; - Constant *Mask; + GlobalVariable *MaskGlobal; }; /// A POD-like structure that we use to store a global reference together with @@ -241,6 +226,9 @@ public: class LowerTypeTestsModule { Module &M; + // This is for testing purposes only. 
+ std::unique_ptr<ModuleSummaryIndex> OwnedSummary; + bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; Triple::OSType OS; @@ -248,6 +236,7 @@ class LowerTypeTestsModule { IntegerType *Int1Ty = Type::getInt1Ty(M.getContext()); IntegerType *Int8Ty = Type::getInt8Ty(M.getContext()); + PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); IntegerType *Int32Ty = Type::getInt32Ty(M.getContext()); PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty); IntegerType *Int64Ty = Type::getInt64Ty(M.getContext()); @@ -259,6 +248,37 @@ class LowerTypeTestsModule { // Mapping from type identifiers to the call sites that test them. DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites; + /// This structure describes how to lower type tests for a particular type + /// identifier. It is either built directly from the global analysis (during + /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type + /// identifier summaries and external symbol references (in ThinLTO backends). + struct TypeIdLowering { + TypeTestResolution::Kind TheKind; + + /// All except Unsat: the start address within the combined global. + Constant *OffsetedGlobal; + + /// ByteArray, Inline, AllOnes: log2 of the required global alignment + /// relative to the start address. + Constant *AlignLog2; + + /// ByteArray, Inline, AllOnes: size of the memory region covering members + /// of this type identifier as a multiple of 2^AlignLog2. + Constant *Size; + + /// ByteArray, Inline, AllOnes: range of the size expressed as a bit width. + unsigned SizeBitWidth; + + /// ByteArray: the byte array to test the address against. + Constant *TheByteArray; + + /// ByteArray: the bit mask to apply to bytes loaded from the byte array. + Constant *BitMask; + + /// Inline: the bit mask to test the address against. 
+ Constant *InlineBits; + }; + std::vector<ByteArrayInfo> ByteArrayInfos; Function *WeakInitializerFn = nullptr; @@ -268,15 +288,13 @@ class LowerTypeTestsModule { const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); - Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, + Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL, Value *BitOffset); void lowerTypeTestCalls( ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); - Value * - lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobal, - const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); + Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL); void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals); unsigned getJumpTableEntrySize(); @@ -302,6 +320,7 @@ class LowerTypeTestsModule { public: LowerTypeTestsModule(Module &M); + ~LowerTypeTestsModule(); bool lower(); }; @@ -380,7 +399,7 @@ ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) { BAI->Bits = BSI.Bits; BAI->BitSize = BSI.BitSize; BAI->ByteArray = ByteArrayGlobal; - BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty); + BAI->MaskGlobal = MaskGlobal; return BAI; } @@ -399,8 +418,9 @@ void LowerTypeTestsModule::allocateByteArrays() { uint8_t Mask; BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask); - BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask)); - cast<GlobalVariable>(BAI->Mask->getOperand(0))->eraseFromParent(); + BAI->MaskGlobal->replaceAllUsesWith( + ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy)); + BAI->MaskGlobal->eraseFromParent(); } Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes); @@ -435,101 +455,121 @@ void LowerTypeTestsModule::allocateByteArrays() { ByteArraySizeBytes = BAB.Bytes.size(); } -/// Build a test that bit BitOffset is set in BSI, where -/// BitSetGlobal is a global containing the bits in BSI. -Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, - ByteArrayInfo *&BAI, +/// Build a test that bit BitOffset is set in the type identifier that was +/// lowered to TIL, which must be either an Inline or a ByteArray. +Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, + const TypeIdLowering &TIL, Value *BitOffset) { - if (BSI.BitSize <= 64) { + if (TIL.TheKind == TypeTestResolution::Inline) { // If the bit set is sufficiently small, we can avoid a load by bit testing // a constant. - IntegerType *BitsTy; - if (BSI.BitSize <= 32) - BitsTy = Int32Ty; - else - BitsTy = Int64Ty; - - uint64_t Bits = 0; - for (auto Bit : BSI.Bits) - Bits |= uint64_t(1) << Bit; - Constant *BitsConst = ConstantInt::get(BitsTy, Bits); - return createMaskedBitTest(B, BitsConst, BitOffset); + return createMaskedBitTest(B, TIL.InlineBits, BitOffset); } else { - if (!BAI) { - ++NumByteArraysCreated; - BAI = createByteArray(BSI); - } - - Constant *ByteArray = BAI->ByteArray; - Type *Ty = BAI->ByteArray->getValueType(); + Constant *ByteArray = TIL.TheByteArray; if (!LinkerSubsectionsViaSymbols && AvoidReuse) { // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. 
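The ByteArray branch of createBitSetTest above loads a single byte at the computed bit offset, masks it, and compares against zero. For illustration, the scalar equivalent of the IR it emits (function and parameter names are made up for the example):

#include <cstdint>

bool byteArrayTest(const uint8_t *ByteArray, uint64_t BitOffset, uint8_t Mask) {
  uint8_t Byte = ByteArray[BitOffset]; // B.CreateGEP + B.CreateLoad
  return (Byte & Mask) != 0;           // B.CreateAnd + B.CreateICmpNE
}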
- ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, - GlobalValue::PrivateLinkage, "bits_use", - ByteArray, &M); + ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage, + "bits_use", ByteArray, &M); } - Value *ByteAddr = B.CreateGEP(Ty, ByteArray, BitOffset); + Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset); Value *Byte = B.CreateLoad(ByteAddr); - Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask); + Value *ByteAndMask = + B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty)); return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0)); } } +static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL, + Value *V, uint64_t COffset) { + if (auto GV = dyn_cast<GlobalObject>(V)) { + SmallVector<MDNode *, 2> Types; + GV->getMetadata(LLVMContext::MD_type, Types); + for (MDNode *Type : Types) { + if (Type->getOperand(1) != TypeId) + continue; + uint64_t Offset = + cast<ConstantInt>( + cast<ConstantAsMetadata>(Type->getOperand(0))->getValue()) + ->getZExtValue(); + if (COffset == Offset) + return true; + } + return false; + } + + if (auto GEP = dyn_cast<GEPOperator>(V)) { + APInt APOffset(DL.getPointerSizeInBits(0), 0); + bool Result = GEP->accumulateConstantOffset(DL, APOffset); + if (!Result) + return false; + COffset += APOffset.getZExtValue(); + return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset); + } + + if (auto Op = dyn_cast<Operator>(V)) { + if (Op->getOpcode() == Instruction::BitCast) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset); + + if (Op->getOpcode() == Instruction::Select) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) && + isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset); + } + + return false; +} + /// Lower a llvm.type.test call to its implementation. Returns the value to /// replace the call with. -Value *LowerTypeTestsModule::lowerBitSetCall( - CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobalIntAddr, - const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { +Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL) { + if (TIL.TheKind == TypeTestResolution::Unsat) + return ConstantInt::getFalse(M.getContext()); + Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M.getDataLayout(); - - if (BSI.containsValue(DL, GlobalLayout, Ptr)) + if (isKnownTypeIdMember(TypeId, DL, Ptr, 0)) return ConstantInt::getTrue(M.getContext()); - Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); - BasicBlock *InitialBB = CI->getParent(); IRBuilder<> B(CI); Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy); - if (BSI.isSingleOffset()) + Constant *OffsetedGlobalAsInt = + ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy); + if (TIL.TheKind == TypeTestResolution::Single) return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt); Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt); - Value *BitOffset; - if (BSI.AlignLog2 == 0) { - BitOffset = PtrOffset; - } else { - // We need to check that the offset both falls within our range and is - // suitably aligned. We can check both properties at the same time by - // performing a right rotate by log2(alignment) followed by an integer - // comparison against the bitset size. The rotate will move the lower - // order bits that need to be zero into the higher order bits of the - // result, causing the comparison to fail if they are nonzero. 
The rotate - // also conveniently gives us a bit offset to use during the load from - // the bitset. - Value *OffsetSHR = - B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2)); - Value *OffsetSHL = B.CreateShl( - PtrOffset, - ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2)); - BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - } - - Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize); + // We need to check that the offset both falls within our range and is + // suitably aligned. We can check both properties at the same time by + // performing a right rotate by log2(alignment) followed by an integer + // comparison against the bitset size. The rotate will move the lower + // order bits that need to be zero into the higher order bits of the + // result, causing the comparison to fail if they are nonzero. The rotate + // also conveniently gives us a bit offset to use during the load from + // the bitset. + Value *OffsetSHR = + B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy)); + Value *OffsetSHL = B.CreateShl( + PtrOffset, ConstantExpr::getZExt( + ConstantExpr::getSub( + ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)), + TIL.AlignLog2), + IntPtrTy)); + Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); + + Constant *BitSizeConst = ConstantExpr::getZExt(TIL.Size, IntPtrTy); Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); // If the bit set is all ones, testing against it is unnecessary. - if (BSI.isAllOnes()) + if (TIL.TheKind == TypeTestResolution::AllOnes) return OffsetInRange; TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); @@ -537,7 +577,7 @@ Value *LowerTypeTestsModule::lowerBitSetCall( // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. - Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset); + Value *Bit = createBitSetTest(ThenB, TIL, BitOffset); // The value we want is 0 if we came directly from the initial block // (having failed the range or alignment checks), or the loaded bit if @@ -622,11 +662,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( void LowerTypeTestsModule::lowerTypeTestCalls( ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { - Constant *CombinedGlobalIntAddr = - ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); - DenseMap<GlobalObject *, uint64_t> GlobalObjLayout; - for (auto &P : GlobalLayout) - GlobalObjLayout[P.first->getGlobal()] = P.second; + CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy); // For each type identifier in this disjoint set... for (Metadata *TypeId : TypeIds) { @@ -640,13 +676,43 @@ void LowerTypeTestsModule::lowerTypeTestCalls( BSI.print(dbgs()); }); - ByteArrayInfo *BAI = nullptr; + TypeIdLowering TIL; + TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( + Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), + TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2); + if (BSI.isAllOnes()) { + TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single + : TypeTestResolution::AllOnes; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + } else if (BSI.BitSize <= 64) { + TIL.TheKind = TypeTestResolution::Inline; + TIL.SizeBitWidth = (BSI.BitSize <= 32) ? 
5 : 6; + TIL.Size = ConstantInt::get(Int8Ty, BSI.BitSize); + uint64_t InlineBits = 0; + for (auto Bit : BSI.Bits) + InlineBits |= uint64_t(1) << Bit; + if (InlineBits == 0) + TIL.TheKind = TypeTestResolution::Unsat; + else + TIL.InlineBits = ConstantInt::get( + (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); + } else { + TIL.TheKind = TypeTestResolution::ByteArray; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + ++NumByteArraysCreated; + ByteArrayInfo *BAI = createByteArray(BSI); + TIL.TheByteArray = BAI->ByteArray; + TIL.BitMask = BAI->MaskGlobal; + } // Lower each call to llvm.type.test for this type identifier. for (CallInst *CI : TypeTestCallSites[TypeId]) { ++NumTypeTestCallsLowered; - Value *Lowered = - lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalObjLayout); + Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); CI->replaceAllUsesWith(Lowered); CI->eraseFromParent(); } @@ -1080,6 +1146,22 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( /// Lower all type tests in this module. LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. + if (!ClSummaryAction.empty()) { + OwnedSummary = make_unique<ModuleSummaryIndex>(); + if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> *OwnedSummary; + ExitOnErr(errorCodeToError(In.error())); + } + } + Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); Arch = TargetTriple.getArch(); @@ -1087,6 +1169,20 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { ObjectFormat = TargetTriple.getObjectFormat(); } +LowerTypeTestsModule::~LowerTypeTestsModule() { + if (ClSummaryAction.empty() || ClWriteSummary.empty()) + return; + + ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << *OwnedSummary; +} + bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index f863d192fc2f..b29ed3c87451 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1637,6 +1637,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::cos: + case Intrinsic::amdgcn_cos: { + Value *SrcSrc; + Value *Src = II->getArgOperand(0); + if (match(Src, m_FNeg(m_Value(SrcSrc))) || + match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) { + // cos(-x) -> cos(x) + // cos(fabs(x)) -> cos(x) + II->setArgOperand(0, SrcSrc); + return II; + } + + break; + } case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. 
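The new cos combine above relies only on cosine being an even function; a minimal standalone illustration of the identity the fold assumes (a hypothetical test snippet, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      // cos(-x) == cos(x) and cos(fabs(x)) == cos(x), so dropping the fneg or
      // fabs feeding the intrinsic does not change the computed value.
      for (double X : {0.0, 0.5, -1.25, 3.14159}) {
        std::printf("cos(x)=%f cos(-x)=%f cos(fabs(x))=%f\n", std::cos(X),
                    std::cos(-X), std::cos(std::fabs(X)));
      }
      return 0;
    }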
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6a7cb0e45c63..1d5528398776 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -514,7 +514,8 @@ struct AddressSanitizer : public FunctionPass { void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); - void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr, + void instrumentUnusualSizeOrAlignment(Instruction *I, + Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); @@ -1056,20 +1057,18 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, return nullptr; *IsWrite = false; } - // Only instrument if the mask is constant for now. - if (isa<ConstantVector>(CI->getOperand(2 + OpOffset))) { - auto BasePtr = CI->getOperand(0 + OpOffset); - auto Ty = cast<PointerType>(BasePtr->getType())->getElementType(); - *TypeSize = DL.getTypeStoreSizeInBits(Ty); - if (auto AlignmentConstant = - dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) - *Alignment = (unsigned)AlignmentConstant->getZExtValue(); - else - *Alignment = 1; // No alignment guarantees. We probably got Undef - if (MaybeMask) - *MaybeMask = CI->getOperand(2 + OpOffset); - PtrOperand = BasePtr; - } + + auto BasePtr = CI->getOperand(0 + OpOffset); + auto Ty = cast<PointerType>(BasePtr->getType())->getElementType(); + *TypeSize = DL.getTypeStoreSizeInBits(Ty); + if (auto AlignmentConstant = + dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) + *Alignment = (unsigned)AlignmentConstant->getZExtValue(); + else + *Alignment = 1; // No alignment guarantees. We probably got Undef + if (MaybeMask) + *MaybeMask = CI->getOperand(2 + OpOffset); + PtrOperand = BasePtr; } } @@ -1130,24 +1129,25 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction( } static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I, - Value *Addr, unsigned Alignment, - unsigned Granularity, uint32_t TypeSize, - bool IsWrite, Value *SizeArgument, - bool UseCalls, uint32_t Exp) { + Instruction *InsertBefore, Value *Addr, + unsigned Alignment, unsigned Granularity, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check // if the data is properly aligned. 
if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) && (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) - return Pass->instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); - Pass->instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite, + nullptr, UseCalls, Exp); + Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize, + IsWrite, nullptr, UseCalls, Exp); } static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, const DataLayout &DL, Type *IntptrTy, - ConstantVector *Mask, Instruction *I, + Value *Mask, Instruction *I, Value *Addr, unsigned Alignment, unsigned Granularity, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, @@ -1157,15 +1157,30 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, unsigned Num = VTy->getVectorNumElements(); auto Zero = ConstantInt::get(IntptrTy, 0); for (unsigned Idx = 0; Idx < Num; ++Idx) { - // dyn_cast as we might get UndefValue - auto Masked = dyn_cast<ConstantInt>(Mask->getOperand(Idx)); - if (Masked && Masked->isAllOnesValue()) { + Value *InstrumentedAddress = nullptr; + Instruction *InsertBefore = I; + if (auto *Vector = dyn_cast<ConstantVector>(Mask)) { + // dyn_cast as we might get UndefValue + if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) { + if (Masked->isNullValue()) + // Mask is constant false, so no instrumentation needed. + continue; + // If we have a true or undef value, fall through to doInstrumentAddress + // with InsertBefore == I + } + } else { IRBuilder<> IRB(I); - auto InstrumentedAddress = - IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); - doInstrumentAddress(Pass, I, InstrumentedAddress, Alignment, Granularity, - ElemTypeSize, IsWrite, SizeArgument, UseCalls, Exp); + Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); + TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + InsertBefore = ThenTerm; } + + IRBuilder<> IRB(InsertBefore); + InstrumentedAddress = + IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); + doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment, + Granularity, ElemTypeSize, IsWrite, SizeArgument, + UseCalls, Exp); } } @@ -1220,12 +1235,11 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, unsigned Granularity = 1 << Mapping.Scale; if (MaybeMask) { - auto Mask = cast<ConstantVector>(MaybeMask); - instrumentMaskedLoadOrStore(this, DL, IntptrTy, Mask, I, Addr, Alignment, - Granularity, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr, + Alignment, Granularity, TypeSize, IsWrite, + nullptr, UseCalls, Exp); } else { - doInstrumentAddress(this, I, Addr, Alignment, Granularity, TypeSize, + doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize, IsWrite, nullptr, UseCalls, Exp); } } @@ -1342,9 +1356,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able // to report the actual access size. 
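Conceptually, the per-lane handling added to instrumentMaskedLoadOrStore above behaves like the scalar loop in this sketch: lanes whose mask bit is a constant false are skipped entirely, and every other lane is checked before the access (helper names are made up and do not reflect ASan's real runtime interface):

    #include <cstddef>
    #include <cstdio>

    // Stand-in for the shadow-memory check the pass emits (illustrative only).
    static void checkShadow(const void *Addr, std::size_t Size) {
      std::printf("asan check %p, %zu bytes\n", Addr, Size);
    }

    // Scalar emulation of an instrumented masked store of N lanes.
    template <typename T, unsigned N>
    static void maskedStoreChecked(T *Base, const bool (&Mask)[N],
                                   const T (&Vals)[N]) {
      for (unsigned Idx = 0; Idx < N; ++Idx) {
        if (!Mask[Idx])
          continue; // disabled lane: no memory access, so no check
        checkShadow(&Base[Idx], sizeof(T));
        Base[Idx] = Vals[Idx];
      }
    }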
void AddressSanitizer::instrumentUnusualSizeOrAlignment( - Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument, bool UseCalls, uint32_t Exp) { - IRBuilder<> IRB(I); + Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) { + IRBuilder<> IRB(InsertBefore); Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); if (UseCalls) { @@ -1358,8 +1372,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( Value *LastByte = IRB.CreateIntToPtr( IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), Addr->getType()); - instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp); - instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp); } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 9485bfd7c296..0137378b828b 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1572,6 +1572,13 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Assign value numbers to the new instructions. for (Instruction *I : NewInsts) { + // Instructions that have been inserted in predecessor(s) to materialize + // the load address do not retain their original debug locations. Doing + // so could lead to confusing (but correct) source attributions. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? + I->setDebugLoc(DebugLoc()); + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be @@ -1601,8 +1608,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) NewLoad->setMetadata(LLVMContext::MD_range, RangeMD); - // Transfer DebugLoc. - NewLoad->setDebugLoc(LI->getDebugLoc()); + // We do not propagate the old load's debug location, because the new + // load now lives in a different BB, and we want to avoid a jumpy line + // table. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? // Add the newly created load. ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 1cc5c8f0da84..6ef9d0561322 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -408,6 +408,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurAST->deleteValue(&I); I.eraseFromParent(); } + Changed = true; continue; } @@ -766,6 +767,14 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Do not retain debug locations when we are moving instructions to different + // basic blocks, because we want to avoid jumpy line tables. Calls, however, + // need to retain their debug locs because they may be inlined. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? 
+ if (!isa<CallInst>(I)) + I.setDebugLoc(DebugLoc()); + if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) @@ -911,14 +920,23 @@ bool llvm::promoteLoopAccessesToScalars( // // If at least one store is guaranteed to execute, both properties are // satisfied, and promotion is legal. + // // This, however, is not a necessary condition. Even if no store/load is - // guaranteed to execute, we can still establish these properties: - // (p1) by proving that hoisting the load into the preheader is - // safe (i.e. proving dereferenceability on all paths through the loop). We + // guaranteed to execute, we can still establish these properties. + // We can establish (p1) by proving that hoisting the load into the preheader + // is safe (i.e. proving dereferenceability on all paths through the loop). We // can use any access within the alias set to prove dereferenceability, // since they're all must alias. - // (p2) by proving the memory is thread-local, so the memory model + // + // There are two ways to establish (p2): + // a) Prove the location is thread-local. In this case the memory model // requirement does not apply, and stores are safe to insert. + // b) Prove a store dominates every exit block. In this case, if an exit + // block is reached, the original dynamic path would have taken us through + // the store, so inserting a store into the exit block is safe. Note that this + // is different from the store being guaranteed to execute. For instance, + // if an exception is thrown on the first iteration of the loop, the original + // store is never executed, but the exit blocks are not executed either. bool DereferenceableInPH = false; bool SafeToInsertStore = false; @@ -1000,6 +1018,17 @@ bool llvm::promoteLoopAccessesToScalars( } } + // If a store dominates all exit blocks, it is safe to sink. + // As explained above, if an exit block was executed, a dominating + // store must have been executed at least once, so we are not + // introducing stores on paths that did not have them. + // Note that this only looks at explicit exit blocks. If we ever + // start sinking stores into unwind edges (see above), this will break. + if (!SafeToInsertStore) + SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) { + return DT->dominates(Store->getParent(), Exit); + }); + // If the store is not guaranteed to execute, we may still get // deref info through it. if (!DereferenceableInPH) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index fd167db11789..2743574ecca6 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -997,7 +997,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero, the control yields to /// the loop entry. If the branch matches the behavior, the variable involved -/// in the comparion is returned. This function will be called to see if the +/// in the comparison is returned. This function will be called to see if the /// precondition and postcondition of the loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { if (!BI || !BI->isConditional()) diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index 90309d7ebba6..f64354497771 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -283,8 +283,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // sinked. for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { Instruction *I = &*II++; - if (!L.hasLoopInvariantOperands(I) || - !canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) + if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) continue; if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) Changed = true; diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 440e36767edf..678d02e05d42 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -56,12 +56,9 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( if (!isPerformingImport() && !isModuleExporting()) return false; - // If we are exporting, we need to see whether this value is marked - // as NoRename in the summary. If we are importing, we may not have - // a summary in the distributed backend case (only summaries for values - // importes as defs, not references, are included in the index passed - // to the distributed backends). if (isPerformingImport()) { + assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) && + "Attempting to promote non-renamable local"); // We don't know for sure yet if we are importing this value (as either // a reference or a def), since we are simply walking all values in the // module. But by necessity if we end up importing it and it is local, @@ -77,13 +74,28 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( assert(Summaries->second.size() == 1 && "Local has more than one summary"); auto Linkage = Summaries->second.front()->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { - assert(!Summaries->second.front()->noRename()); + assert(!isNonRenamableLocal(*SGV) && + "Attempting to promote non-renamable local"); return true; } return false; } +#ifndef NDEBUG +bool FunctionImportGlobalProcessing::isNonRenamableLocal( + const GlobalValue &GV) const { + if (!GV.hasLocalLinkage()) + return false; + // This needs to stay in sync with the logic in buildModuleSummaryIndex. + if (GV.hasSection()) + return true; + if (Used.count(const_cast<GlobalValue *>(&GV))) + return true; + return false; +} +#endif + std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, bool DoPromote) { // For locals that must be promoted to global scope, ensure that diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8cde0c4cd607..31daba2248aa 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6785,22 +6785,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Check whether the address computation for a non-consecutive memory -/// access looks like an unlikely candidate for being merged into the indexing -/// mode. +/// \brief Gets Address Access SCEV after verifying that the access pattern +/// is loop invariant except the induction variable dependence. 
/// - /// We look for a GEP which has one index that is an induction variable and all /// other indices are loop invariant. If the stride of this access is also /// within a small bound we decide that this address computation can likely be /// merged into the addressing mode. /// In all other cases, we identify the address computation as complex. -static bool isLikelyComplexAddressComputation(Value *Ptr, - LoopVectorizationLegality *Legal, - ScalarEvolution *SE, - const Loop *TheLoop) { +/// This SCEV can be sent to the Target in order to estimate the address +/// calculation cost. +static const SCEV *getAddressAccessSCEV( + Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); if (!Gep) - return true; + return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. @@ -6809,33 +6806,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, Value *Opd = Gep->getOperand(i); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) - return true; + return nullptr; } - // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step - // can likely be merged into the address computation. - unsigned MaxMergeDistance = 64; - - const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); - if (!AddRec) - return true; - - // Check the step is constant. - const SCEV *Step = AddRec->getStepRecurrence(*SE); - // Calculate the pointer stride and check if it is consecutive. - const auto *C = dyn_cast<SCEVConstant>(Step); - if (!C) - return true; - - const APInt &APStepVal = C->getAPInt(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return true; - - int64_t StepVal = APStepVal.getSExtValue(); - - return StepVal > MaxMergeDistance; + // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV. + return SE->getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -7063,12 +7038,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned Cost = 0; Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - // True if the memory instruction's address computation is complex. - bool IsComplexComputation = - isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); + // Figure out whether the access is strided and get the stride value + // if it's known at compile time. const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); + Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS);
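With this change the target, rather than the vectorizer, decides when an address computation is cheap. A sketch of how a TTI implementation could consume the SCEV now passed to getAddressComputationCost, reproducing the small-constant-stride heuristic removed above (the helper name and the cost values are made up for illustration):

    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/ScalarEvolutionExpressions.h"
    using namespace llvm;

    // Hypothetical helper: cheap if the pointer SCEV is an add recurrence with
    // a small constant step, expensive otherwise.
    static unsigned addressCostFromSCEV(const SCEV *PtrSCEV, ScalarEvolution &SE) {
      const int64_t MaxMergeDistance = 64;
      const auto *AddRec = dyn_cast_or_null<SCEVAddRecExpr>(PtrSCEV);
      if (!AddRec)
        return 10; // assumed "complex address" cost
      const auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
      if (!Step || Step->getAPInt().getBitWidth() > 64)
        return 10;
      // A small step can likely be folded into the addressing mode.
      return Step->getAPInt().getSExtValue() <= MaxMergeDistance ? 1 : 10;
    }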