Diffstat (limited to 'lib'): 72 files changed, 1941 insertions, 395 deletions
diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp index daee93267f56d..94306d0f54adc 100644 --- a/lib/Analysis/GlobalsModRef.cpp +++ b/lib/Analysis/GlobalsModRef.cpp @@ -502,6 +502,8 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { } FunctionInfo &FI = FunctionInfos[F]; + Handles.emplace_front(*this, F); + Handles.front().I = Handles.begin(); bool KnowNothing = false; // Collect the mod/ref properties due to called functions. We only compute diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 6e9368c49d659..09605f61fa935 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -153,9 +153,14 @@ public: if (IsCall != Other.IsCall) return false; - if (IsCall) - return CS.getCalledValue() == Other.CS.getCalledValue(); - return Loc == Other.Loc; + if (!IsCall) + return Loc == Other.Loc; + + if (CS.getCalledValue() != Other.CS.getCalledValue()) + return false; + + return CS.arg_size() == Other.CS.arg_size() && + std::equal(CS.arg_begin(), CS.arg_end(), Other.CS.arg_begin()); } private: @@ -179,12 +184,18 @@ template <> struct DenseMapInfo<MemoryLocOrCall> { } static unsigned getHashValue(const MemoryLocOrCall &MLOC) { - if (MLOC.IsCall) - return hash_combine(MLOC.IsCall, - DenseMapInfo<const Value *>::getHashValue( - MLOC.getCS().getCalledValue())); - return hash_combine( - MLOC.IsCall, DenseMapInfo<MemoryLocation>::getHashValue(MLOC.getLoc())); + if (!MLOC.IsCall) + return hash_combine( + MLOC.IsCall, + DenseMapInfo<MemoryLocation>::getHashValue(MLOC.getLoc())); + + hash_code hash = + hash_combine(MLOC.IsCall, DenseMapInfo<const Value *>::getHashValue( + MLOC.getCS().getCalledValue())); + + for (const Value *Arg : MLOC.getCS().args()) + hash = hash_combine(hash, DenseMapInfo<const Value *>::getHashValue(Arg)); + return hash; } static bool isEqual(const MemoryLocOrCall &LHS, const MemoryLocOrCall &RHS) { diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index a22ce0dab9c24..d8ce90e63a9d0 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1714,20 +1714,25 @@ bool IfConverter::IfConvertDiamondCommon( } // Remove the duplicated instructions at the beginnings of both paths. - // Skip dbg_value instructions + // Skip dbg_value instructions. MachineBasicBlock::iterator DI1 = MBB1.getFirstNonDebugInstr(); MachineBasicBlock::iterator DI2 = MBB2.getFirstNonDebugInstr(); BBI1->NonPredSize -= NumDups1; BBI2->NonPredSize -= NumDups1; // Skip past the dups on each side separately since there may be - // differing dbg_value entries. + // differing dbg_value entries. NumDups1 can include a "return" + // instruction, if it's not marked as "branch". for (unsigned i = 0; i < NumDups1; ++DI1) { + if (DI1 == MBB1.end()) + break; if (!DI1->isDebugValue()) ++i; } while (NumDups1 != 0) { ++DI2; + if (DI2 == MBB2.end()) + break; if (!DI2->isDebugValue()) --NumDups1; } @@ -1738,11 +1743,16 @@ bool IfConverter::IfConvertDiamondCommon( Redefs.stepForward(MI, Dummy); } } + BBI.BB->splice(BBI.BB->end(), &MBB1, MBB1.begin(), DI1); MBB2.erase(MBB2.begin(), DI2); - // The branches have been checked to match, so it is safe to remove the branch - // in BB1 and rely on the copy in BB2 + // The branches have been checked to match, so it is safe to remove the + // branch in BB1 and rely on the copy in BB2. The complication is that + // the blocks may end with a return instruction, which may or may not + // be marked as "branch". 
If it's not, then it could be included in + // "dups1", leaving the blocks potentially empty after moving the common + // duplicates. #ifndef NDEBUG // Unanalyzable branches must match exactly. Check that now. if (!BBI1->IsBrAnalyzable) @@ -1768,11 +1778,14 @@ bool IfConverter::IfConvertDiamondCommon( if (RemoveBranch) BBI2->NonPredSize -= TII->removeBranch(*BBI2->BB); else { - do { - assert(DI2 != MBB2.begin()); - DI2--; - } while (DI2->isBranch() || DI2->isDebugValue()); - DI2++; + // Make DI2 point to the end of the range where the common "tail" + // instructions could be found. + while (DI2 != MBB2.begin()) { + MachineBasicBlock::iterator Prev = std::prev(DI2); + if (!Prev->isBranch() && !Prev->isDebugValue()) + break; + DI2 = Prev; + } } while (NumDups2 != 0) { // NumDups2 only counted non-dbg_value instructions, so this won't @@ -1833,11 +1846,15 @@ bool IfConverter::IfConvertDiamondCommon( // a non-predicated in BBI2, then we don't want to predicate the one from // BBI2. The reason is that if we merged these blocks, we would end up with // two predicated terminators in the same block. + // Also, if the branches in MBB1 and MBB2 were non-analyzable, then don't + // predicate them either. They were checked to be identical, and so the + // same branch would happen regardless of which path was taken. if (!MBB2.empty() && (DI2 == MBB2.end())) { MachineBasicBlock::iterator BBI1T = MBB1.getFirstTerminator(); MachineBasicBlock::iterator BBI2T = MBB2.getFirstTerminator(); - if (BBI1T != MBB1.end() && TII->isPredicated(*BBI1T) && - BBI2T != MBB2.end() && !TII->isPredicated(*BBI2T)) + bool BB1Predicated = BBI1T != MBB1.end() && TII->isPredicated(*BBI1T); + bool BB2NonPredicated = BBI2T != MBB2.end() && !TII->isPredicated(*BBI2T); + if (BB2NonPredicated && (BB1Predicated || !BBI2->IsBrAnalyzable)) --DI2; } diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 75e3d35169cfb..4ffcffcea693c 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -514,6 +514,39 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { return false; } + // Detect invalid DBG_VALUE instructions, with a debug-use of a virtual + // register that hasn't been defined yet. If we do not remove those here, then + // the re-insertion of the DBG_VALUE instruction after register allocation + // will be incorrect. + // TODO: If earlier passes are corrected to generate sane debug information + // (and if the machine verifier is improved to catch this), then these checks + // could be removed or replaced by asserts. + bool Discard = false; + if (MI.getOperand(0).isReg() && + TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) { + const unsigned Reg = MI.getOperand(0).getReg(); + if (!LIS->hasInterval(Reg)) { + // The DBG_VALUE is described by a virtual register that does not have a + // live interval. Discard the DBG_VALUE. + Discard = true; + DEBUG(dbgs() << "Discarding debug info (no LIS interval): " + << Idx << " " << MI); + } else { + // The DBG_VALUE is only valid if either Reg is live out from Idx, or Reg + // is defined dead at Idx (where Idx is the slot index for the instruction + // preceeding the DBG_VALUE). + const LiveInterval &LI = LIS->getInterval(Reg); + LiveQueryResult LRQ = LI.Query(Idx); + if (!LRQ.valueOutOrDead()) { + // We have found a DBG_VALUE with the value in a virtual register that + // is not live. Discard the DBG_VALUE. 
+ Discard = true; + DEBUG(dbgs() << "Discarding debug info (reg not live): " + << Idx << " " << MI); + } + } + } + // Get or create the UserValue for (variable,offset) here. bool IsIndirect = MI.getOperand(1).isImm(); if (IsIndirect) @@ -522,7 +555,13 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { const DIExpression *Expr = MI.getDebugExpression(); UserValue *UV = getUserValue(Var, Expr, MI.getDebugLoc()); - UV->addDef(Idx, MI.getOperand(0), IsIndirect); + if (!Discard) + UV->addDef(Idx, MI.getOperand(0), IsIndirect); + else { + MachineOperand MO = MachineOperand::CreateReg(0U, false); + MO.setIsDebug(); + UV->addDef(Idx, MO, false); + } return true; } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 209abf34d885b..cd67449e3acfa 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -646,6 +646,14 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, removeSuccessor(OldI); } +void MachineBasicBlock::copySuccessor(MachineBasicBlock *Orig, + succ_iterator I) { + if (Orig->Probs.empty()) + addSuccessor(*I, Orig->getSuccProbability(I)); + else + addSuccessorWithoutProb(*I); +} + void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) { Predecessors.push_back(Pred); } diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 84c808ee79384..167135b56ec08 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -513,6 +513,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + bool allowTailDupPlacement() const { + assert(F); + return TailDupPlacement && !F->getTarget().requiresStructuredCFG(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineBlockFrequencyInfo>(); @@ -1018,7 +1023,7 @@ MachineBlockPlacement::getBestTrellisSuccessor( MachineBasicBlock *Succ1 = BestA.Dest; MachineBasicBlock *Succ2 = BestB.Dest; // Check to see if tail-duplication would be profitable. - if (TailDupPlacement && shouldTailDuplicate(Succ2) && + if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) && canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), Chain, BlockFilter)) { @@ -1044,7 +1049,7 @@ MachineBlockPlacement::getBestTrellisSuccessor( return Result; } -/// When the option TailDupPlacement is on, this method checks if the +/// When the option allowTailDupPlacement() is on, this method checks if the /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated /// into all of its unplaced, unfiltered predecessors, that are not BB. bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( @@ -1493,7 +1498,7 @@ MachineBlockPlacement::selectBestSuccessor( if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, Chain, BlockFilter)) { // If tail duplication would make Succ profitable, place it. 
- if (TailDupPlacement && shouldTailDuplicate(Succ)) + if (allowTailDupPlacement() && shouldTailDuplicate(Succ)) DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); continue; } @@ -1702,7 +1707,7 @@ void MachineBlockPlacement::buildChain( auto Result = selectBestSuccessor(BB, Chain, BlockFilter); MachineBasicBlock* BestSucc = Result.BB; bool ShouldTailDup = Result.ShouldTailDup; - if (TailDupPlacement) + if (allowTailDupPlacement()) ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc)); // If an immediate successor isn't available, look for the best viable @@ -1724,7 +1729,7 @@ void MachineBlockPlacement::buildChain( // Placement may have changed tail duplication opportunities. // Check for that now. - if (TailDupPlacement && BestSucc && ShouldTailDup) { + if (allowTailDupPlacement() && BestSucc && ShouldTailDup) { // If the chosen successor was duplicated into all its predecessors, // don't bother laying it out, just go round the loop again with BB as // the chain end. @@ -2758,7 +2763,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { TailDupSize = TailDupPlacementAggressiveThreshold; } - if (TailDupPlacement) { + if (allowTailDupPlacement()) { MPDT = &getAnalysis<MachinePostDominatorTree>(); if (MF.getFunction().optForSize()) TailDupSize = 1; diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 11acbe687a312..1090550243f87 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -1882,6 +1882,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() { return ValueTrackerResult(); // Otherwise, we want the whole source. const MachineOperand &Src = Def->getOperand(1); + if (Src.isUndef()) + return ValueTrackerResult(); return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } @@ -1925,6 +1927,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { } const MachineOperand &Src = Def->getOperand(SrcIdx); + if (Src.isUndef()) + return ValueTrackerResult(); return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } @@ -2093,6 +2097,10 @@ ValueTrackerResult ValueTracker::getNextSourceFromPHI() { for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) { auto &MO = Def->getOperand(i); assert(MO.isReg() && "Invalid PHI instruction"); + // We have no code to deal with undef operands. They shouldn't happen in + // normal programs anyway. + if (MO.isUndef()) + return ValueTrackerResult(); Res.addSource(MO.getReg(), MO.getSubReg()); } @@ -2149,9 +2157,14 @@ ValueTrackerResult ValueTracker::getNextSource() { // If we can still move up in the use-def chain, move to the next // definition. 
if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) { - Def = MRI.getVRegDef(Reg); - DefIdx = MRI.def_begin(Reg).getOperandNo(); - DefSubReg = Res.getSrcSubReg(0); + MachineRegisterInfo::def_iterator DI = MRI.def_begin(Reg); + if (DI != MRI.def_end()) { + Def = DI->getParent(); + DefIdx = DI.getOperandNo(); + DefSubReg = Res.getSrcSubReg(0); + } else { + Def = nullptr; + } return Res; } } diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index db925f803db61..bd90ed5b55b89 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -1151,6 +1151,8 @@ bool TargetInstrInfo::getRegSequenceInputs( for (unsigned OpIdx = 1, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx; OpIdx += 2) { const MachineOperand &MOReg = MI.getOperand(OpIdx); + if (MOReg.isUndef()) + continue; const MachineOperand &MOSubIdx = MI.getOperand(OpIdx + 1); assert(MOSubIdx.isImm() && "One of the subindex of the reg_sequence is not an immediate"); @@ -1174,6 +1176,8 @@ bool TargetInstrInfo::getExtractSubregInputs( // Def = EXTRACT_SUBREG v0.sub1, sub0. assert(DefIdx == 0 && "EXTRACT_SUBREG only has one def"); const MachineOperand &MOReg = MI.getOperand(1); + if (MOReg.isUndef()) + return false; const MachineOperand &MOSubIdx = MI.getOperand(2); assert(MOSubIdx.isImm() && "The subindex of the extract_subreg is not an immediate"); @@ -1198,6 +1202,8 @@ bool TargetInstrInfo::getInsertSubregInputs( assert(DefIdx == 0 && "INSERT_SUBREG only has one def"); const MachineOperand &MOBaseReg = MI.getOperand(1); const MachineOperand &MOInsertedReg = MI.getOperand(2); + if (MOInsertedReg.isUndef()) + return false; const MachineOperand &MOSubIdx = MI.getOperand(3); assert(MOSubIdx.isImm() && "One of the subindex of the reg_sequence is not an immediate"); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index c0047d0cde6a9..2c57eee191db6 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1422,7 +1422,8 @@ RuntimeDyldELF::processRelocationRef( SectionEntry &Section = Sections[SectionID]; uint8_t *Target = Section.getAddressWithOffset(Offset); bool RangeOverflow = false; - if (!Value.SymbolName && SymType != SymbolRef::ST_Unknown) { + bool IsExtern = Value.SymbolName || SymType == SymbolRef::ST_Unknown; + if (!IsExtern) { if (AbiVariant != 2) { // In the ELFv1 ABI, a function call may point to the .opd entry, // so the final symbol value is calculated based on the relocation @@ -1432,21 +1433,24 @@ RuntimeDyldELF::processRelocationRef( } else { // In the ELFv2 ABI, a function symbol may provide a local entry // point, which must be used for direct calls. 
- uint8_t SymOther = Symbol->getOther(); - Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther); + if (Value.SectionID == SectionID){ + uint8_t SymOther = Symbol->getOther(); + Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther); + } } uint8_t *RelocTarget = Sections[Value.SectionID].getAddressWithOffset(Value.Addend); int64_t delta = static_cast<int64_t>(Target - RelocTarget); // If it is within 26-bits branch range, just set the branch target - if (SignExtend64<26>(delta) == delta) { + if (SignExtend64<26>(delta) != delta) { + RangeOverflow = true; + } else if ((AbiVariant != 2) || + (AbiVariant == 2 && Value.SectionID == SectionID)) { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); addRelocationForSection(RE, Value.SectionID); - } else { - RangeOverflow = true; } } - if (Value.SymbolName || SymType == SymbolRef::ST_Unknown || + if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID) || RangeOverflow) { // It is an external symbol (either Value.SymbolName is set, or // SymType is SymbolRef::ST_Unknown) or out of range. @@ -1503,10 +1507,10 @@ RuntimeDyldELF::processRelocationRef( RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } - if (Value.SymbolName || SymType == SymbolRef::ST_Unknown) { + if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID)) { // Restore the TOC for external calls if (AbiVariant == 2) - writeInt32BE(Target + 4, 0xE8410018); // ld r2,28(r1) + writeInt32BE(Target + 4, 0xE8410018); // ld r2,24(r1) else writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1) } diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index d3c33edec1867..743e3710fd682 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -359,11 +359,9 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) { return wrap(&unwrap(Ty)->getContext()); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void LLVMDumpType(LLVMTypeRef Ty) { - return unwrap(Ty)->dump(); +void LLVMDumpType(LLVMTypeRef Ty) { + return unwrap(Ty)->print(errs(), /*IsForDebug=*/true); } -#endif char *LLVMPrintTypeToString(LLVMTypeRef Ty) { std::string buf; @@ -658,7 +656,7 @@ void LLVMSetValueName(LLVMValueRef Val, const char *Name) { unwrap(Val)->setName(Name); } -LLVM_DUMP_METHOD void LLVMDumpValue(LLVMValueRef Val) { +void LLVMDumpValue(LLVMValueRef Val) { unwrap(Val)->print(errs(), /*IsForDebug=*/true); } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index a6b5c43f1d2a6..328f000f37c9d 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -289,6 +289,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { case Triple::mips64el: FDECFIEncoding = dwarf::DW_EH_PE_sdata8; break; + case Triple::ppc64: + case Triple::ppc64le: case Triple::x86_64: FDECFIEncoding = dwarf::DW_EH_PE_pcrel | (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4); diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index d968688911eb9..6439d16a2a3fa 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -13,8 +13,13 @@ elseif( CMAKE_HOST_UNIX ) if( HAVE_LIBDL ) set(system_libs ${system_libs} ${CMAKE_DL_LIBS}) endif() - if( HAVE_BACKTRACE ) - set(system_libs ${system_libs} ${Backtrace_LIBRARIES}) + if( HAVE_BACKTRACE AND NOT "${Backtrace_LIBRARIES}" STREQUAL "" ) + # On BSDs, CMake returns a fully qualified path to the backtrace library. + # We need to remove the path and the 'lib' prefix, to make it look like a + # regular short library name, suitable for appending to a -l link flag. 
+ get_filename_component(Backtrace_LIBFILE ${Backtrace_LIBRARIES} NAME_WE) + STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE}) + set(system_libs ${system_libs} ${Backtrace_LIBFILE}) endif() if(LLVM_ENABLE_TERMINFO) if(HAVE_TERMINFO) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 3dc67ad782af5..6e65b5e6c8075 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -1009,7 +1009,7 @@ StringRef sys::getHostCPUName() { #include "llvm/Support/X86TargetParser.def" // Now check types. -#define X86_CPU_SUBTYPE(ARCHNAME, ENUM) \ +#define X86_CPU_TYPE(ARCHNAME, ENUM) \ if (Type == X86::ENUM) \ return ARCHNAME; #include "llvm/Support/X86TargetParser.def" diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 2ff2ee347f560..6704fa27c86ee 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -299,6 +299,11 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, printOffset(MO.getOffset(), O); break; } + case MachineOperand::MO_BlockAddress: { + MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress()); + Sym->print(O, MAI); + break; + } } } diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index d1ddb2e3ef703..0d00dab598d57 100644 --- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -46,6 +46,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <iterator> @@ -60,6 +61,8 @@ STATISTIC(NumCollisionsAvoided, "Number of HW prefetch tag collisions avoided"); STATISTIC(NumCollisionsNotAvoided, "Number of HW prefetch tag collisions not avoided due to lack of regsiters"); +DEBUG_COUNTER(FixCounter, "falkor-hwpf", + "Controls which tag collisions are avoided"); namespace { @@ -729,6 +732,21 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { bool Fixed = false; DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + if (!DebugCounter::shouldExecute(FixCounter)) { + DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI); + continue; + } + + // Add the non-base registers of MI as live so we don't use them as + // scratch registers. + for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) { + if (OpI == static_cast<unsigned>(LdI.BaseRegIdx)) + continue; + MachineOperand &MO = MI.getOperand(OpI); + if (MO.isReg() && MO.readsReg()) + LR.addReg(MO.getReg()); + } + for (unsigned ScratchReg : AArch64::GPR64RegClass) { if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) continue; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index d66f7b59a4b52..789200b28445e 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -917,6 +917,8 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); bool isFixed = MFI.isFixedObjectIndex(FI); + bool isCSR = !isFixed && MFI.getObjectOffset(FI) >= + -((int)AFI->getCalleeSavedStackSize()); // Use frame pointer to reference fixed objects. 
Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -930,6 +932,12 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, // Argument access should always use the FP. if (isFixed) { UseFP = hasFP(MF); + } else if (isCSR && RegInfo->needsStackRealignment(MF)) { + // References to the CSR area must use FP if we're re-aligning the stack + // since the dynamically-sized alignment padding is between the SP/BP and + // the CSR area. + assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); + UseFP = true; } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) && !RegInfo->needsStackRealignment(MF)) { // Use SP or FP, whichever gives us the best chance of the offset @@ -947,9 +955,9 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } } - assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) && + assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) && "In the presence of dynamic stack pointer realignment, " - "non-argument objects cannot be accessed through the frame pointer"); + "non-argument/CSR objects cannot be accessed through the frame pointer"); if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 41ed24c329ef0..233d6be247c2c 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4930,7 +4930,8 @@ bool AArch64TargetLowering::isOffsetFoldingLegal( bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f16 || VT == MVT::f64 || VT == MVT::f32)) { + if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 || + (VT == MVT::f16 && Subtarget->hasFullFP16()))) { DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n"); return true; } @@ -5066,7 +5067,7 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the -// compiler, not all of them may make sense, e.g. S may be difficult to support. +// compiler, not all of them may make sense. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 @@ -5126,6 +5127,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; + case 'S': // A symbolic address + return C_Other; } } return TargetLowering::getConstraintType(Constraint); @@ -5250,6 +5253,23 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } + case 'S': { + // An absolute symbolic address or label reference. 
+ if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { + Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), + GA->getValueType(0)); + } else if (const BlockAddressSDNode *BA = + dyn_cast<BlockAddressSDNode>(Op)) { + Result = + DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); + } else if (const ExternalSymbolSDNode *ES = + dyn_cast<ExternalSymbolSDNode>(Op)) { + Result = + DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0)); + } else + return; + break; + } case 'I': case 'J': @@ -9637,6 +9657,15 @@ static SDValue performPostLD1Combine(SDNode *N, if (LD->getOpcode() != ISD::LOAD) return SDValue(); + // The vector lane must be a constant in the LD1LANE opcode. + SDValue Lane; + if (IsLaneOp) { + Lane = N->getOperand(2); + auto *LaneC = dyn_cast<ConstantSDNode>(Lane); + if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) + return SDValue(); + } + LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if memory operand is the same type as the vector element. @@ -9693,7 +9722,7 @@ static SDValue performPostLD1Combine(SDNode *N, Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted - Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector + Ops.push_back(Lane); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 79826ca2ed8dc..040011d858e77 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -2713,7 +2713,7 @@ defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>, - Sched<[WriteF]>; + Sched<[WriteF]>, Requires<[HasFullFP16]>; def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2042dbf6d5e2a..e09263b6fac9f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -147,6 +147,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600PacketizerPass(*PR); initializeR600ExpandSpecialInstrsPassPass(*PR); initializeR600VectorRegMergerPass(*PR); + initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 6d89aa6968e9a..41ca7fe8bfaa8 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -358,6 +358,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); setOperationAction(ISD::CTLZ, MVT::i16, Promote); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTPOP, MVT::i16, Promote); setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 9740a18b72488..8c02e8da8d793 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -726,6 +726,10 
@@ def : GCNPat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : GCNPat < + (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 8c1727724a9e3..cff24a10bb5f8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4864,12 +4864,14 @@ bool ARMBaseInstrInfo::getRegSequenceLikeInputs( // Populate the InputRegs accordingly. // rY const MachineOperand *MOReg = &MI.getOperand(1); - InputRegs.push_back( - RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_0)); + if (!MOReg->isUndef()) + InputRegs.push_back(RegSubRegPairAndIdx(MOReg->getReg(), + MOReg->getSubReg(), ARM::ssub_0)); // rZ MOReg = &MI.getOperand(2); - InputRegs.push_back( - RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_1)); + if (!MOReg->isUndef()) + InputRegs.push_back(RegSubRegPairAndIdx(MOReg->getReg(), + MOReg->getSubReg(), ARM::ssub_1)); return true; } llvm_unreachable("Target dependent opcode missing"); @@ -4888,6 +4890,8 @@ bool ARMBaseInstrInfo::getExtractSubregLikeInputs( // rX = EXTRACT_SUBREG dZ, ssub_0 // rY = EXTRACT_SUBREG dZ, ssub_1 const MachineOperand &MOReg = MI.getOperand(2); + if (MOReg.isUndef()) + return false; InputReg.Reg = MOReg.getReg(); InputReg.SubReg = MOReg.getSubReg(); InputReg.SubIdx = DefIdx == 0 ? ARM::ssub_0 : ARM::ssub_1; @@ -4907,6 +4911,8 @@ bool ARMBaseInstrInfo::getInsertSubregLikeInputs( // dX = VSETLNi32 dY, rZ, imm const MachineOperand &MOBaseReg = MI.getOperand(1); const MachineOperand &MOInsertedReg = MI.getOperand(2); + if (MOInsertedReg.isUndef()) + return false; const MachineOperand &MOIndex = MI.getOperand(3); BaseReg.Reg = MOBaseReg.getReg(); BaseReg.SubReg = MOBaseReg.getSubReg(); diff --git a/lib/Target/ARM/ARMComputeBlockSize.cpp b/lib/Target/ARM/ARMComputeBlockSize.cpp index 2e97b99b05a7d..b263e9d86c42c 100644 --- a/lib/Target/ARM/ARMComputeBlockSize.cpp +++ b/lib/Target/ARM/ARMComputeBlockSize.cpp @@ -35,6 +35,7 @@ mayOptimizeThumb2Instruction(const MachineInstr *MI) { case ARM::tBcc: // optimizeThumb2JumpTables. 
case ARM::t2BR_JT: + case ARM::tBR_JTr: return true; } return false; diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 345b081500a42..f36a4317b1b9c 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -5136,6 +5136,7 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { // It also applies for registers Rt and Rs of microMIPSr6 jalrc.hb instruction // and registers Rd and Base for microMIPS lwp instruction case Mips::JALR_HB: + case Mips::JALR_HB64: case Mips::JALRC_HB_MMR6: case Mips::JALRC_MMR6: if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 6d2f098a6b32a..3c67743947cb9 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -225,6 +225,8 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, switch (Kind) { case Mips::fixup_Mips_NONE: return ELF::R_MIPS_NONE; + case FK_Data_1: + report_fatal_error("MIPS does not support one byte relocations"); case Mips::fixup_Mips_16: case FK_Data_2: return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16; diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index 3ff3f07654d93..326897dc5c630 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -1886,6 +1886,12 @@ let AddedComplexity = 41 in { def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6; +def TAILCALLREG_MMR6 : TailCallReg<JRC16_MM, GPR32Opnd>, ISA_MICROMIPS32R6; + +def PseudoIndirectBranch_MMR6 : PseudoIndirectBranchBase<JRC16_MMR6, + GPR32Opnd>, + ISA_MICROMIPS32R6; + def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)), (TAILCALL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6; diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 64fe55e9776b6..1fef51fd69d01 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -1003,6 +1003,12 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6; +def TAILCALLREG_MM : TailCallReg<JRC16_MM, GPR32Opnd>, + ISA_MICROMIPS32_NOT_MIPS32R6; + +def PseudoIndirectBranch_MM : PseudoIndirectBranchBase<JR_MM, GPR32Opnd>, + ISA_MICROMIPS32_NOT_MIPS32R6; + let DecoderNamespace = "MicroMips" in { def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 6ceb055775387..f8e739497f4ce 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -193,6 +193,10 @@ def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", "Disable use of the jal instruction">; +def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard", + "UseIndirectJumpsHazard", + "true", "Use indirect jump" + " guards to prevent certain speculation based attacks">; //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 62f045e77fdbd..9e9e074875d0c 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -1036,3 +1036,42 @@ def : MipsPat<(select i32:$cond, immz, i32:$f), (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; } + +// Pseudo instructions +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, + hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in { + class TailCallRegR6<Instruction JumpInst, Register RT, RegisterOperand RO> : + PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, + PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)>; +} + +class PseudoIndirectBranchBaseR6<Instruction JumpInst, Register RT, + RegisterOperand RO> : + MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], + II_IndirectBranchPseudo>, + PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)> { + let isTerminator=1; + let isBarrier=1; + let hasDelaySlot = 1; + let isBranch = 1; + let isIndirectBranch = 1; + bit isCTI = 1; +} + + +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + NoIndirectJumpGuards] in { + def TAILCALLR6REG : TailCallRegR6<JALR, ZERO, GPR32Opnd>, ISA_MIPS32R6; + def PseudoIndirectBranchR6 : PseudoIndirectBranchBaseR6<JALR, ZERO, + GPR32Opnd>, + ISA_MIPS32R6; +} + +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + UseIndirectJumpsHazard] in { + def TAILCALLHBR6REG : TailCallReg<JR_HB_R6, GPR32Opnd>, ISA_MIPS32R6; + def PseudoIndrectHazardBranchR6 : PseudoIndirectBranchBase<JR_HB_R6, + GPR32Opnd>, + ISA_MIPS32R6; +} + diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index e008aeafaa2b0..828dd4f542232 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -240,13 +240,32 @@ let isCodeGenOnly = 1 in { def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>; def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>; def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>; - def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>; + let AdditionalPredicates = [NoIndirectJumpGuards] in + def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>; } +let AdditionalPredicates = [NotInMicroMips], + DecoderNamespace = "Mips64" in { + def JR_HB64 : JR_HB_DESC<GPR64Opnd>, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6; + def JALR_HB64 : JALR_HB_DESC<GPR64Opnd>, JALR_HB_ENC, ISA_MIPS32R2; +} +def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>; -def TAILCALLREG64 : TailCallReg<GPR64Opnd>; +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + NoIndirectJumpGuards] in { + def TAILCALLREG64 : TailCallReg<JR64, GPR64Opnd>, ISA_MIPS3_NOT_32R6_64R6, + PTR_64; + def PseudoIndirectBranch64 : PseudoIndirectBranchBase<JR64, GPR64Opnd>, + ISA_MIPS3_NOT_32R6_64R6; +} -def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>; -def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>; +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + UseIndirectJumpsHazard] in { + def TAILCALLREGHB64 : TailCallReg<JR_HB64, GPR64Opnd>, + ISA_MIPS32R2_NOT_32R6_64R6, PTR_64; + def PseudoIndirectHazardBranch64 : PseudoIndirectBranchBase<JR_HB64, + GPR64Opnd>, + ISA_MIPS32R2_NOT_32R6_64R6; +} /// Multiply and Divide Instructions. 
let AdditionalPredicates = [NotInMicroMips] in { @@ -536,6 +555,10 @@ def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>, ISA_MIPS3; } + +let AdditionalPredicates = [UseIndirectJumpsHazard] in + def JALRHB64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR_HB64, RA_64>; + //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions //===----------------------------------------------------------------------===// @@ -843,7 +866,8 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"dext $rt, $rs, $pos, $size", (DEXTU GPR64Opnd:$rt, GPR64Opnd:$rs, uimm5_plus32:$pos, uimm5_plus1:$size), 0>, ISA_MIPS64R2; - + def : MipsInstAlias<"jalr.hb $rs", (JALR_HB64 RA_64, GPR64Opnd:$rs), 1>, + ISA_MIPS64; // Two operand (implicit 0 selector) versions: def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index 1cd43ee6f1c34..da743fbdee45f 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -104,6 +104,16 @@ class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd, class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>; class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>; + +class JR_HB64_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR64Opnd> { + bit isBranch = 1; + bit isIndirectBranch = 1; + bit hasDelaySlot = 1; + bit isTerminator=1; + bit isBarrier=1; + bit isCTI = 1; + InstrItinClass Itinerary = II_JR_HB; +} //===----------------------------------------------------------------------===// // // Instruction Definitions @@ -136,6 +146,7 @@ def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6; let DecoderNamespace = "Mips32r6_64r6_GP64" in { def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64; def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64; + def JR_HB64_R6 : JR_HB_R6_ENC, JR_HB64_R6_DESC, ISA_MIPS32R6; } let AdditionalPredicates = [NotInMicroMips], DecoderNamespace = "Mips32r6_64r6_PTR64" in { @@ -277,3 +288,22 @@ def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f), def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f), (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>, ISA_MIPS64R6; + +// Pseudo instructions + +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + NoIndirectJumpGuards] in { + def TAILCALL64R6REG : TailCallRegR6<JALR64, ZERO_64, GPR64Opnd>, ISA_MIPS64R6; + def PseudoIndirectBranch64R6 : PseudoIndirectBranchBaseR6<JALR64, ZERO_64, + GPR64Opnd>, + ISA_MIPS64R6; +} + +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + UseIndirectJumpsHazard] in { + def TAILCALLHB64R6REG : TailCallReg<JR_HB64_R6, GPR64Opnd>, + ISA_MIPS64R6; + def PseudoIndrectHazardBranch64R6 : PseudoIndirectBranchBase<JR_HB64_R6, + GPR64Opnd>, + ISA_MIPS64R6; +} diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td index 0ceb1858fb09e..2dcefdc789a55 100644 --- a/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/lib/Target/Mips/MipsDSPInstrFormats.td @@ -53,7 +53,7 @@ class DSPInst<string opstr = ""> class PseudoDSP<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo> - : MipsPseudo<outs, ins, pattern, itin>, PredicateControl { + : MipsPseudo<outs, ins, pattern, itin> { let InsnPredicates = [HasDSP]; } diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 
8bbac3ed7cfb6..d3048c7390e1c 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -67,6 +67,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> +#include <array> #include <cassert> #include <cstdint> @@ -1306,13 +1307,13 @@ bool MipsFastISel::fastLowerArguments() { return false; } - const ArrayRef<MCPhysReg> GPR32ArgRegs = {Mips::A0, Mips::A1, Mips::A2, - Mips::A3}; - const ArrayRef<MCPhysReg> FGR32ArgRegs = {Mips::F12, Mips::F14}; - const ArrayRef<MCPhysReg> AFGR64ArgRegs = {Mips::D6, Mips::D7}; - ArrayRef<MCPhysReg>::iterator NextGPR32 = GPR32ArgRegs.begin(); - ArrayRef<MCPhysReg>::iterator NextFGR32 = FGR32ArgRegs.begin(); - ArrayRef<MCPhysReg>::iterator NextAFGR64 = AFGR64ArgRegs.begin(); + std::array<MCPhysReg, 4> GPR32ArgRegs = {{Mips::A0, Mips::A1, Mips::A2, + Mips::A3}}; + std::array<MCPhysReg, 2> FGR32ArgRegs = {{Mips::F12, Mips::F14}}; + std::array<MCPhysReg, 2> AFGR64ArgRegs = {{Mips::D6, Mips::D7}}; + auto NextGPR32 = GPR32ArgRegs.begin(); + auto NextFGR32 = FGR32ArgRegs.begin(); + auto NextAFGR64 = AFGR64ArgRegs.begin(); struct AllocatedReg { const TargetRegisterClass *RC; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index ba05b0f48df79..3d383b3dfe3e4 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -3868,7 +3868,7 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 'l': // use the `lo` register to store values // that are no bigger than a word - if (VT == MVT::i32) + if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass); return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass); case 'x': // use the concatenated `hi` and `lo` registers diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 817d9b44b9c2f..516edef0556ce 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -128,7 +128,7 @@ class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern, // Mips Pseudo Instructions Format class MipsPseudo<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo> : - MipsInst<outs, ins, "", pattern, itin, Pseudo> { + MipsInst<outs, ins, "", pattern, itin, Pseudo>, PredicateControl { let isCodeGenOnly = 1; let isPseudo = 1; } @@ -136,7 +136,7 @@ class MipsPseudo<dag outs, dag ins, list<dag> pattern, // Mips32/64 Pseudo Instruction Format class PseudoSE<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo> : - MipsPseudo<outs, ins, pattern, itin>, PredicateControl { + MipsPseudo<outs, ins, pattern, itin> { let EncodingPredicates = [HasStdEnc]; } diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 51ddc0d44c002..2e30d271e130b 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -298,7 +298,6 @@ unsigned MipsInstrInfo::getEquivalentCompactForm( case Mips::JR: case Mips::PseudoReturn: case Mips::PseudoIndirectBranch: - case Mips::TAILCALLREG: canUseShortMicroMipsCTI = true; break; } @@ -377,18 +376,18 @@ unsigned MipsInstrInfo::getEquivalentCompactForm( // For MIPSR6, the instruction 'jic' can be used for these cases. Some // tools will accept 'jrc reg' as an alias for 'jic 0, $reg'. 
case Mips::JR: + case Mips::PseudoIndirectBranchR6: case Mips::PseudoReturn: - case Mips::PseudoIndirectBranch: - case Mips::TAILCALLREG: + case Mips::TAILCALLR6REG: if (canUseShortMicroMipsCTI) return Mips::JRC16_MM; return Mips::JIC; case Mips::JALRPseudo: return Mips::JIALC; case Mips::JR64: + case Mips::PseudoIndirectBranch64R6: case Mips::PseudoReturn64: - case Mips::PseudoIndirectBranch64: - case Mips::TAILCALLREG64: + case Mips::TAILCALL64R6REG: return Mips::JIC64; case Mips::JALR64Pseudo: return Mips::JIALC64; @@ -617,6 +616,18 @@ bool MipsInstrInfo::verifyInstruction(const MachineInstr &MI, return verifyInsExtInstruction(MI, ErrInfo, 0, 32, 32, 64, 32, 64); case Mips::DEXTU: return verifyInsExtInstruction(MI, ErrInfo, 32, 64, 0, 32, 32, 64); + case Mips::TAILCALLREG: + case Mips::PseudoIndirectBranch: + case Mips::JR: + case Mips::JR64: + case Mips::JALR: + case Mips::JALR64: + case Mips::JALRPseudo: + if (!Subtarget.useIndirectJumpsHazard()) + return true; + + ErrInfo = "invalid instruction when using jump guards!"; + return false; default: return true; } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index e0d818b749df6..33a061e12a3fc 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -244,7 +244,10 @@ def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">, AssemblerPredicate<"!FeatureMadd4">; def HasMT : Predicate<"Subtarget->hasMT()">, AssemblerPredicate<"FeatureMT">; - +def UseIndirectJumpsHazard : Predicate<"Subtarget->useIndirectJumpsHazard()">, + AssemblerPredicate<"FeatureUseIndirectJumpsHazard">; +def NoIndirectJumpGuards : Predicate<"!Subtarget->useIndirectJumpsHazard()">, + AssemblerPredicate<"!FeatureUseIndirectJumpsHazard">; //===----------------------------------------------------------------------===// // Mips GPR size adjectives. // They are mutually exclusive. @@ -1540,8 +1543,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, PseudoSE<(outs), (ins calltarget:$target), [], II_J>, PseudoInstExpansion<(JumpInst Opnd:$target)>; - class TailCallReg<RegisterOperand RO> : - PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>; + class TailCallReg<Instruction JumpInst, RegisterOperand RO> : + PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, + PseudoInstExpansion<(JumpInst RO:$rs)>; } class BAL_BR_Pseudo<Instruction RealInst> : @@ -2068,7 +2072,7 @@ def B : UncondBranch<BEQ, brtarget>, AdditionalRequires<[NotInMicroMips]>; def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>; -let AdditionalPredicates = [NotInMicroMips] in { +let AdditionalPredicates = [NotInMicroMips, NoIndirectJumpGuards] in { def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM; def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>; } @@ -2088,24 +2092,28 @@ def BAL_BR : BAL_BR_Pseudo<BGEZAL>; let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips] in { def TAILCALL : TailCall<J, jmptarget>; } - -def TAILCALLREG : TailCallReg<GPR32Opnd>; +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + NoIndirectJumpGuards] in + def TAILCALLREG : TailCallReg<JR, GPR32Opnd>, ISA_MIPS1_NOT_32R6_64R6; // Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64 // then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA. 
-class PseudoIndirectBranchBase<RegisterOperand RO> : +class PseudoIndirectBranchBase<Instruction JumpInst, RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], - II_IndirectBranchPseudo> { + II_IndirectBranchPseudo>, + PseudoInstExpansion<(JumpInst RO:$rs)> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; let isBranch = 1; let isIndirectBranch = 1; bit isCTI = 1; - let Predicates = [NotInMips16Mode]; } -def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>; +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + NoIndirectJumpGuards] in + def PseudoIndirectBranch : PseudoIndirectBranchBase<JR, GPR32Opnd>, + ISA_MIPS1_NOT_32R6_64R6; // Return instructions are matched as a RetRA instruction, then are expanded // into PseudoReturn/PseudoReturn64 after register allocation. Finally, @@ -2278,8 +2286,8 @@ class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { list<dag> Pattern = []; } -class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>, - JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> { +class JR_HB_DESC<RegisterOperand RO> : + InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>, JR_HB_DESC_BASE<"jr.hb", RO> { let isBranch=1; let isIndirectBranch=1; let hasDelaySlot=1; @@ -2288,8 +2296,9 @@ class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>, bit isCTI = 1; } -class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>, - JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> { +class JALR_HB_DESC<RegisterOperand RO> : + InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>, JALR_HB_DESC_BASE<"jalr.hb", + RO> { let isIndirectBranch=1; let hasDelaySlot=1; bit isCTI = 1; @@ -2298,8 +2307,19 @@ class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>, class JR_HB_ENC : JR_HB_FM<8>; class JALR_HB_ENC : JALR_HB_FM<9>; -def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6; -def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32; +def JR_HB : JR_HB_DESC<GPR32Opnd>, JR_HB_ENC, ISA_MIPS32R2_NOT_32R6_64R6; +def JALR_HB : JALR_HB_DESC<GPR32Opnd>, JALR_HB_ENC, ISA_MIPS32; + +let AdditionalPredicates = [NotInMicroMips, UseIndirectJumpsHazard] in + def JALRHBPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR_HB, RA>; + + +let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips, + UseIndirectJumpsHazard] in { + def TAILCALLREGHB : TailCallReg<JR_HB, GPR32Opnd>, ISA_MIPS32_NOT_32R6_64R6; + def PseudoIndirectHazardBranch : PseudoIndirectBranchBase<JR_HB, GPR32Opnd>, + ISA_MIPS32R2_NOT_32R6_64R6; +} class TLB<string asmstr, InstrItinClass itin = NoItinerary> : InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>; @@ -2433,7 +2453,8 @@ def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>; let Predicates = [NotInMicroMips] in { def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>; } -def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32; +def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, + ISA_MIPS32; def : MipsInstAlias<"neg $rt, $rs", (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>; def : MipsInstAlias<"neg $rt", diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index bbf2050ce1ebb..e6ecbe9b5f661 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -371,11 +371,12 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { // In NaCl, modifying the sp is not allowed in branch delay slot. // For MIPS32R6, we can skip using a delay slot branch. 
- if (Subtarget.isTargetNaCl() || Subtarget.hasMips32r6()) + if (Subtarget.isTargetNaCl() || + (Subtarget.hasMips32r6() && !Subtarget.useIndirectJumpsHazard())) BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) .addReg(Mips::SP).addImm(8); - if (Subtarget.hasMips32r6()) { + if (Subtarget.hasMips32r6() && !Subtarget.useIndirectJumpsHazard()) { const unsigned JICOp = Subtarget.inMicroMipsMode() ? Mips::JIC_MMR6 : Mips::JIC; BuildMI(*BalTgtMBB, Pos, DL, TII->get(JICOp)) @@ -383,7 +384,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addImm(0); } else { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT); + unsigned JROp = + Subtarget.useIndirectJumpsHazard() + ? (Subtarget.hasMips32r6() ? Mips::JR_HB_R6 : Mips::JR_HB) + : Mips::JR; + BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT); if (Subtarget.isTargetNaCl()) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP)); @@ -475,7 +480,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64) .addReg(Mips::SP_64).addImm(0); - if (Subtarget.hasMips64r6()) { + if (Subtarget.hasMips64r6() && !Subtarget.useIndirectJumpsHazard()) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64) .addReg(Mips::SP_64) .addImm(16); @@ -483,7 +488,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::AT_64) .addImm(0); } else { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64); + unsigned JROp = + Subtarget.useIndirectJumpsHazard() + ? (Subtarget.hasMips32r6() ? Mips::JR_HB64_R6 : Mips::JR_HB64) + : Mips::JR64; + BuildMI(*BalTgtMBB, Pos, DL, TII->get(JROp)).addReg(Mips::AT_64); BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64) .addReg(Mips::SP_64) .addImm(16); diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index f7d7e2af85e47..eee5b23117f6f 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -701,6 +701,77 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT, + SelectionDAG &DAG, + const MipsSubtarget &Subtarget) { + // Estimate the number of operations the below transform will turn a + // constant multiply into. The number is approximately how many powers + // of two summed together that the constant can be broken down into. + + SmallVector<APInt, 16> WorkStack(1, C); + unsigned Steps = 0; + unsigned BitWidth = C.getBitWidth(); + + while (!WorkStack.empty()) { + APInt Val = WorkStack.pop_back_val(); + + if (Val == 0 || Val == 1) + continue; + + if (Val.isPowerOf2()) { + ++Steps; + continue; + } + + APInt Floor = APInt(BitWidth, 1) << Val.logBase2(); + APInt Ceil = Val.isNegative() ? APInt(BitWidth, 0) + : APInt(BitWidth, 1) << C.ceilLogBase2(); + + if ((Val - Floor).ule(Ceil - Val)) { + WorkStack.push_back(Floor); + WorkStack.push_back(Val - Floor); + ++Steps; + continue; + } + + WorkStack.push_back(Ceil); + WorkStack.push_back(Ceil - Val); + ++Steps; + + // If we have taken more than 12[1] / 8[2] steps to attempt the + // optimization for a native sized value, it is more than likely that this + // optimization will make things worse. + // + // [1] MIPS64 requires 6 instructions at most to materialize any constant, + // multiplication requires at least 4 cycles, but another cycle (or two) + // to retrieve the result from the HI/LO registers. 
+ // + // [2] For MIPS32, more than 8 steps is expensive as the constant could be + // materialized in 2 instructions, multiplication requires at least 4 + // cycles, but another cycle (or two) to retrieve the result from the + // HI/LO registers. + + if (Steps > 12 && (Subtarget.isABI_N32() || Subtarget.isABI_N64())) + return false; + + if (Steps > 8 && Subtarget.isABI_O32()) + return false; + } + + // If the value being multiplied is not supported natively, we have to pay + // an additional legalization cost, conservatively assume an increase in the + // cost of 3 instructions per step. This values for this heuristic were + // determined experimentally. + unsigned RegisterSize = DAG.getTargetLoweringInfo() + .getRegisterType(*DAG.getContext(), VT) + .getSizeInBits(); + Steps *= (VT.getSizeInBits() != RegisterSize) * 3; + if (Steps > 27) + return false; + + return true; +} + static SDValue genConstMult(SDValue X, APInt C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Return 0. @@ -739,11 +810,13 @@ static SDValue genConstMult(SDValue X, APInt C, const SDLoc &DL, EVT VT, static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering::DAGCombinerInfo &DCI, - const MipsSETargetLowering *TL) { + const MipsSETargetLowering *TL, + const MipsSubtarget &Subtarget) { EVT VT = N->getValueType(0); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) - if (!VT.isVector()) + if (!VT.isVector() && shouldTransformMulToShiftsAddsSubs( + C->getAPIntValue(), VT, DAG, Subtarget)) return genConstMult(N->getOperand(0), C->getAPIntValue(), SDLoc(N), VT, TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT), DAG); @@ -983,7 +1056,7 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { Val = performORCombine(N, DAG, DCI, Subtarget); break; case ISD::MUL: - return performMULCombine(N, DAG, DCI, this); + return performMULCombine(N, DAG, DCI, this, Subtarget); case ISD::SHL: Val = performSHLCombine(N, DAG, DCI, Subtarget); break; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index f6af7e22e351c..ddaa07ea9bc12 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -72,9 +72,10 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false), HasEVA(false), DisableMadd4(false), HasMT(false), - StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT), - TSInfo(), InstrInfo(MipsInstrInfo::create( - initializeSubtargetDependencies(CPU, FS, TM))), + UseIndirectJumpsHazard(false), StackAlignOverride(StackAlignOverride), + TM(TM), TargetTriple(TT), TSInfo(), + InstrInfo( + MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), TLInfo(MipsTargetLowering::create(TM, *this)) { @@ -107,6 +108,15 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, if (hasMips64r6() && InMicroMipsMode) report_fatal_error("microMIPS64R6 is not supported", false); + + if (UseIndirectJumpsHazard) { + if (InMicroMipsMode) + report_fatal_error( + "cannot combine indirect jumps with hazard barriers and microMIPS"); + if (!hasMips32r2()) + report_fatal_error( + "indirect jumps with hazard barriers requires MIPS32R2 or later"); + } if (hasMips32r6()) { StringRef ISA = hasMips64r6() ? 
"MIPS64r6" : "MIPS32r6"; diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 8b10b0596e0eb..ad2905c51601d 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,10 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Use hazard variants of the jump register instructions for indirect + // function calls and jump tables. + bool UseIndirectJumpsHazard; + // Disable use of the `jal` instruction. bool UseLongCalls = false; @@ -272,6 +276,9 @@ public: bool disableMadd4() const { return DisableMadd4; } bool hasEVA() const { return HasEVA; } bool hasMT() const { return HasMT; } + bool useIndirectJumpsHazard() const { + return UseIndirectJumpsHazard && hasMips32r2(); + } bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index d31e1cb5047b7..cb8cc7bb347af 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -44,6 +44,14 @@ static cl::opt<bool> cl::desc("Disable load/store vectorizer"), cl::init(false), cl::Hidden); +// TODO: Remove this flag when we are confident with no regressions. +static cl::opt<bool> DisableRequireStructuredCFG( + "disable-nvptx-require-structured-cfg", + cl::desc("Transitional flag to turn off NVPTX's requirement on preserving " + "structured CFG. The requirement should be disabled only when " + "unexpected regressions happen."), + cl::init(false), cl::Hidden); + namespace llvm { void initializeNVVMIntrRangePass(PassRegistry&); @@ -108,6 +116,8 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; + if (!DisableRequireStructuredCFG) + setRequiresStructuredCFG(true); initAsmInfo(); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f0e8b11a3d9ce..26e9f13f9ff41 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12264,6 +12264,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(1).getValueType() == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getOperand(1).getValueType() == MVT::i64))) { + // STBRX can only handle simple types. + EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); + if (mVT.isExtended()) + break; + SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. if (BSwapOp.getValueType() == MVT::i16) @@ -12271,7 +12276,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // If the type of BSWAP operand is wider than stored memory width // it need to be shifted to the right side before STBRX. - EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); if (Op1VT.bitsGT(mVT)) { int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fb16700a5e177..4ef71effd49b4 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2431,7 +2431,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, // Use APInt's rotate function. int64_t SH = MI.getOperand(2).getImm(); int64_t MB = MI.getOperand(3).getImm(); - APInt InVal(Opc == PPC::RLDICL ? 
64 : 32, SExtImm, true); + APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICLo) ? + 64 : 32, SExtImm, true); InVal = InVal.rotl(SH); uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; InVal &= Mask; @@ -2444,6 +2445,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, Is64BitLI = Opc != PPC::RLDICL_32; NewImm = InVal.getSExtValue(); SetCR = Opc == PPC::RLDICLo; + if (SetCR && (SExtImm & NewImm) != NewImm) + return false; break; } return false; @@ -2471,6 +2474,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; NewImm = InVal.getSExtValue(); SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o; + if (SetCR && (SExtImm & NewImm) != NewImm) + return false; break; } return false; diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 23ac9d9936ad8..44400813094b7 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -31,6 +31,7 @@ set(sources X86FixupBWInsts.cpp X86FixupLEAs.cpp X86FixupSetCC.cpp + X86FlagsCopyLowering.cpp X86FloatingPoint.cpp X86FrameLowering.cpp X86InstructionSelector.cpp diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index c58254ae38c19..b3c491b3de5e3 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -265,13 +265,10 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( /// @param reg - The Reg to append. static void translateRegister(MCInst &mcInst, Reg reg) { #define ENTRY(x) X86::x, - uint8_t llvmRegnums[] = { - ALL_REGS - 0 - }; + static constexpr MCPhysReg llvmRegnums[] = {ALL_REGS}; #undef ENTRY - uint8_t llvmRegnum = llvmRegnums[reg]; + MCPhysReg llvmRegnum = llvmRegnums[reg]; mcInst.addOperand(MCOperand::createReg(llvmRegnum)); } diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 3613268242922..642dda8f42259 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -66,6 +66,9 @@ FunctionPass *createX86OptimizeLEAs(); /// Return a pass that transforms setcc + movzx pairs into xor + setcc. FunctionPass *createX86FixupSetCC(); +/// Return a pass that lowers EFLAGS copy pseudo instructions. +FunctionPass *createX86FlagsCopyLoweringPass(); + /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index bc0f55f581ff4..ffe176ad47701 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Printable.h" #include <bitset> using namespace llvm; @@ -262,25 +263,6 @@ public: } }; -/// An Instruction Converter which completely deletes an instruction. -/// For example, IMPLICIT_DEF instructions can be deleted when converting from -/// GPR to mask. -class InstrDeleter : public InstrConverterBase { -public: - InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {} - - bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII, - MachineRegisterInfo *MRI) const override { - assert(isLegal(MI, TII) && "Cannot convert instruction"); - return true; - } - - double getExtraCost(const MachineInstr *MI, - MachineRegisterInfo *MRI) const override { - return 0; - } -}; - // Key type to be used by the Instruction Converters map. 
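One note on the disassembler change above, where the register-number table becomes static constexpr MCPhysReg rather than uint8_t: an 8-bit element silently truncates any register number above 255, while a 16-bit element (what MCPhysReg provides) keeps it intact. A minimal illustration with a made-up value:

#include <cstdint>
#include <cstdio>

int main() {
  unsigned regNum = 300;                          // made-up register number > 255
  uint8_t narrow = static_cast<uint8_t>(regNum);  // old element type: truncates
  uint16_t wide = static_cast<uint16_t>(regNum);  // 16-bit element: preserved
  std::printf("%u vs %u\n", narrow, wide);        // prints "44 vs 300"
}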
// A converter is identified by <destination domain, source opcode> typedef std::pair<int, unsigned> InstrConverterBaseKeyTy; @@ -310,8 +292,12 @@ private: /// Domains which this closure can legally be reassigned to. std::bitset<NumDomains> LegalDstDomains; + /// An ID to uniquely identify this closure, even when it gets + /// moved around + unsigned ID; + public: - Closure(std::initializer_list<RegDomain> LegalDstDomainList) { + Closure(unsigned ID, std::initializer_list<RegDomain> LegalDstDomainList) : ID(ID) { for (RegDomain D : LegalDstDomainList) LegalDstDomains.set(D); } @@ -347,6 +333,27 @@ public: return Instrs; } + LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const { + dbgs() << "Registers: "; + bool First = true; + for (unsigned Reg : Edges) { + if (!First) + dbgs() << ", "; + First = false; + dbgs() << printReg(Reg, MRI->getTargetRegisterInfo()); + } + dbgs() << "\n" << "Instructions:"; + for (MachineInstr *MI : Instrs) { + dbgs() << "\n "; + MI->print(dbgs()); + } + dbgs() << "\n"; + } + + unsigned getID() const { + return ID; + } + }; class X86DomainReassignment : public MachineFunctionPass { @@ -358,7 +365,7 @@ class X86DomainReassignment : public MachineFunctionPass { DenseSet<unsigned> EnclosedEdges; /// All instructions that are included in some closure. - DenseMap<MachineInstr *, Closure *> EnclosedInstrs; + DenseMap<MachineInstr *, unsigned> EnclosedInstrs; public: static char ID; @@ -435,14 +442,14 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { auto I = EnclosedInstrs.find(MI); if (I != EnclosedInstrs.end()) { - if (I->second != &C) + if (I->second != C.getID()) // Instruction already belongs to another closure, avoid conflicts between // closure and mark this closure as illegal. C.setAllIllegal(); return; } - EnclosedInstrs[MI] = &C; + EnclosedInstrs[MI] = C.getID(); C.addInstruction(MI); // Mark closure as illegal for reassignment to domains, if there is no @@ -587,7 +594,7 @@ void X86DomainReassignment::initConverters() { new InstrIgnore(TargetOpcode::PHI); Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = - new InstrDeleter(TargetOpcode::IMPLICIT_DEF); + new InstrIgnore(TargetOpcode::IMPLICIT_DEF); Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] = new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2); @@ -723,6 +730,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { std::vector<Closure> Closures; // Go over all virtual registers and calculate a closure. + unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); @@ -735,7 +743,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Calculate closure starting with Reg. - Closure C({MaskDomain}); + Closure C(ClosureID++, {MaskDomain}); buildClosure(C, Reg); // Collect all closures that can potentially be converted. 
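A brief note on the new Closure IDs above: EnclosedInstrs now maps instructions to a closure ID instead of a Closure*, so closures can be held by value in a std::vector (which may move them on reallocation) without the map ending up with dangling pointers. The generic shape of that pattern, with illustrative names of my own:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct Closure {
  unsigned ID;      // stable identity that survives moves and reallocation
  std::string Name;
};

int main() {
  std::vector<Closure> Closures;
  std::unordered_map<unsigned, unsigned> InstrToClosureID; // instr# -> closure ID

  for (unsigned I = 0; I < 100; ++I) {
    Closures.push_back({I, "closure-" + std::to_string(I)});
    // Storing &Closures.back() here could dangle once push_back reallocates;
    // storing the ID stays valid no matter how the vector moves its elements.
    InstrToClosureID[I] = Closures.back().ID;
  }
  std::printf("instr 42 -> closure %u\n", InstrToClosureID[42]);
}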
@@ -743,15 +751,16 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { Closures.push_back(std::move(C)); } - for (Closure &C : Closures) + for (Closure &C : Closures) { + DEBUG(C.dump(MRI)); if (isReassignmentProfitable(C, MaskDomain)) { reassign(C, MaskDomain); ++NumClosuresConverted; Changed = true; } + } - for (auto I : Converters) - delete I.second; + DeleteContainerSeconds(Converters); DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n"); DEBUG(MF.print(dbgs())); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 80ce3c579fe0b..dca6c592614c9 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1789,9 +1789,16 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = nullptr; - assert(!I->getType()->isIntegerTy(8) && - "i8 shifts should be handled by autogenerated table"); - if (I->getType()->isIntegerTy(16)) { + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { @@ -1836,10 +1843,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { // The shift instruction uses X86::CL. If we defined a super-register // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. - assert(CReg != X86::CL && "CReg should be a super register of CL"); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::KILL), X86::CL) - .addReg(CReg, RegState::Kill); + if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp new file mode 100644 index 0000000000000..a6fccd1347407 --- /dev/null +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -0,0 +1,935 @@ +//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual +/// flag bits. +/// +/// We have to do this by carefully analyzing and rewriting the usage of the +/// copied EFLAGS register because there is no general way to rematerialize the +/// entire EFLAGS register safely and efficiently. Using `popf` both forces +/// dynamic stack adjustment and can create correctness issues due to IF, TF, +/// and other non-status flags being overwritten. Using sequences involving +/// SAHF don't work on all x86 processors and are often quite slow compared to +/// directly testing a single status preserved in its own GPR. 
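To make the strategy described in this header comment concrete: the pass saves the one status flag it needs into a byte register with SETcc right where the flag is produced, and later re-creates a testable flag from that byte. A rough standalone illustration of the first half, my own code using GCC/Clang inline asm on x86-64 only:

#include <cstdint>
#include <cstdio>

// Capture just the carry flag of a 64-bit add into a byte register; later
// users can branch on this byte (TEST + JNE) instead of copying all of EFLAGS.
static unsigned char carryOfAdd(uint64_t a, uint64_t b) {
  unsigned char carry;
  asm("addq %2, %1\n\t"
      "setc %0"
      : "=q"(carry), "+r"(a)
      : "r"(b)
      : "cc");
  return carry;
}

int main() {
  std::printf("%d %d\n", carryOfAdd(~0ull, 1), carryOfAdd(1, 2)); // prints "1 0"
}

The second half, re-creating a usable flag from the saved byte with TEST and consuming it via the NE/E condition, is what the rewrite helpers later in this file implement.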
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <utility> + +using namespace llvm; + +#define PASS_KEY "x86-flags-copy-lowering" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); +STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); +STATISTIC(NumTestsInserted, "Number of test instructions inserted"); +STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); + +namespace llvm { + +void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); + +} // end namespace llvm + +namespace { + +// Convenient array type for storing registers associated with each condition. +using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; + +class X86FlagsCopyLoweringPass : public MachineFunctionPass { +public: + X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { + initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pass identification, replacement for typeid. 
+ static char ID; + +private: + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + const TargetRegisterClass *PromoteRC; + MachineDominatorTree *MDT; + + CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, + MachineInstr &CopyDefI); + + unsigned promoteCondToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond); + std::pair<unsigned, bool> + getCondOrInverseInReg(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + X86::CondCode Cond, CondRegArray &CondRegs); + void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, + DebugLoc Loc, unsigned Reg); + + void rewriteArithmetic(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &MI, MachineOperand &FlagUse, + CondRegArray &CondRegs); + void rewriteCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &CMovI, MachineOperand &FlagUse, + CondRegArray &CondRegs); + void rewriteCondJmp(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &JmpI, CondRegArray &CondRegs); + void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, + MachineInstr &CopyDefI); + void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &SetBI, + MachineOperand &FlagUse, CondRegArray &CondRegs); + void rewriteSetCC(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &SetCCI, MachineOperand &FlagUse, + CondRegArray &CondRegs); +}; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, + "X86 EFLAGS copy lowering", false, false) +INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, + "X86 EFLAGS copy lowering", false, false) + +FunctionPass *llvm::createX86FlagsCopyLoweringPass() { + return new X86FlagsCopyLoweringPass(); +} + +char X86FlagsCopyLoweringPass::ID = 0; + +void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +namespace { +/// An enumeration of the arithmetic instruction mnemonics which have +/// interesting flag semantics. +/// +/// We can map instruction opcodes into these mnemonics to make it easy to +/// dispatch with specific functionality. 
+enum class FlagArithMnemonic { + ADC, + ADCX, + ADOX, + RCL, + RCR, + SBB, +}; +} // namespace + +static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { + switch (Opcode) { + default: + report_fatal_error("No support for lowering a copy into EFLAGS when used " + "by this instruction!"); + +#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ + case X86::MNEMONIC##8##SUFFIX: \ + case X86::MNEMONIC##16##SUFFIX: \ + case X86::MNEMONIC##32##SUFFIX: \ + case X86::MNEMONIC##64##SUFFIX: + +#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ + case X86::MNEMONIC##8ri: \ + case X86::MNEMONIC##16ri8: \ + case X86::MNEMONIC##32ri8: \ + case X86::MNEMONIC##64ri8: \ + case X86::MNEMONIC##16ri: \ + case X86::MNEMONIC##32ri: \ + case X86::MNEMONIC##64ri32: \ + case X86::MNEMONIC##8mi: \ + case X86::MNEMONIC##16mi8: \ + case X86::MNEMONIC##32mi8: \ + case X86::MNEMONIC##64mi8: \ + case X86::MNEMONIC##16mi: \ + case X86::MNEMONIC##32mi: \ + case X86::MNEMONIC##64mi32: \ + case X86::MNEMONIC##8i8: \ + case X86::MNEMONIC##16i16: \ + case X86::MNEMONIC##32i32: \ + case X86::MNEMONIC##64i32: + + LLVM_EXPAND_ADC_SBB_INSTR(ADC) + return FlagArithMnemonic::ADC; + + LLVM_EXPAND_ADC_SBB_INSTR(SBB) + return FlagArithMnemonic::SBB; + +#undef LLVM_EXPAND_ADC_SBB_INSTR + + LLVM_EXPAND_INSTR_SIZES(RCL, rCL) + LLVM_EXPAND_INSTR_SIZES(RCL, r1) + LLVM_EXPAND_INSTR_SIZES(RCL, ri) + return FlagArithMnemonic::RCL; + + LLVM_EXPAND_INSTR_SIZES(RCR, rCL) + LLVM_EXPAND_INSTR_SIZES(RCR, r1) + LLVM_EXPAND_INSTR_SIZES(RCR, ri) + return FlagArithMnemonic::RCR; + +#undef LLVM_EXPAND_INSTR_SIZES + + case X86::ADCX32rr: + case X86::ADCX64rr: + case X86::ADCX32rm: + case X86::ADCX64rm: + return FlagArithMnemonic::ADCX; + + case X86::ADOX32rr: + case X86::ADOX64rr: + case X86::ADOX32rm: + case X86::ADOX64rm: + return FlagArithMnemonic::ADOX; + } +} + +static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, + MachineInstr &SplitI, + const X86InstrInfo &TII) { + MachineFunction &MF = *MBB.getParent(); + + assert(SplitI.getParent() == &MBB && + "Split instruction must be in the split block!"); + assert(SplitI.isBranch() && + "Only designed to split a tail of branch instructions!"); + assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && + "Must split on an actual jCC instruction!"); + + // Dig out the previous instruction to the split point. + MachineInstr &PrevI = *std::prev(SplitI.getIterator()); + assert(PrevI.isBranch() && "Must split after a branch!"); + assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && + "Must split after an actual jCC instruction!"); + assert(!std::prev(PrevI.getIterator())->isTerminator() && + "Must only have this one terminator prior to the split!"); + + // Grab the one successor edge that will stay in `MBB`. + MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); + + // Analyze the original block to see if we are actually splitting an edge + // into two edges. This can happen when we have multiple conditional jumps to + // the same successor. 
+ bool IsEdgeSplit = + std::any_of(SplitI.getIterator(), MBB.instr_end(), + [&](MachineInstr &MI) { + assert(MI.isTerminator() && + "Should only have spliced terminators!"); + return llvm::any_of( + MI.operands(), [&](MachineOperand &MOp) { + return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; + }); + }) || + MBB.getFallThrough() == &UnsplitSucc; + + MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); + + // Insert the new block immediately after the current one. Any existing + // fallthrough will be sunk into this new block anyways. + MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); + + // Splice the tail of instructions into the new block. + NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); + + // Copy the necessary succesors (and their probability info) into the new + // block. + for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) + if (IsEdgeSplit || *SI != &UnsplitSucc) + NewMBB.copySuccessor(&MBB, SI); + // Normalize the probabilities if we didn't end up splitting the edge. + if (!IsEdgeSplit) + NewMBB.normalizeSuccProbs(); + + // Now replace all of the moved successors in the original block with the new + // block. This will merge their probabilities. + for (MachineBasicBlock *Succ : NewMBB.successors()) + if (Succ != &UnsplitSucc) + MBB.replaceSuccessor(Succ, &NewMBB); + + // We should always end up replacing at least one successor. + assert(MBB.isSuccessor(&NewMBB) && + "Failed to make the new block a successor!"); + + // Now update all the PHIs. + for (MachineBasicBlock *Succ : NewMBB.successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) { + MachineOperand &OpV = MI.getOperand(OpIdx); + MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); + assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); + if (OpMBB.getMBB() != &MBB) + continue; + + // Replace the operand for unsplit successors + if (!IsEdgeSplit || Succ != &UnsplitSucc) { + OpMBB.setMBB(&NewMBB); + + // We have to continue scanning as there may be multiple entries in + // the PHI. + continue; + } + + // When we have split the edge append a new successor. + MI.addOperand(MF, OpV); + MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); + break; + } + } + } + + return NewMBB; +} + +bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << " **********\n"); + + auto &Subtarget = MF.getSubtarget<X86Subtarget>(); + MRI = &MF.getRegInfo(); + TII = Subtarget.getInstrInfo(); + TRI = Subtarget.getRegisterInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + PromoteRC = &X86::GR8RegClass; + + if (MF.begin() == MF.end()) + // Nothing to do for a degenerate empty function... + return false; + + SmallVector<MachineInstr *, 4> Copies; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == TargetOpcode::COPY && + MI.getOperand(0).getReg() == X86::EFLAGS) + Copies.push_back(&MI); + + for (MachineInstr *CopyI : Copies) { + MachineBasicBlock &MBB = *CopyI->getParent(); + + MachineOperand &VOp = CopyI->getOperand(1); + assert(VOp.isReg() && + "The input to the copy for EFLAGS should always be a register!"); + MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); + if (CopyDefI.getOpcode() != TargetOpcode::COPY) { + // FIXME: The big likely candidate here are PHI nodes. 
We could in theory + // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard + // enough that it is probably better to change every other part of LLVM + // to avoid creating them. The issue is that once we have PHIs we won't + // know which original EFLAGS value we need to capture with our setCCs + // below. The end result will be computing a complete set of setCCs that + // we *might* want, computing them in every place where we copy *out* of + // EFLAGS and then doing SSA formation on all of them to insert necessary + // PHI nodes and consume those here. Then hoping that somehow we DCE the + // unnecessary ones. This DCE seems very unlikely to be successful and so + // we will almost certainly end up with a glut of dead setCC + // instructions. Until we have a motivating test case and fail to avoid + // it by changing other parts of LLVM's lowering, we refuse to handle + // this complex case here. + DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; + CopyDefI.dump()); + report_fatal_error( + "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); + } + + auto Cleanup = make_scope_exit([&] { + // All uses of the EFLAGS copy are now rewritten, kill the copy into + // eflags and if dead the copy from. + CopyI->eraseFromParent(); + if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) + CopyDefI.eraseFromParent(); + ++NumCopiesEliminated; + }); + + MachineOperand &DOp = CopyI->getOperand(0); + assert(DOp.isDef() && "Expected register def!"); + assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); + if (DOp.isDead()) + continue; + + MachineBasicBlock &TestMBB = *CopyDefI.getParent(); + auto TestPos = CopyDefI.getIterator(); + DebugLoc TestLoc = CopyDefI.getDebugLoc(); + + DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); + + // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer + // jumps because their usage is very constrained. + bool FlagsKilled = false; + SmallVector<MachineInstr *, 4> JmpIs; + + // Gather the condition flags that have already been preserved in + // registers. We do this from scratch each time as we expect there to be + // very few of them and we expect to not revisit the same copy definition + // many times. If either of those change sufficiently we could build a map + // of these up front instead. + CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI); + + // Collect the basic blocks we need to scan. Typically this will just be + // a single basic block but we may have to scan multiple blocks if the + // EFLAGS copy lives into successors. + SmallVector<MachineBasicBlock *, 2> Blocks; + SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks; + Blocks.push_back(&MBB); + VisitedBlocks.insert(&MBB); + + do { + MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); + + // We currently don't do any PHI insertion and so we require that the + // test basic block dominates all of the use basic blocks. + // + // We could in theory do PHI insertion here if it becomes useful by just + // taking undef values in along every edge that we don't trace this + // EFLAGS copy along. This isn't as bad as fully general PHI insertion, + // but still seems like a great deal of complexity. + // + // Because it is theoretically possible that some earlier MI pass or + // other lowering transformation could induce this to happen, we do + // a hard check even in non-debug builds here. 
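For readers skimming the scanning setup above: it is a standard worklist-plus-visited-set walk that starts at the copy's block and only follows successors into which the flags are live. A generic, self-contained sketch of that shape (types and names are mine, not the pass's):

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int id;
  std::vector<Block *> succs;
  bool liveIn = false;
};

static void scan(Block &start) {
  std::vector<Block *> work{&start};
  std::set<Block *> visited{&start};
  while (!work.empty()) {
    Block *b = work.back();
    work.pop_back();
    std::printf("scanning block %d\n", b->id);
    for (Block *s : b->succs)
      if (s->liveIn && visited.insert(s).second) // follow only live-in succs,
        work.push_back(s);                       // each block visited once
  }
}

int main() {
  Block a{0}, b{1}, c{2};
  b.liveIn = true;      // flags live into b only
  a.succs = {&b, &c};
  scan(a);              // visits block 0 then block 1, never block 2
}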
+ if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) { + DEBUG({ + dbgs() << "ERROR: Encountered use that is not dominated by our test " + "basic block! Rewriting this would require inserting PHI " + "nodes to track the flag state across the CFG.\n\nTest " + "block:\n"; + TestMBB.dump(); + dbgs() << "Use block:\n"; + UseMBB.dump(); + }); + report_fatal_error("Cannot lower EFLAGS copy when original copy def " + "does not dominate all uses."); + } + + for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator()) + : UseMBB.instr_begin(), + MIE = UseMBB.instr_end(); + MII != MIE;) { + MachineInstr &MI = *MII++; + MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); + if (!FlagUse) { + if (MI.findRegisterDefOperand(X86::EFLAGS)) { + // If EFLAGS are defined, it's as-if they were killed. We can stop + // scanning here. + // + // NB!!! Many instructions only modify some flags. LLVM currently + // models this as clobbering all flags, but if that ever changes + // this will need to be carefully updated to handle that more + // complex logic. + FlagsKilled = true; + break; + } + continue; + } + + DEBUG(dbgs() << " Rewriting use: "; MI.dump()); + + // Check the kill flag before we rewrite as that may change it. + if (FlagUse->isKill()) + FlagsKilled = true; + + // Once we encounter a branch, the rest of the instructions must also be + // branches. We can't rewrite in place here, so we handle them below. + // + // Note that we don't have to handle tail calls here, even conditional + // tail calls, as those are not introduced into the X86 MI until post-RA + // branch folding or black placement. As a consequence, we get to deal + // with the simpler formulation of conditional branches followed by tail + // calls. + if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + auto JmpIt = MI.getIterator(); + do { + JmpIs.push_back(&*JmpIt); + ++JmpIt; + } while (JmpIt != UseMBB.instr_end() && + X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::COND_INVALID); + break; + } + + // Otherwise we can just rewrite in-place. + if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (X86::getCondFromSETOpc(MI.getOpcode()) != + X86::COND_INVALID) { + rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (MI.getOpcode() == TargetOpcode::COPY) { + rewriteCopy(MI, *FlagUse, CopyDefI); + } else { + // We assume all other instructions that use flags also def them. + assert(MI.findRegisterDefOperand(X86::EFLAGS) && + "Expected a def of EFLAGS for this instruction!"); + + // NB!!! Several arithmetic instructions only *partially* update + // flags. Theoretically, we could generate MI code sequences that + // would rely on this fact and observe different flags independently. + // But currently LLVM models all of these instructions as clobbering + // all the flags in an undef way. We rely on that to simplify the + // logic. + FlagsKilled = true; + + switch (MI.getOpcode()) { + case X86::SETB_C8r: + case X86::SETB_C16r: + case X86::SETB_C32r: + case X86::SETB_C64r: + // Use custom lowering for arithmetic that is merely extending the + // carry flag. We model this as the SETB_C* pseudo instructions. + rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse, + CondRegs); + break; + + default: + // Generically handle remaining uses as arithmetic instructions. 
+ rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, + CondRegs); + break; + } + break; + } + + // If this was the last use of the flags, we're done. + if (FlagsKilled) + break; + } + + // If the flags were killed, we're done with this block. + if (FlagsKilled) + break; + + // Otherwise we need to scan successors for ones where the flags live-in + // and queue those up for processing. + for (MachineBasicBlock *SuccMBB : UseMBB.successors()) + if (SuccMBB->isLiveIn(X86::EFLAGS) && + VisitedBlocks.insert(SuccMBB).second) + Blocks.push_back(SuccMBB); + } while (!Blocks.empty()); + + // Now rewrite the jumps that use the flags. These we handle specially + // because if there are multiple jumps in a single basic block we'll have + // to do surgery on the CFG. + MachineBasicBlock *LastJmpMBB = nullptr; + for (MachineInstr *JmpI : JmpIs) { + // Past the first jump within a basic block we need to split the blocks + // apart. + if (JmpI->getParent() == LastJmpMBB) + splitBlock(*JmpI->getParent(), *JmpI, *TII); + else + LastJmpMBB = JmpI->getParent(); + + rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs); + } + + // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if + // the copy's def operand is itself a kill. + } + +#ifndef NDEBUG + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == TargetOpcode::COPY && + (MI.getOperand(0).getReg() == X86::EFLAGS || + MI.getOperand(1).getReg() == X86::EFLAGS)) { + DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump()); + llvm_unreachable("Unlowered EFLAGS copy!"); + } +#endif + + return true; +} + +/// Collect any conditions that have already been set in registers so that we +/// can re-use them rather than adding duplicates. +CondRegArray +X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB, + MachineInstr &CopyDefI) { + CondRegArray CondRegs = {}; + + // Scan backwards across the range of instructions with live EFLAGS. + for (MachineInstr &MI : llvm::reverse( + llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) { + X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); + if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() && + TRI->isVirtualRegister(MI.getOperand(0).getReg())) + CondRegs[Cond] = MI.getOperand(0).getReg(); + + // Stop scanning when we see the first definition of the EFLAGS as prior to + // this we would potentially capture the wrong flag state. 
+ if (MI.findRegisterDefOperand(X86::EFLAGS)) + break; + } + return CondRegs; +} + +unsigned X86FlagsCopyLoweringPass::promoteCondToReg( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond) { + unsigned Reg = MRI->createVirtualRegister(PromoteRC); + auto SetI = BuildMI(TestMBB, TestPos, TestLoc, + TII->get(X86::getSETFromCond(Cond)), Reg); + (void)SetI; + DEBUG(dbgs() << " save cond: "; SetI->dump()); + ++NumSetCCsInserted; + return Reg; +} + +std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { + unsigned &CondReg = CondRegs[Cond]; + unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; + if (!CondReg && !InvCondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + if (CondReg) + return {CondReg, false}; + else + return {InvCondReg, true}; +} + +void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + DebugLoc Loc, unsigned Reg) { + // We emit test instructions as register/immediate test against -1. This + // allows register allocation to fold a memory operand if needed (that will + // happen often due to the places this code is emitted). But hopefully will + // also allow us to select a shorter encoding of `testb %reg, %reg` when that + // would be equivalent. + auto TestI = + BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); + (void)TestI; + DEBUG(dbgs() << " test cond: "; TestI->dump()); + ++NumTestsInserted; +} + +void X86FlagsCopyLoweringPass::rewriteArithmetic( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // Arithmetic is either reading CF or OF. Figure out which condition we need + // to preserve in a register. + X86::CondCode Cond; + + // The addend to use to reset CF or OF when added to the flag value. + int Addend; + + switch (getMnemonicFromOpcode(MI.getOpcode())) { + case FlagArithMnemonic::ADC: + case FlagArithMnemonic::ADCX: + case FlagArithMnemonic::RCL: + case FlagArithMnemonic::RCR: + case FlagArithMnemonic::SBB: + Cond = X86::COND_B; // CF == 1 + // Set up an addend that when one is added will need a carry due to not + // having a higher bit available. + Addend = 255; + break; + + case FlagArithMnemonic::ADOX: + Cond = X86::COND_O; // OF == 1 + // Set up an addend that when one is added will turn from positive to + // negative and thus overflow in the signed domain. + Addend = 127; + break; + } + + // Now get a register that contains the value of the flag input to the + // arithmetic. We require exactly this flag to simplify the arithmetic + // required to materialize it back into the flag. + unsigned &CondReg = CondRegs[Cond]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + MachineBasicBlock &MBB = *MI.getParent(); + + // Insert an instruction that will set the flag back to the desired value. 
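The 255 and 127 addends chosen above rely on a small arithmetic fact: the saved condition byte is 0 or 1, so adding 255 carries out of eight bits exactly when the byte was 1, and adding 127 overflows the signed 8-bit range exactly when it was 1. A quick standalone check of that claim:

#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned v = 0; v <= 1; ++v) {              // the SETcc result byte
    bool carry = (v + 255u) > 0xFFu;               // CF of an 8-bit ADD v, 255
    bool overflow = (int8_t(v) + 127) > INT8_MAX;  // OF of a signed ADD v, 127
    std::printf("v=%u CF=%d OF=%d\n", v, carry, overflow); // 0,0 then 1,1
  }
}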
+ unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); + auto AddI = + BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) + .addDef(TmpReg, RegState::Dead) + .addReg(CondReg) + .addImm(Addend); + (void)AddI; + DEBUG(dbgs() << " add cond: "; AddI->dump()); + ++NumAddsInserted; + FlagUse.setIsKill(true); +} + +void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, + MachineInstr &CMovI, + MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // First get the register containing this specific condition. + X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); + unsigned CondReg; + bool Inverted; + std::tie(CondReg, Inverted) = + getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); + + MachineBasicBlock &MBB = *CMovI.getParent(); + + // Insert a direct test of the saved register. + insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); + + // Rewrite the CMov to use the !ZF flag from the test (but match register + // size and memory operand), and then kill its use of the flags afterward. + auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); + CMovI.setDesc(TII->get(X86::getCMovFromCond( + Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, + !CMovI.memoperands_empty()))); + FlagUse.setIsKill(true); + DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); +} + +void X86FlagsCopyLoweringPass::rewriteCondJmp( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { + // First get the register containing this specific condition. + X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); + unsigned CondReg; + bool Inverted; + std::tie(CondReg, Inverted) = + getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); + + MachineBasicBlock &JmpMBB = *JmpI.getParent(); + + // Insert a direct test of the saved register. + insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); + + // Rewrite the jump to use the !ZF flag from the test, and kill its use of + // flags afterward. + JmpI.setDesc(TII->get( + X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); + const int ImplicitEFLAGSOpIdx = 1; + JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); + DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); +} + +void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, + MachineOperand &FlagUse, + MachineInstr &CopyDefI) { + // Just replace this copy with the the original copy def. + MRI->replaceRegWith(MI.getOperand(0).getReg(), + CopyDefI.getOperand(0).getReg()); + MI.eraseFromParent(); +} + +void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // This routine is only used to handle pseudos for setting a register to zero + // or all ones based on CF. This is essentially the sign extended from 1-bit + // form of SETB and modeled with the SETB_C* pseudos. They require special + // handling as they aren't normal SETcc instructions and are lowered to an + // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that + // they are only provided in reg-defining forms. A complicating factor is that + // they can define many different register widths. 
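The CMov and conditional-jump rewrites above reduce to one idiom: TEST the saved byte against itself, then use the NE condition (or E when the inverse condition was the one saved). A rough standalone illustration with GCC/Clang inline asm on x86-64, not the pass's actual output:

#include <cstdint>
#include <cstdio>

// `cond` is the byte a SETcc stashed earlier. TEST recreates ZF from it and
// CMOVNE then selects `a` exactly when the original condition held.
static uint32_t selectOnSavedFlag(unsigned char cond, uint32_t a, uint32_t b) {
  asm("testb %1, %1\n\t"
      "cmovnel %2, %0"
      : "+r"(b)
      : "q"(cond), "r"(a)
      : "cc");
  return b;
}

int main() {
  std::printf("%u %u\n", selectOnSavedFlag(1, 10, 20),
              selectOnSavedFlag(0, 10, 20)); // prints "10 20"
}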
+ assert(SetBI.getOperand(0).isReg() && + "Cannot have a non-register defined operand to this variant of SETB!"); + + // Little helper to do the common final step of replacing the register def'ed + // by this SETB instruction with a new register and removing the SETB + // instruction. + auto RewriteToReg = [&](unsigned Reg) { + MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); + SetBI.eraseFromParent(); + }; + + // Grab the register class used for this particular instruction. + auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); + + MachineBasicBlock &MBB = *SetBI.getParent(); + auto SetPos = SetBI.getIterator(); + auto SetLoc = SetBI.getDebugLoc(); + + auto AdjustReg = [&](unsigned Reg) { + auto &OrigRC = *MRI->getRegClass(Reg); + if (&OrigRC == &SetBRC) + return Reg; + + unsigned NewReg; + + int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; + int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; + assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); + assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); + int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, + X86::NoSubRegister, X86::sub_32bit}; + + // If the original size is smaller than the target *and* is smaller than 4 + // bytes, we need to explicitly zero extend it. We always extend to 4-bytes + // to maximize the chance of being able to CSE that operation and to avoid + // partial dependency stalls extending to 2-bytes. + if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { + NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); + BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) + .addReg(Reg); + if (&SetBRC == &X86::GR32RegClass) + return NewReg; + Reg = NewReg; + OrigRegSize = 4; + } + + NewReg = MRI->createVirtualRegister(&SetBRC); + if (OrigRegSize < TargetRegSize) { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), + NewReg) + .addImm(0) + .addReg(Reg) + .addImm(SubRegIdx[OrigRegSize]); + } else if (OrigRegSize > TargetRegSize) { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG), + NewReg) + .addReg(Reg) + .addImm(SubRegIdx[TargetRegSize]); + } else { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) + .addReg(Reg); + } + return NewReg; + }; + + unsigned &CondReg = CondRegs[X86::COND_B]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); + + // Adjust the condition to have the desired register width by zero-extending + // as needed. + // FIXME: We should use a better API to avoid the local reference and using a + // different variable here. + unsigned ExtCondReg = AdjustReg(CondReg); + + // Now we need to turn this into a bitmask. We do this by subtracting it from + // zero. 
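The subtract-from-zero step above is the usual way to widen a 0/1 condition into an all-zeros or all-ones mask, which is what the SETB_C* pseudos model (a sign-extended carry, as the comment above notes). A tiny check of the arithmetic:

#include <cstdio>

int main() {
  for (unsigned cond = 0; cond <= 1; ++cond) {
    unsigned mask = 0u - cond;            // 0 -> 0x00000000, 1 -> 0xFFFFFFFF
    std::printf("cond=%u mask=0x%08X\n", cond, mask);
  }
}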
+ unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); + BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); + ZeroReg = AdjustReg(ZeroReg); + + unsigned Sub; + switch (SetBI.getOpcode()) { + case X86::SETB_C8r: + Sub = X86::SUB8rr; + break; + + case X86::SETB_C16r: + Sub = X86::SUB16rr; + break; + + case X86::SETB_C32r: + Sub = X86::SUB32rr; + break; + + case X86::SETB_C64r: + Sub = X86::SUB64rr; + break; + + default: + llvm_unreachable("Invalid SETB_C* opcode!"); + } + unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); + BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) + .addReg(ZeroReg) + .addReg(ExtCondReg); + return RewriteToReg(ResultReg); +} + +void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, + MachineInstr &SetCCI, + MachineOperand &FlagUse, + CondRegArray &CondRegs) { + X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); + // Note that we can't usefully rewrite this to the inverse without complex + // analysis of the users of the setCC. Largely we rely on duplicates which + // could have been avoided already being avoided here. + unsigned &CondReg = CondRegs[Cond]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + // Rewriting a register def is trivial: we just replace the register and + // remove the setcc. + if (!SetCCI.mayStore()) { + assert(SetCCI.getOperand(0).isReg() && + "Cannot have a non-register defined operand to SETcc!"); + MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg); + SetCCI.eraseFromParent(); + return; + } + + // Otherwise, we need to emit a store. + auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), + SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); + // Copy the address operands. + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.add(SetCCI.getOperand(i)); + + MIB.addReg(CondReg); + + MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end()); + + SetCCI.eraseFromParent(); + return; +} diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 10e19f92b4a63..c1ddb771e2faa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -27781,11 +27781,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); - // Permit reads of the FLAGS register without it being defined. + // Permit reads of the EFLAGS and DF registers without them being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. + assert(Push->getOperand(2).getReg() == X86::EFLAGS && + "Unexpected register in operand!"); Push->getOperand(2).setIsUndef(); + assert(Push->getOperand(3).getReg() == X86::DF && + "Unexpected register in operand!"); + Push->getOperand(3).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. @@ -37829,25 +37834,6 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { } } -/// This function checks if any of the users of EFLAGS copies the EFLAGS. 
We -/// know that the code that lowers COPY of EFLAGS has to use the stack, and if -/// we don't adjust the stack we clobber the first frame index. -/// See X86InstrInfo::copyPhysReg. -static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - return any_of(MRI.reg_instructions(X86::EFLAGS), - [](const MachineInstr &RI) { return RI.isCopy(); }); -} - -void X86TargetLowering::finalizeLowering(MachineFunction &MF) const { - if (hasCopyImplyingStackAdjustment(MF)) { - MachineFrameInfo &MFI = MF.getFrameInfo(); - MFI.setHasCopyImplyingStackAdjustment(true); - } - - TargetLoweringBase::finalizeLowering(MF); -} - /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired promotion /// type by reference. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 3aa9d01bff20c..7820c3e032e55 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1099,9 +1099,6 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - - void finalizeLowering(MachineFunction &MF) const override; - protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index d09deb5b75848..98cc8fb7439e8 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1334,7 +1334,7 @@ let Predicates = [HasBMI2] in { } //===----------------------------------------------------------------------===// -// ADCX Instruction +// ADCX and ADOX Instructions // let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], Constraints = "$src0 = $dst", AddedComplexity = 10 in { @@ -1349,6 +1349,15 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], IIC_BIN_CARRY_NONMEM>, T8PD; + + // We don't have patterns for ADOX yet. + let hasSideEffects = 0 in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src0, GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src0, GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + } // hasSideEffects = 0 } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { @@ -1363,27 +1372,14 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], IIC_BIN_CARRY_MEM>, T8PD; - } -} -//===----------------------------------------------------------------------===// -// ADOX Instruction -// -let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], - Uses = [EFLAGS] in { - let SchedRW = [WriteALU] in { - def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; - - def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; - } // SchedRW - - let mayLoad = 1, SchedRW = [WriteALULd] in { - def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + // We don't have patterns for ADOX yet. 
+ let hasSideEffects = 0 in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src0, i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; - def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src0, i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; } + } // hasSideEffects = 0 } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index d66d9258e96f9..b3371c96cc29d 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -473,7 +473,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], usesCustomInserter = 1, Uses = [ESP, SSP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", @@ -493,7 +493,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], usesCustomInserter = 1, Uses = [RSP, SSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", @@ -509,7 +509,7 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), // For i386, the address of the thunk is passed on the stack, on return the // address of the variable is in %eax. %ecx is trashed during the function // call. All other registers are preserved. -let Defs = [EAX, ECX, EFLAGS], +let Defs = [EAX, ECX, EFLAGS, DF], Uses = [ESP, SSP], usesCustomInserter = 1 in def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), @@ -522,7 +522,7 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // %rdi. The lowering will do the right thing with RDI. // On return the address of the variable is in %rax. All other // registers are preserved. -let Defs = [RAX, EFLAGS], +let Defs = [RAX, EFLAGS, DF], Uses = [RSP, SSP], usesCustomInserter = 1 in def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7ca1c58184f63..11ada51a87046 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -5782,7 +5782,7 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, return false; } -static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { +X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; case X86::JE_1: return X86::COND_E; @@ -5805,7 +5805,7 @@ static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { } /// Return condition code of a SET opcode. 
-static X86::CondCode getCondFromSETOpc(unsigned Opc) { +X86::CondCode X86::getCondFromSETOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; case X86::SETAr: case X86::SETAm: return X86::COND_A; @@ -6130,7 +6130,7 @@ void X86InstrInfo::replaceBranchWithTailCall( if (!I->isBranch()) assert(0 && "Can't find the branch to replace!"); - X86::CondCode CC = getCondFromBranchOpc(I->getOpcode()); + X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode()); assert(BranchCond.size() == 1); if (CC != BranchCond[0].getImm()) continue; @@ -6237,7 +6237,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // Handle conditional branches. - X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode()); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. @@ -6433,7 +6433,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, if (I->isDebugValue()) continue; if (I->getOpcode() != X86::JMP_1 && - getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -6710,102 +6710,12 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - bool FromEFLAGS = SrcReg == X86::EFLAGS; - bool ToEFLAGS = DestReg == X86::EFLAGS; - int Reg = FromEFLAGS ? DestReg : SrcReg; - bool is32 = X86::GR32RegClass.contains(Reg); - bool is64 = X86::GR64RegClass.contains(Reg); - - if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; - int Pop = is64 ? X86::POP64r : X86::POP32r; - int PopF = is64 ? X86::POPF64 : X86::POPF32; - int AX = is64 ? X86::RAX : X86::EAX; - - if (!Subtarget.hasLAHFSAHF()) { - assert(Subtarget.is64Bit() && - "Not having LAHF/SAHF only happens on 64-bit."); - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - usesTheStack. - if (FromEFLAGS) { - BuildMI(MBB, MI, DL, get(PushF)); - BuildMI(MBB, MI, DL, get(Pop), DestReg); - } - if (ToEFLAGS) { - BuildMI(MBB, MI, DL, get(Push)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(PopF)); - } - return; - } - - // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is - // inefficient. Instead: - // - Save the overflow flag OF into AL using SETO, and restore it using a - // signed 8-bit addition of AL and INT8_MAX. - // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH - // using LAHF/SAHF. - // - When RAX/EAX is live and isn't the destination register, make sure it - // isn't clobbered by PUSH/POP'ing it before and after saving/restoring - // the flags. - // This approach is ~2.25x faster than using PUSHF/POPF. - // - // This is still somewhat inefficient because we don't know which flags are - // actually live inside EFLAGS. Were we able to do a single SETcc instead of - // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. - // - // PUSHF/POPF is also potentially incorrect because it affects other flags - // such as TF/IF/DF, which LLVM doesn't model. - // - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. - // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. 
- - const TargetRegisterInfo &TRI = getRegisterInfo(); - MachineBasicBlock::LivenessQueryResult LQR = - MBB.computeRegisterLiveness(&TRI, AX, MI); - // We do not want to save and restore AX if we do not have to. - // Moreover, if we do so whereas AX is dead, we would need to set - // an undef flag on the use of AX, otherwise the verifier will - // complain that we read an undef value. - // We do not want to change the behavior of the machine verifier - // as this is usually wrong to read an undef value. - if (MachineBasicBlock::LQR_Unknown == LQR) { - LivePhysRegs LPR(TRI); - LPR.addLiveOuts(MBB); - MachineBasicBlock::iterator I = MBB.end(); - while (I != MI) { - --I; - LPR.stepBackward(*I); - } - // AX contains the top most register in the aliasing hierarchy. - // It may not be live, but one of its aliases may be. - for (MCRegAliasIterator AI(AX, &TRI, true); - AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) - LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live - : MachineBasicBlock::LQR_Dead; - } - bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR); - if (!AXDead) - BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); - if (FromEFLAGS) { - BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); - BuildMI(MBB, MI, DL, get(X86::LAHF)); - BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); - } - if (ToEFLAGS) { - BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) - .addReg(X86::AL) - .addImm(INT8_MAX); - BuildMI(MBB, MI, DL, get(X86::SAHF)); - } - if (!AXDead) - BuildMI(MBB, MI, DL, get(Pop), AX); - return; + if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) { + // FIXME: We use a fatal error here because historically LLVM has tried + // lower some of these physreg copies and we want to ensure we get + // reasonable bug reports if someone encounters a case no other testing + // found. This path should be removed after the LLVM 7 release. + report_fatal_error("Unable to copy EFLAGS physical register!"); } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -7465,9 +7375,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) - OldCC = getCondFromBranchOpc(Instr.getOpcode()); + OldCC = X86::getCondFromBranchOpc(Instr.getOpcode()); else { - OldCC = getCondFromSETOpc(Instr.getOpcode()); + OldCC = X86::getCondFromSETOpc(Instr.getOpcode()); if (OldCC != X86::COND_INVALID) OpcIsSET = true; else @@ -9413,8 +9323,9 @@ bool X86InstrInfo:: isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { // FIXME: Return false for x87 stack register classes for now. We can't // allow any loads of these registers before FpGet_ST0_80. - return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || - RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); + return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass || + RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass || + RC == &X86::RFP80RegClass); } /// Return a virtual register initialized with the diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 02a09c340cef2..2b5ad934f9b18 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -77,6 +77,12 @@ unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand = false); +// Turn jCC opcode into condition code. 
+CondCode getCondFromBranchOpc(unsigned Opc); + +// Turn setCC opcode into condition code. +CondCode getCondFromSETOpc(unsigned Opc); + // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index a657b19c08c97..68f40c28d5279 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1235,18 +1235,18 @@ let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, SchedRW = [WriteRMW] in { - let Defs = [ESP, EFLAGS], Uses = [ESP] in + let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), [(int_x86_flags_write_u32 GR32:$src)]>, Requires<[Not64BitMode]>; - let Defs = [RSP, EFLAGS], Uses = [RSP] in + let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), [(int_x86_flags_write_u64 GR64:$src)]>, Requires<[In64BitMode]>; } -let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, +let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0, SchedRW = [WriteLoad] in { def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize16; @@ -1254,7 +1254,7 @@ def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, OpSize32, Requires<[Not64BitMode]>; } -let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, +let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0, SchedRW = [WriteStore] in { def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, OpSize16; @@ -1294,10 +1294,10 @@ def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), Requires<[In64BitMode]>; } -let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; -let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in +let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; @@ -1382,8 +1382,7 @@ def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { +let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in { def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), @@ -1394,36 +1393,33 @@ def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; } -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in +let Defs = [EDI], Uses = [AL,EDI,DF] in def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst), "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>; -let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in +let Defs = [EDI], Uses = [AX,EDI,DF] in def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst), "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16; -let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in +let Defs = [EDI], Uses = [EAX,EDI,DF] 
in def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,DF] in def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>; -let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16; -let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32; -let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>; -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in { def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), @@ -2070,8 +2066,7 @@ def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>, } // SchedRW // Repeat string operation instruction prefixes -// These use the DF flag in the EFLAGS register to inc or dec ECX -let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in { +let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in { // Repeat (used with INS, OUTS, MOVS, LODS and STOS) def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; // Repeat while not equal (used with CMPS and SCAS) @@ -2080,24 +2075,22 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; // String manipulation instructions let SchedRW = [WriteMicrocoded] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in +let Defs = [AL,ESI], Uses = [ESI,DF] in def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>; -let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [AX,ESI], Uses = [ESI,DF] in def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16; -let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [EAX,ESI], Uses = [ESI,DF] in def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32; -let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [RAX,ESI], Uses = [ESI,DF] in def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>; } let SchedRW = [WriteSystem] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in { +let Defs = [ESI], Uses = [DX,ESI,DF] in { def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>; def OUTSW : I<0x6F, RawFrmSrc, 
(outs), (ins srcidx16:$src), @@ -2106,8 +2099,7 @@ def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32; } -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in { +let Defs = [EDI], Uses = [DX,EDI,DF] in { def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst), "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>; def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst), @@ -2117,19 +2109,22 @@ def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst), } } -// Flag instructions -let SchedRW = [WriteALU] in { -def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; -def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; -def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; -def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +// EFLAGS management instructions. +let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC_CMC_STC>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_CLC_CMC_STC>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CLC_CMC_STC>; +} + +// DF management instructions. +// FIXME: These are a bit more expensive than CLC and STC. We should consider +// adjusting their schedule bucket. +let SchedRW = [WriteALU], Defs = [DF] in { def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; -def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; - -def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } + // Table lookup instructions let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 40d2dca4f9ec9..576f87b13ab46 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -693,6 +693,19 @@ let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { } // SchedRW //===----------------------------------------------------------------------===// +// TS flag control instruction. +let SchedRW = [WriteSystem] in { +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} + +//===----------------------------------------------------------------------===// +// IF (inside EFLAGS) management instructions. +let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in { +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +} + +//===----------------------------------------------------------------------===// // RDPID Instruction let SchedRW = [WriteSystem] in { def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins), diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 2341e1fb0facf..1a776dcd04ebb 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -251,9 +251,19 @@ def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; // Floating-point status word def FPSW : X86Reg<"fpsw", 0>; -// Status flags register +// Status flags register. +// +// Note that some flags that are commonly thought of as part of the status +// flags register are modeled separately. Typically this is due to instructions +// reading and updating those flags independently of all the others. 
We don't +// want to create false dependencies between these instructions and so we use +// a separate register to model them. def EFLAGS : X86Reg<"flags", 0>; +// The direction flag. +def DF : X86Reg<"DF", 0>; + + // Segment registers def CS : X86Reg<"cs", 1>; def DS : X86Reg<"ds", 3>; @@ -497,6 +507,10 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } +def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} // AVX-512 vector/mask registers. def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 2e21a97541b2c..078d459634cea 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -608,12 +608,10 @@ def IIC_CMPXCHG_8B : InstrItinClass; def IIC_CMPXCHG_16B : InstrItinClass; def IIC_LODS : InstrItinClass; def IIC_OUTS : InstrItinClass; -def IIC_CLC : InstrItinClass; +def IIC_CLC_CMC_STC : InstrItinClass; def IIC_CLD : InstrItinClass; def IIC_CLI : InstrItinClass; -def IIC_CMC : InstrItinClass; def IIC_CLTS : InstrItinClass; -def IIC_STC : InstrItinClass; def IIC_STI : InstrItinClass; def IIC_STD : InstrItinClass; def IIC_XLAT : InstrItinClass; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index e052ad98104cf..460b9823a7e7b 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -514,12 +514,10 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, - InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLC_CMC_STC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, - InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, - InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index ac242e1c00e04..e41e16d82d83b 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -62,6 +62,7 @@ void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); +void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); } // end namespace llvm @@ -80,6 +81,7 @@ extern "C" void LLVMInitializeX86Target() { initializeX86CmovConverterPassPass(PR); initializeX86ExecutionDepsFixPass(PR); initializeX86DomainReassignmentPass(PR); + initializeX86FlagsCopyLoweringPassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -415,6 +417,7 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86CallFrameOptimization()); } + addPass(createX86FlagsCopyLoweringPass()); addPass(createX86WinAllocaExpander()); } void X86PassConfig::addMachineSSAOptimization() { diff --git 
a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index b25cbcad3b9d0..76c4a8fbc16e9 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -847,10 +847,20 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr; + // Can't change signature of musttail callee + if (CS.isMustTailCall()) + return nullptr; + if (CS.getInstruction()->getParent()->getParent() == F) isSelfRecursive = true; } + // Can't change signature of musttail caller + // FIXME: Support promoting whole chain of musttail functions + for (BasicBlock &BB : *F) + if (BB.getTerminatingMustTailCall()) + return nullptr; + const DataLayout &DL = F->getParent()->getDataLayout(); AAResults &AAR = AARGetter(*F); diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 5446541550e54..b2afa6f2c9cda 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -507,14 +507,28 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // MaybeLive. Initialized to a list of RetCount empty lists. RetUses MaybeLiveRetUses(RetCount); - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) + bool HasMustTailCalls = false; + + for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType() != F.getFunctionType()->getReturnType()) { // We don't support old style multiple return values. MarkLive(F); return; } + } + + // If we have any returns of `musttail` results - the signature can't + // change + if (BB->getTerminatingMustTailCall() != nullptr) + HasMustTailCalls = true; + } + + if (HasMustTailCalls) { + DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName() + << " has musttail calls\n"); + } if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) { MarkLive(F); @@ -526,6 +540,9 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // Keep track of the number of live retvals, so we can skip checks once all // of them turn out to be live. unsigned NumLiveRetVals = 0; + + bool HasMustTailCallers = false; + // Loop all uses of the function. for (const Use &U : F.uses()) { // If the function is PASSED IN as an argument, its address has been @@ -536,6 +553,11 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { return; } + // The number of arguments for `musttail` call must match the number of + // arguments of the caller + if (CS.isMustTailCall()) + HasMustTailCallers = true; + // If this use is anything other than a call site, the function is alive. const Instruction *TheCall = CS.getInstruction(); if (!TheCall) { // Not a direct call site? @@ -580,6 +602,11 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { } } + if (HasMustTailCallers) { + DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName() + << " has musttail callers\n"); + } + // Now we've inspected all callers, record the liveness of our return values. 
for (unsigned i = 0; i != RetCount; ++i) MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]); @@ -593,12 +620,19 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; ++AI, ++i) { Liveness Result; - if (F.getFunctionType()->isVarArg()) { + if (F.getFunctionType()->isVarArg() || HasMustTailCallers || + HasMustTailCalls) { // Variadic functions will already have a va_arg function expanded inside // them, making them potentially very sensitive to ABI changes resulting // from removing arguments entirely, so don't. For example AArch64 handles // register and stack HFAs very differently, and this is reflected in the // IR which has already been generated. + // + // `musttail` calls to this function restrict argument removal attempts. + // The signature of the caller must match the signature of the function. + // + // `musttail` calls in this function prevents us from changing its + // signature Result = Live; } else { // See what the effect of this use is (recording any uses that cause diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 4bb2984e3b473..e0bbf45d316aa 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2099,8 +2099,31 @@ static void RemoveNestAttribute(Function *F) { /// GHC, or anyregcc. static bool isProfitableToMakeFastCC(Function *F) { CallingConv::ID CC = F->getCallingConv(); + // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc? - return CC == CallingConv::C || CC == CallingConv::X86_ThisCall; + if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall) + return false; + + // FIXME: Change CC for the whole chain of musttail calls when possible. + // + // Can't change CC of the function that either has musttail calls, or is a + // musttail callee itself + for (User *U : F->users()) { + if (isa<BlockAddress>(U)) + continue; + CallInst* CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + if (CI->isMustTailCall()) + return false; + } + + for (BasicBlock &BB : *F) + if (BB.getTerminatingMustTailCall()) + return false; + + return true; } static bool diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 76b90391fbb1b..8886af90ba65a 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -638,6 +638,19 @@ void MergeFunctions::filterInstsUnrelatedToPDI( DEBUG(dbgs() << " }\n"); } +// Don't merge tiny functions using a thunk, since it can just end up +// making the function larger. +static bool isThunkProfitable(Function * F) { + if (F->size() == 1) { + if (F->front().size() <= 2) { + DEBUG(dbgs() << "isThunkProfitable: " << F->getName() + << " is too small to bother creating a thunk for\n"); + return false; + } + } + return true; +} + // Replace G with a simple tail call to bitcast(F). Also (unless // MergeFunctionsPDI holds) replace direct uses of G with bitcast(F), // delete G. Under MergeFunctionsPDI, we use G itself for creating @@ -647,39 +660,6 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // For better debugability, under MergeFunctionsPDI, we do not modify G's // call sites to point to F even when within the same translation unit. 
void MergeFunctions::writeThunk(Function *F, Function *G) { - if (!G->isInterposable() && !MergeFunctionsPDI) { - if (G->hasGlobalUnnamedAddr()) { - // G might have been a key in our GlobalNumberState, and it's illegal - // to replace a key in ValueMap<GlobalValue *> with a non-global. - GlobalNumbers.erase(G); - // If G's address is not significant, replace it entirely. - Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); - G->replaceAllUsesWith(BitcastF); - } else { - // Redirect direct callers of G to F. (See note on MergeFunctionsPDI - // above). - replaceDirectCallers(G, F); - } - } - - // If G was internal then we may have replaced all uses of G with F. If so, - // stop here and delete G. There's no need for a thunk. (See note on - // MergeFunctionsPDI above). - if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) { - G->eraseFromParent(); - return; - } - - // Don't merge tiny functions using a thunk, since it can just end up - // making the function larger. - if (F->size() == 1) { - if (F->front().size() <= 2) { - DEBUG(dbgs() << "writeThunk: " << F->getName() - << " is too small to bother creating a thunk for\n"); - return; - } - } - BasicBlock *GEntryBlock = nullptr; std::vector<Instruction *> PDIUnrelatedWL; BasicBlock *BB = nullptr; @@ -754,6 +734,10 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { if (F->isInterposable()) { assert(G->isInterposable()); + if (!isThunkProfitable(F)) { + return; + } + // Make them both thunks to the same internal function. Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", F->getParent()); @@ -770,11 +754,41 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { F->setAlignment(MaxAlignment); F->setLinkage(GlobalValue::PrivateLinkage); ++NumDoubleWeak; + ++NumFunctionsMerged; } else { + // For better debugability, under MergeFunctionsPDI, we do not modify G's + // call sites to point to F even when within the same translation unit. + if (!G->isInterposable() && !MergeFunctionsPDI) { + if (G->hasGlobalUnnamedAddr()) { + // G might have been a key in our GlobalNumberState, and it's illegal + // to replace a key in ValueMap<GlobalValue *> with a non-global. + GlobalNumbers.erase(G); + // If G's address is not significant, replace it entirely. + Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); + G->replaceAllUsesWith(BitcastF); + } else { + // Redirect direct callers of G to F. (See note on MergeFunctionsPDI + // above). + replaceDirectCallers(G, F); + } + } + + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk. (See note on + // MergeFunctionsPDI above). + if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) { + G->eraseFromParent(); + ++NumFunctionsMerged; + return; + } + + if (!isThunkProfitable(F)) { + return; + } + writeThunk(F, G); + ++NumFunctionsMerged; } - - ++NumFunctionsMerged; } /// Replace function F by function G. 
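A note for readers following the MergeFunctions change above: the thunk that writeThunk emits is essentially a single forwarding call to the surviving copy, so merging only pays off when the replaced body is larger than that call. Below is a rough C++ analogy of what the new isThunkProfitable() check guards against; F and G are hypothetical functions for illustration, not output of the pass itself.

// Hypothetical sketch: after merging, G would be reduced to a thunk that
// simply forwards to the kept copy F. If G's original body is only an
// instruction or two, the thunk is no smaller than the body it replaces,
// so isThunkProfitable() declines to merge.
static int F(int x) { return x + 1; } // the copy that is kept
static int G(int x) { return F(x); }  // the thunk: no smaller than "x + 1"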
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index b332e75c7feb2..8fa7d0684b94d 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm-c/Initialization.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -1946,13 +1947,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // addrspacecast between types is canonicalized as a bitcast, then an // addrspacecast. To take advantage of the below bitcast + struct GEP, look // through the addrspacecast. + Value *ASCStrippedPtrOp = PtrOp; if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) { // X = bitcast A addrspace(1)* to B addrspace(1)* // Y = addrspacecast A addrspace(1)* to B addrspace(2)* // Z = gep Y, <...constant indices...> // Into an addrspacecasted GEP of the struct. if (BitCastInst *BC = dyn_cast<BitCastInst>(ASC->getOperand(0))) - PtrOp = BC; + ASCStrippedPtrOp = BC; } /// See if we can simplify: @@ -1960,7 +1962,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { /// Y = gep X, <...constant indices...> /// into a gep of the original struct. This is important for SROA and alias /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. - if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) { Value *Operand = BCI->getOperand(0); PointerType *OpType = cast<PointerType>(Operand->getType()); unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType()); diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 4edea7cc3c825..7488cd5af8bed 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -201,6 +201,46 @@ static bool canSplitCallSite(CallSite CS) { return CallSiteBB->canSplitPredecessors(); } +static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before, + Value *V) { + Instruction *Copy = I->clone(); + Copy->setName(I->getName()); + Copy->insertBefore(Before); + if (V) + Copy->setOperand(0, V); + return Copy; +} + +/// Copy mandatory `musttail` return sequence that follows original `CI`, and +/// link it up to `NewCI` value instead: +/// +/// * (optional) `bitcast NewCI to ...` +/// * `ret bitcast or NewCI` +/// +/// Insert this sequence right before `SplitBB`'s terminator, which will be +/// cleaned up later in `splitCallSite` below. +static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI, + Instruction *NewCI) { + bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy(); + auto II = std::next(CI->getIterator()); + + BitCastInst *BCI = dyn_cast<BitCastInst>(&*II); + if (BCI) + ++II; + + ReturnInst *RI = dyn_cast<ReturnInst>(&*II); + assert(RI && "`musttail` call must be followed by `ret` instruction"); + + TerminatorInst *TI = SplitBB->getTerminator(); + Value *V = NewCI; + if (BCI) + V = cloneInstForMustTail(BCI, TI, V); + cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V); + + // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug + // that prevents doing this now. 
+} + /// Return true if the CS is split into its new predecessors which are directly /// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. /// CallInst1 and CallInst2 will be the new call-sites placed in the new @@ -245,6 +285,7 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, Instruction *CallInst1, Instruction *CallInst2) { Instruction *Instr = CS.getInstruction(); BasicBlock *TailBB = Instr->getParent(); + bool IsMustTailCall = CS.isMustTailCall(); assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site"); BasicBlock *SplitBlock1 = @@ -276,9 +317,14 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, ++ArgNo; } } + // Clone and place bitcast and return instructions before `TI` + if (IsMustTailCall) { + copyMustTailReturn(SplitBlock1, CS.getInstruction(), CallInst1); + copyMustTailReturn(SplitBlock2, CS.getInstruction(), CallInst2); + } // Replace users of the original call with a PHI mering call-sites split. - if (Instr->getNumUses()) { + if (!IsMustTailCall && Instr->getNumUses()) { PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", TailBB->getFirstNonPHI()); PN->addIncoming(CallInst1, SplitBlock1); @@ -290,8 +336,25 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, << "\n"); DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName() << "\n"); - Instr->eraseFromParent(); + NumCallSiteSplit++; + + // FIXME: remove TI in `copyMustTailReturn` + if (IsMustTailCall) { + // Remove superfluous `br` terminators from the end of the Split blocks + // NOTE: Removing terminator removes the SplitBlock from the TailBB's + // predecessors. Therefore we must get complete list of Splits before + // attempting removal. + SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB))); + assert(Splits.size() == 2 && "Expected exactly 2 splits!"); + for (unsigned i = 0; i < Splits.size(); i++) + Splits[i]->getTerminator()->eraseFromParent(); + + // Erase the tail block once done with musttail patching + TailBB->eraseFromParent(); + return; + } + Instr->eraseFromParent(); } // Return true if the call-site has an argument which is a PHI with only @@ -369,7 +432,17 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) { Function *Callee = CS.getCalledFunction(); if (!Callee || Callee->isDeclaration()) continue; + + // Successful musttail call-site splits result in erased CI and erased BB. + // Check if such path is possible before attempting the splitting. + bool IsMustTail = CS.isMustTailCall(); + Changed |= tryToSplitCallSite(CS); + + // There're no interesting instructions after this. The call site + // itself might have been erased on splitting. 
+ if (IsMustTail) + break; } } return Changed; } diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp index e383af89a3845..e1bc590c5c9ae 100644 --- a/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/lib/Transforms/Scalar/DivRemPairs.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DivRemPairs.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -48,7 +50,10 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // Insert all divide and remainder instructions into maps keyed by their // operands and opcode (signed or unsigned). - DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap; + DenseMap<DivRemMapKey, Instruction *> DivMap; + // Use a MapVector for RemMap so that instructions are moved/inserted in a + // deterministic order. + MapVector<DivRemMapKey, Instruction *> RemMap; for (auto &BB : F) { for (auto &I : BB) { if (I.getOpcode() == Instruction::SDiv) @@ -67,14 +72,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // rare than division. for (auto &RemPair : RemMap) { // Find the matching division instruction from the division map. - Instruction *DivInst = DivMap[RemPair.getFirst()]; + Instruction *DivInst = DivMap[RemPair.first]; if (!DivInst) continue; // We have a matching pair of div/rem instructions. If one dominates the // other, hoist and/or replace one. NumPairs++; - Instruction *RemInst = RemPair.getSecond(); + Instruction *RemInst = RemPair.second; bool IsSigned = DivInst->getOpcode() == Instruction::SDiv; bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 141c9938bf8be..2f1645433fb87 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -1454,6 +1454,9 @@ FindMostPopularDest(BasicBlock *BB, if (PredToDest.second) DestPopularity[PredToDest.second]++; + if (DestPopularity.empty()) + return nullptr; + // Find the most popular dest. DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); BasicBlock *MostPopularDest = DPI->first; @@ -1629,8 +1632,20 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // threadable destination (the common case) we can avoid this. BasicBlock *MostPopularDest = OnlyDest; - if (MostPopularDest == MultipleDestSentinel) + if (MostPopularDest == MultipleDestSentinel) { + // Remove any loop headers from the Dest list; ThreadEdge conservatively + // won't process them, but we might have other destinations that are eligible + // and that we still want to process. + erase_if(PredToDestList, + [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) { + return LoopHeaders.count(PredToDest.second) != 0; + }); + + if (PredToDestList.empty()) + return false; + MostPopularDest = FindMostPopularDest(BB, PredToDestList); + } // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 9dc550ceaeca7..3e12649ddedca 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -223,6 +223,10 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// represented here for efficient lookup. 
SmallPtrSet<Function *, 16> MRVFunctionsTracked; + /// MustTailFunctions - Each function here is a callee of non-removable + /// musttail call site. + SmallPtrSet<Function *, 16> MustTailCallees; + /// TrackingIncomingArguments - This is the set of functions for whose /// arguments we make optimistic assumptions about and try to prove as /// constants. @@ -289,6 +293,18 @@ public: TrackedRetVals.insert(std::make_pair(F, LatticeVal())); } + /// AddMustTailCallee - If the SCCP solver finds that this function is called + /// from non-removable musttail call site. + void AddMustTailCallee(Function *F) { + MustTailCallees.insert(F); + } + + /// Returns true if the given function is called from non-removable musttail + /// call site. + bool isMustTailCallee(Function *F) { + return MustTailCallees.count(F); + } + void AddArgumentTrackedFunction(Function *F) { TrackingIncomingArguments.insert(F); } @@ -358,6 +374,12 @@ public: return MRVFunctionsTracked; } + /// getMustTailCallees - Get the set of functions which are called + /// from non-removable musttail call sites. + const SmallPtrSet<Function *, 16> getMustTailCallees() { + return MustTailCallees; + } + /// markOverdefined - Mark the specified value overdefined. This /// works with both scalars and structs. void markOverdefined(Value *V) { @@ -1672,6 +1694,23 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); } assert(Const && "Constant is nullptr here!"); + + // Replacing `musttail` instructions with constant breaks `musttail` invariant + // unless the call itself can be removed + CallInst *CI = dyn_cast<CallInst>(V); + if (CI && CI->isMustTailCall() && !isInstructionTriviallyDead(CI)) { + CallSite CS(CI); + Function *F = CS.getCalledFunction(); + + // Don't zap returns of the callee + if (F) + Solver.AddMustTailCallee(F); + + DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI + << " as a constant\n"); + return false; + } + DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); // Replaces all of the uses of a variable with uses of the constant. @@ -1802,10 +1841,26 @@ static void findReturnsToZap(Function &F, if (!Solver.isArgumentTrackedFunction(&F)) return; - for (BasicBlock &BB : F) + // There is a non-removable musttail call site of this function. Zapping + // returns is not allowed. + if (Solver.isMustTailCallee(&F)) { + DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName() + << " due to present musttail call of it\n"); + return; + } + + for (BasicBlock &BB : F) { + if (CallInst *CI = BB.getTerminatingMustTailCall()) { + DEBUG(dbgs() << "Can't zap return of the block due to present " + << "musttail call : " << *CI << "\n"); + (void)CI; + return; + } + if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) if (!isa<UndefValue>(RI->getOperand(0))) ReturnsToZap.push_back(RI); + } } static bool runIPSCCP(Module &M, const DataLayout &DL, diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index bddcbd86e914d..75539428b688e 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -710,7 +710,7 @@ int FunctionComparator::cmpInlineAsm(const InlineAsm *L, return Res; if (int Res = cmpNumbers(L->getDialect(), R->getDialect())) return Res; - llvm_unreachable("InlineAsm blocks were not uniqued."); + assert(L->getFunctionType() != R->getFunctionType()); return 0; } |
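The SCCP changes above lean on the IR contract for musttail: the call must be immediately followed by an optional bitcast and then a ret (returning the call's result when the function is non-void), so folding the call's result to a constant or zapping the enclosing return would break verification. A minimal sketch of that shape check follows, assuming a hypothetical standalone helper; feedsMustTailReturn is not part of the patch, while the LLVM calls it uses (isMustTailCall, getNextNode, isa) are the stock API.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Minimal sketch, under the assumptions stated above: returns true when CI is
// a musttail call sitting in its required position, immediately followed by an
// optional bitcast and then the ret. A pass that rewrites CI's result or the
// enclosing return would break this form.
static bool feedsMustTailReturn(const CallInst *CI) {
  if (!CI->isMustTailCall())
    return false;
  // Well-formed IR guarantees a following instruction; be defensive anyway.
  const Instruction *Next = CI->getNextNode();
  if (Next && isa<BitCastInst>(Next))
    Next = Next->getNextNode();
  return Next && isa<ReturnInst>(Next);
}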