12 files changed, 469 insertions, 98 deletions
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 314c990293cc5..7e75e88477852 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -46,13 +46,21 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
   if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
 
   bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
-  
+  Module *TheModule = F ? F->getParent() : nullptr;
+
   // Loop over all instructions, and copy them over.
   for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
        II != IE; ++II) {
 
-    if (DIFinder && F->getParent() && II->getDebugLoc())
-      DIFinder->processLocation(*F->getParent(), II->getDebugLoc().get());
+    if (DIFinder && TheModule) {
+      if (auto *DDI = dyn_cast<DbgDeclareInst>(II))
+        DIFinder->processDeclare(*TheModule, DDI);
+      else if (auto *DVI = dyn_cast<DbgValueInst>(II))
+        DIFinder->processValue(*TheModule, DVI);
+
+      if (auto DbgLoc = II->getDebugLoc())
+        DIFinder->processLocation(*TheModule, DbgLoc.get());
+    }
 
     Instruction *NewInst = II->clone();
     if (II->hasName())
@@ -153,6 +161,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
   // When we remap instructions, we want to avoid duplicating inlined
   // DISubprograms, so record all subprograms we find as we duplicate
   // instructions and then freeze them in the MD map.
+  // We also record information about dbg.value and dbg.declare to avoid
+  // duplicating the types.
   DebugInfoFinder DIFinder;
 
   // Loop over all of the basic blocks in the function, cloning them as
@@ -193,6 +203,10 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     }
   }
 
+  for (auto *Type : DIFinder.types()) {
+    VMap.MD()[Type].reset(Type);
+  }
+
   // Loop over all of the instructions in the function, fixing up operand
   // references as we go.  This uses VMap to do all the hard work.
   for (Function::iterator BB =
diff --git a/lib/Transforms/Utils/CmpInstAnalysis.cpp b/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 9f4d9c7e39810..d9294c4993091 100644
--- a/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -81,7 +81,7 @@ bool llvm::decomposeBitTestICmp(const ICmpInst *I, CmpInst::Predicate &Pred,
     break;
   case ICmpInst::ICMP_SGT:
     // X > -1 is equivalent to (X & SignMask) == 0.
-    if (!C->isAllOnesValue())
+    if (!C->isMinusOne())
       return false;
     Y = ConstantInt::get(I->getContext(), APInt::getSignMask(C->getBitWidth()));
     Pred = ICmpInst::ICMP_EQ;
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 30d8856cfbef1..1189714dfab10 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1116,12 +1116,6 @@ Function *CodeExtractor::extractCodeRegion() {
         }
     }
 
-  //cerr << "NEW FUNCTION: " << *newFunction;
-  //  verifyFunction(*newFunction);
-
-  //  cerr << "OLD FUNCTION: " << *oldFunction;
-  //  verifyFunction(*oldFunction);
-
   DEBUG(if (verifyFunction(*newFunction)) 
         report_fatal_error("verifyFunction failed!"));
   return newFunction;
diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp
index c97e544e620a9..1328f2f3ec012 100644
--- a/lib/Transforms/Utils/Evaluator.cpp
+++ b/lib/Transforms/Utils/Evaluator.cpp
@@ -402,7 +402,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
           Value *Ptr = PtrArg->stripPointerCasts();
           if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
             Type *ElemTy = GV->getValueType();
-            if (!Size->isAllOnesValue() &&
+            if (!Size->isMinusOne() &&
                 Size->getValue().getLimitedValue() >=
                     DL.getTypeStoreSize(ElemTy)) {
               Invariants.insert(GV);
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index 0457294361b56..4a2be3a531767 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -513,8 +513,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
     if (int Res =
             cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
       return Res;
-    if (int Res =
-            cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
+    if (int Res = cmpNumbers(LI->getSyncScopeID(),
+                             cast<LoadInst>(R)->getSyncScopeID()))
       return Res;
     return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
         cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
@@ -529,7 +529,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
     if (int Res =
             cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
       return Res;
-    return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope());
+    return cmpNumbers(SI->getSyncScopeID(),
+                      cast<StoreInst>(R)->getSyncScopeID());
   }
   if (const CmpInst *CI = dyn_cast<CmpInst>(L))
     return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
@@ -584,7 +585,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
     if (int Res =
             cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
       return Res;
-    return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope());
+    return cmpNumbers(FI->getSyncScopeID(),
+                      cast<FenceInst>(R)->getSyncScopeID());
   }
   if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
     if (int Res = cmpNumbers(CXI->isVolatile(),
@@ -601,8 +603,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
             cmpOrderings(CXI->getFailureOrdering(),
                          cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
       return Res;
-    return cmpNumbers(CXI->getSynchScope(),
-                      cast<AtomicCmpXchgInst>(R)->getSynchScope());
+    return cmpNumbers(CXI->getSyncScopeID(),
+                      cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
   }
   if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
     if (int Res = cmpNumbers(RMWI->getOperation(),
@@ -614,8 +616,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
     if (int Res = cmpOrderings(RMWI->getOrdering(),
                              cast<AtomicRMWInst>(R)->getOrdering()))
       return Res;
-    return cmpNumbers(RMWI->getSynchScope(),
-                      cast<AtomicRMWInst>(R)->getSynchScope());
+    return cmpNumbers(RMWI->getSyncScopeID(),
+                      cast<AtomicRMWInst>(R)->getSyncScopeID());
   }
   if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
     const PHINode *PNR = cast<PHINode>(R);
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 5127eba3f9aea..74610613001c6 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1662,9 +1662,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {
   TI->eraseFromParent();
 }
 
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
 /// if they are in a dead cycle.  Return true if a change was made, false
-/// otherwise.
+/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo
+/// after modifying the CFG.
 bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
   SmallPtrSet<BasicBlock*, 16> Reachable;
   bool Changed = markAliveBlocks(F, Reachable);
@@ -2168,6 +2169,9 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
     return true;
   case Instruction::Call:
   case Instruction::Invoke:
+    // Can't handle inline asm. Skip it.
+    if (isa<InlineAsm>(ImmutableCallSite(I).getCalledValue()))
+      return false;
     // Many arithmetic intrinsics have no issue taking a
     // variable, however it's hard to distingish these from
     // specials such as @llvm.frameaddress that require a constant.
@@ -2182,12 +2186,18 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
   case Instruction::ShuffleVector:
     // Shufflevector masks are constant.
     return OpIdx != 2;
+  case Instruction::Switch:
   case Instruction::ExtractValue:
-  case Instruction::InsertValue:
     // All operands apart from the first are constant.
     return OpIdx == 0;
+  case Instruction::InsertValue:
+    // All operands apart from the first and the second are constant.
+    return OpIdx < 2;
   case Instruction::Alloca:
-    return false;
+    // Static allocas (constant size in the entry block) are handled by
+    // prologue/epilogue insertion so they're free anyway. We definitely don't
+    // want to make them non-constant.
+    return !dyn_cast<AllocaInst>(I)->isStaticAlloca();
   case Instruction::GetElementPtr:
     if (OpIdx == 0)
       return true;
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 9ad2b707e6b23..5170c68e2915a 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -65,9 +65,11 @@ static cl::opt<bool> UnrollRuntimeMultiExit(
 ///   than the unroll factor.
 ///
 static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
-                          BasicBlock *PrologExit, BasicBlock *PreHeader,
-                          BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
-                          DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
+                          BasicBlock *PrologExit,
+                          BasicBlock *OriginalLoopLatchExit,
+                          BasicBlock *PreHeader, BasicBlock *NewPreHeader,
+                          ValueToValueMapTy &VMap, DominatorTree *DT,
+                          LoopInfo *LI, bool PreserveLCSSA) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
   BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
@@ -142,17 +144,15 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
   // then (BECount + 1) cannot unsigned-overflow.
   Value *BrLoopExit =
       B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
-  BasicBlock *Exit = L->getUniqueExitBlock();
-  assert(Exit && "Loop must have a single exit block only");
   // Split the exit to maintain loop canonicalization guarantees
-  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
-  SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
+  SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
+  SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
                          PreserveLCSSA);
   // Add the branch to the exit block (around the unrolled loop)
-  B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
+  B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
   InsertPt->eraseFromParent();
   if (DT)
-    DT->changeImmediateDominator(Exit, PrologExit);
+    DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
 }
 
 /// Connect the unrolling epilog code to the original loop.
@@ -427,6 +427,50 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
     return nullptr;
 }
 
+/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
+/// is populated with all the loop exit blocks other than the LatchExit block.
+static bool
+canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
+                             BasicBlock *LatchExit, bool PreserveLCSSA,
+                             bool UseEpilogRemainder) {
+
+  // Support runtime unrolling for multiple exit blocks and multiple exiting
+  // blocks.
+  if (!UnrollRuntimeMultiExit)
+    return false;
+  // Even if runtime multi exit is enabled, we currently have some correctness
+  // constrains in unrolling a multi-exit loop.
+  // We rely on LCSSA form being preserved when the exit blocks are transformed.
+  if (!PreserveLCSSA)
+    return false;
+  SmallVector<BasicBlock *, 4> Exits;
+  L->getUniqueExitBlocks(Exits);
+  for (auto *BB : Exits)
+    if (BB != LatchExit)
+      OtherExits.push_back(BB);
+
+  // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
+  // UnrollRuntimeMultiExit is true. This will need updating the logic in
+  // connectEpilog/connectProlog.
+  if (!LatchExit->getSinglePredecessor()) {
+    DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+                    "predecessor.\n");
+    return false;
+  }
+  // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
+  // and L is an inner loop. This is because in presence of multiple exits, the
+  // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
+  // outer loop. This is automatically handled in the prolog case, so we do not
+  // have that bug in prolog generation.
+  if (UseEpilogRemainder && L->getParentLoop())
+    return false;
+
+  // All constraints have been satisfied.
+  return true;
+}
+
+
+
 /// Insert code in the prolog/epilog code when unrolling a loop with a
 /// run-time trip-count.
 ///
@@ -470,53 +514,40 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                       bool UseEpilogRemainder,
                                       LoopInfo *LI, ScalarEvolution *SE,
                                       DominatorTree *DT, bool PreserveLCSSA) {
-  // for now, only unroll loops that contain a single exit
-  if (!UnrollRuntimeMultiExit && !L->getExitingBlock())
-    return false;
+  DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+  DEBUG(L->dump());
 
   // Make sure the loop is in canonical form.
-  if (!L->isLoopSimplifyForm())
+  if (!L->isLoopSimplifyForm()) {
+    DEBUG(dbgs() << "Not in simplify form!\n");
     return false;
+  }
 
   // Guaranteed by LoopSimplifyForm.
   BasicBlock *Latch = L->getLoopLatch();
   BasicBlock *Header = L->getHeader();
 
-  BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop
-  if (!LatchExit && !UnrollRuntimeMultiExit)
-    return false;
-  // These are exit blocks other than the target of the latch exiting block.
-  SmallVector<BasicBlock *, 4> OtherExits;
   BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
-  unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+  unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+  BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
   // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
   // targets of the Latch be an exit block out of the loop. This needs
   // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
-  assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) &&
+  assert(!L->contains(LatchExit) &&
          "one of the loop latch successors should be the exit block!");
-  // Support runtime unrolling for multiple exit blocks and multiple exiting
-  // blocks.
-  if (!LatchExit) {
-    assert(UseEpilogRemainder && "Multi exit unrolling is currently supported "
-                                 "unrolling with epilog remainder only!");
-    LatchExit = LatchBR->getSuccessor(ExitIndex);
-    // We rely on LCSSA form being preserved when the exit blocks are
-    // transformed.
-    if (!PreserveLCSSA)
-      return false;
-    // TODO: Support multiple exiting blocks jumping to the `LatchExit`. This
-    // will need updating the logic in connectEpilog.
-    if (!LatchExit->getSinglePredecessor())
-        return false;
-    SmallVector<BasicBlock *, 4> Exits;
-    L->getUniqueExitBlocks(Exits);
-    for (auto *BB : Exits)
-      if (BB != LatchExit)
-        OtherExits.push_back(BB);
+  // These are exit blocks other than the target of the latch exiting block.
+  SmallVector<BasicBlock *, 4> OtherExits;
+  bool isMultiExitUnrollingEnabled = canSafelyUnrollMultiExitLoop(
+      L, OtherExits, LatchExit, PreserveLCSSA, UseEpilogRemainder);
+  // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
+  if (!isMultiExitUnrollingEnabled &&
+      (!L->getExitingBlock() || OtherExits.size())) {
+    DEBUG(
+        dbgs()
+        << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+           "enabled!\n");
+    return false;
   }
-
-  assert(LatchExit && "Latch Exit should exist!");
-
   // Use Scalar Evolution to compute the trip count. This allows more loops to
   // be unrolled than relying on induction var simplification.
   if (!SE)
@@ -530,29 +561,38 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   // exiting blocks).
   const SCEV *BECountSC = SE->getExitCount(L, Latch);
   if (isa<SCEVCouldNotCompute>(BECountSC) ||
-      !BECountSC->getType()->isIntegerTy())
+      !BECountSC->getType()->isIntegerTy()) {
+    DEBUG(dbgs() << "Could not compute exit block SCEV\n");
     return false;
+  }
 
   unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
 
   // Add 1 since the backedge count doesn't include the first loop iteration.
   const SCEV *TripCountSC =
       SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
-  if (isa<SCEVCouldNotCompute>(TripCountSC))
+  if (isa<SCEVCouldNotCompute>(TripCountSC)) {
+    DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
     return false;
+  }
 
   BasicBlock *PreHeader = L->getLoopPreheader();
   BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
   const DataLayout &DL = Header->getModule()->getDataLayout();
   SCEVExpander Expander(*SE, DL, "loop-unroll");
   if (!AllowExpensiveTripCount &&
-      Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR))
+      Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
+    DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
     return false;
+  }
 
   // This constraint lets us deal with an overflowing trip count easily; see the
   // comment on ModVal below.
-  if (Log2_32(Count) > BEWidth)
+  if (Log2_32(Count) > BEWidth) {
+    DEBUG(dbgs()
+          << "Count failed constraint on overflow trip count calculation.\n");
     return false;
+  }
 
   // Loop structure is the following:
   //
@@ -711,11 +751,10 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
      // node.
      for (unsigned i =0; i < oldNumOperands; i++){
        Value *newVal = VMap[Phi->getIncomingValue(i)];
-       if (!newVal) {
-         assert(isa<Constant>(Phi->getIncomingValue(i)) &&
-                "VMap should exist for all values except constants!");
+       // newVal can be a constant or derived from values outside the loop, and
+       // hence need not have a VMap value.
+       if (!newVal)
          newVal = Phi->getIncomingValue(i);
-       }
        Phi->addIncoming(newVal,
                            cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
      }
@@ -781,8 +820,8 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   } else {
     // Connect the prolog code to the original loop and update the
     // PHI functions.
-    ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
-                  VMap, DT, LI, PreserveLCSSA);
+    ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
+                  NewPreHeader, VMap, DT, LI, PreserveLCSSA);
   }
 
   // If this loop is nested, then the loop unroller changes the code in the
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 1c2a60a6b8b24..900450b400612 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -8,12 +8,256 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 
+static unsigned getLoopOperandSizeInBytes(Type *Type) {
+  if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
+    return VTy->getBitWidth() / 8;
+  }
+
+  return Type->getPrimitiveSizeInBits() / 8;
+}
+
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+                                     Value *DstAddr, ConstantInt *CopyLen,
+                                     unsigned SrcAlign, unsigned DestAlign,
+                                     bool SrcIsVolatile, bool DstIsVolatile,
+                                     const TargetTransformInfo &TTI) {
+  // No need to expand zero length copies.
+  if (CopyLen->isZero())
+    return;
+
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  BasicBlock *PostLoopBB = nullptr;
+  Function *ParentFunc = PreLoopBB->getParent();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+  Type *LoopOpType =
+      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+
+  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  if (LoopEndCount != 0) {
+    // Split
+    PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+    BasicBlock *LoopBB =
+        BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+    PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+    IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+    // Cast the Src and Dst pointers to pointers to the loop operand type (if
+    // needed).
+    PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+    PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+    if (SrcAddr->getType() != SrcOpType) {
+      SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+    }
+    if (DstAddr->getType() != DstOpType) {
+      DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+    }
+
+    IRBuilder<> LoopBuilder(LoopBB);
+    PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+    LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+    // Loop Body
+    Value *SrcGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+    Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+    Value *DstGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+    LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+    Value *NewIndex =
+        LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+    LoopIndex->addIncoming(NewIndex, LoopBB);
+
+    // Create the loop branch condition.
+    Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+                             LoopBB, PostLoopBB);
+  }
+
+  uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+  if (RemainingBytes) {
+    IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+                                    : InsertBefore);
+
+    // Update the alignment based on the copy size used in the loop body.
+    SrcAlign = std::min(SrcAlign, LoopOpSize);
+    DestAlign = std::min(DestAlign, LoopOpSize);
+
+    SmallVector<Type *, 5> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAlign, DestAlign);
+
+    for (auto OpTy : RemainingOps) {
+      // Calaculate the new index
+      unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+      uint64_t GepIndex = BytesCopied / OperandSize;
+      assert(GepIndex * OperandSize == BytesCopied &&
+             "Division should have no Remainder!");
+      // Cast source to operand type and load
+      PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+      Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+                             ? SrcAddr
+                             : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+      Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+          OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+      Value *Load = RBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+
+      // Cast destination to operand type and store.
+      PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+      Value *CastedDst = DstAddr->getType() == DstPtrType
+                             ? DstAddr
+                             : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+      Value *DstGEP = RBuilder.CreateInBoundsGEP(
+          OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+      RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+      BytesCopied += OperandSize;
+    }
+  }
+  assert(BytesCopied == CopyLen->getZExtValue() &&
+         "Bytes copied should match size in the call!");
+}
+
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+                                       Value *SrcAddr, Value *DstAddr,
+                                       Value *CopyLen, unsigned SrcAlign,
+                                       unsigned DestAlign, bool SrcIsVolatile,
+                                       bool DstIsVolatile,
+                                       const TargetTransformInfo &TTI) {
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  BasicBlock *PostLoopBB =
+      PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+  Function *ParentFunc = PreLoopBB->getParent();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  Type *LoopOpType =
+      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+
+  IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+  PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+  PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+  if (SrcAddr->getType() != SrcOpType) {
+    SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+  }
+  if (DstAddr->getType() != DstOpType) {
+    DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+  }
+
+  // Calculate the loop trip count, and remaining bytes to copy after the loop.
+  Type *CopyLenType = CopyLen->getType();
+  IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+  assert(ILengthType &&
+         "expected size argument to memcpy to be an integer type!");
+  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+  Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+  Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
+  BasicBlock *LoopBB =
+      BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr);
+  IRBuilder<> LoopBuilder(LoopBB);
+
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+  LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+  Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+  LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  if (LoopOpType != Int8Type) {
+    // Loop body for the residual copy.
+    BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+                                               PreLoopBB->getParent(), nullptr);
+    // Residual loop header.
+    BasicBlock *ResHeaderBB = BasicBlock::Create(
+        Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+    // Need to update the pre-loop basic block to branch to the correct place.
+    // branch to the main loop if the count is non-zero, branch to the residual
+    // loop if the copy size is smaller then 1 iteration of the main loop but
+    // non-zero and finally branch to after the residual loop if the memcpy
+    //  size is zero.
+    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+                           LoopBB, ResHeaderBB);
+    PreLoopBB->getTerminator()->eraseFromParent();
+
+    LoopBuilder.CreateCondBr(
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        ResHeaderBB);
+
+    // Determine if we need to branch to the residual loop or bypass it.
+    IRBuilder<> RHBuilder(ResHeaderBB);
+    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+                           ResLoopBB, PostLoopBB);
+
+    // Copy the residual with single byte load/store loop.
+    IRBuilder<> ResBuilder(ResLoopBB);
+    PHINode *ResidualIndex =
+        ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+    ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+    Value *SrcAsInt8 =
+        ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+    Value *DstAsInt8 =
+        ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+    Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+    Value *SrcGEP =
+        ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+    Value *Load = ResBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+    Value *DstGEP =
+        ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+    ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+    Value *ResNewIndex =
+        ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+    // Create the loop branch condition.
+    ResBuilder.CreateCondBr(
+        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+        PostLoopBB);
+  } else {
+    // In this case the loop operand type was a byte, and there is no need for a
+    // residual loop to copy the remaining memory after the main loop.
+    // We do however need to patch up the control flow by creating the
+    // terminators for the preloop block and the memcpy loop.
+    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+                           LoopBB, PostLoopBB);
+    PreLoopBB->getTerminator()->eraseFromParent();
+    LoopBuilder.CreateCondBr(
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        PostLoopBB);
+  }
+}
+
 void llvm::createMemCpyLoop(Instruction *InsertBefore,
                             Value *SrcAddr, Value *DstAddr, Value *CopyLen,
                             unsigned SrcAlign, unsigned DestAlign,
@@ -208,15 +452,41 @@ static void createMemSetLoop(Instruction *InsertBefore,
                            NewBB);
 }
 
-void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy) {
-  createMemCpyLoop(/* InsertBefore */ Memcpy,
-                   /* SrcAddr */ Memcpy->getRawSource(),
-                   /* DstAddr */ Memcpy->getRawDest(),
-                   /* CopyLen */ Memcpy->getLength(),
-                   /* SrcAlign */ Memcpy->getAlignment(),
-                   /* DestAlign */ Memcpy->getAlignment(),
-                   /* SrcIsVolatile */ Memcpy->isVolatile(),
-                   /* DstIsVolatile */ Memcpy->isVolatile());
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+                              const TargetTransformInfo &TTI) {
+  // Original implementation
+  if (!TTI.useWideIRMemcpyLoopLowering()) {
+    createMemCpyLoop(/* InsertBefore */ Memcpy,
+                     /* SrcAddr */ Memcpy->getRawSource(),
+                     /* DstAddr */ Memcpy->getRawDest(),
+                     /* CopyLen */ Memcpy->getLength(),
+                     /* SrcAlign */ Memcpy->getAlignment(),
+                     /* DestAlign */ Memcpy->getAlignment(),
+                     /* SrcIsVolatile */ Memcpy->isVolatile(),
+                     /* DstIsVolatile */ Memcpy->isVolatile());
+  } else {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+      createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+                                /* SrcAddr */ Memcpy->getRawSource(),
+                                /* DstAddr */ Memcpy->getRawDest(),
+                                /* CopyLen */ CI,
+                                /* SrcAlign */ Memcpy->getAlignment(),
+                                /* DestAlign */ Memcpy->getAlignment(),
+                                /* SrcIsVolatile */ Memcpy->isVolatile(),
+                                /* DstIsVolatile */ Memcpy->isVolatile(),
+                                /* TargetTransformInfo */ TTI);
+    } else {
+      createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
+                                  /* SrcAddr */ Memcpy->getRawSource(),
+                                  /* DstAddr */ Memcpy->getRawDest(),
+                                  /* CopyLen */ Memcpy->getLength(),
+                                  /* SrcAlign */ Memcpy->getAlignment(),
+                                  /* DestAlign */ Memcpy->getAlignment(),
+                                  /* SrcIsVolatile */ Memcpy->isVolatile(),
+                                  /* DstIsVolatile */ Memcpy->isVolatile(),
+                                  /* TargetTransfomrInfo */ TTI);
+    }
+  }
 }
 
 void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index e724b0a28c322..dee658f983932 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5754,8 +5754,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   if (BasicBlock *Dom = BB->getSinglePredecessor()) {
     auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
     if (PBI && PBI->isConditional() &&
-        PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
-        (PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB)) {
+        PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+      assert(PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB);
       bool CondIsFalse = PBI->getSuccessor(1) == BB;
       Optional<bool> Implication = isImpliedCondition(
           PBI->getCondition(), BI->getCondition(), DL, CondIsFalse);
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index ec8b0d426265a..6d90e6b48358a 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -80,6 +81,7 @@ namespace {
                               bool IsSigned);
     bool eliminateSDiv(BinaryOperator *SDiv);
     bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
+    bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
   };
 }
 
@@ -154,6 +156,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
 void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
   unsigned IVOperIdx = 0;
   ICmpInst::Predicate Pred = ICmp->getPredicate();
+  ICmpInst::Predicate OriginalPred = Pred;
   if (IVOperand != ICmp->getOperand(0)) {
     // Swapped
     assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
@@ -262,6 +265,16 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
     ICmp->setPredicate(InvariantPredicate);
     ICmp->setOperand(0, NewLHS);
     ICmp->setOperand(1, NewRHS);
+  } else if (ICmpInst::isSigned(OriginalPred) &&
+             SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
+    // If we were unable to make anything above, all we can is to canonicalize
+    // the comparison hoping that it will open the doors for other
+    // optimizations. If we find out that we compare two non-negative values,
+    // we turn the instruction's predicate to its unsigned version. Note that
+    // we cannot rely on Pred here unless we check if we have swapped it.
+    assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
+    DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n');
+    ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
   } else
     return;
 
@@ -583,6 +596,35 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
   return Changed;
 }
 
+/// Annotate the Shr in (X << IVOperand) >> C as exact using the
+/// information from the IV's range. Returns true if anything changed, false
+/// otherwise.
+bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
+                                          Value *IVOperand) {
+  using namespace llvm::PatternMatch;
+
+  if (BO->getOpcode() == Instruction::Shl) {
+    bool Changed = false;
+    ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
+    for (auto *U : BO->users()) {
+      const APInt *C;
+      if (match(U,
+                m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
+          match(U,
+                m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
+        BinaryOperator *Shr = cast<BinaryOperator>(U);
+        if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
+          Shr->setIsExact(true);
+          Changed = true;
+        }
+      }
+    }
+    return Changed;
+  }
+
+  return false;
+}
+
 /// Add all uses of Def to the current IV's worklist.
 static void pushIVUsers(
   Instruction *Def,
@@ -675,8 +717,9 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
     }
 
     if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
-      if (isa<OverflowingBinaryOperator>(BO) &&
-          strengthenOverflowingOperation(BO, IVOperand)) {
+      if ((isa<OverflowingBinaryOperator>(BO) &&
+           strengthenOverflowingOperation(BO, IVOperand)) ||
+          (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
         // re-queue uses of the now modified binary operator and fall
         // through to the checks that remain.
         pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index b723b65f35e59..77c0a41929ac7 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -656,7 +656,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
   ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
 
   // memchr(x, y, 0) -> null
-  if (LenC && LenC->isNullValue())
+  if (LenC && LenC->isZero())
     return Constant::getNullValue(CI->getType());
 
   // From now on we need at least constant length and string.
@@ -2280,7 +2280,7 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
     return true;
   if (ConstantInt *ObjSizeCI =
           dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
-    if (ObjSizeCI->isAllOnesValue())
+    if (ObjSizeCI->isMinusOne())
       return true;
     // If the object size wasn't -1 (unknown), bail out if we were asked to.
     if (OnlyLowerUnknownSize)
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 60d9ede2c4871..c3feea6a0a414 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -51,25 +51,24 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
     // Pointer to Pointer -> use bitcast.
-    if (StoredValTy->getScalarType()->isPointerTy() &&
-        LoadedTy->getScalarType()->isPointerTy()) {
+    if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
       StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
     } else {
       // Convert source pointers to integers, which can be bitcast.
-      if (StoredValTy->getScalarType()->isPointerTy()) {
+      if (StoredValTy->isPtrOrPtrVectorTy()) {
         StoredValTy = DL.getIntPtrType(StoredValTy);
         StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
       }
 
       Type *TypeToCastTo = LoadedTy;
-      if (TypeToCastTo->getScalarType()->isPointerTy())
+      if (TypeToCastTo->isPtrOrPtrVectorTy())
         TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
 
       if (StoredValTy != TypeToCastTo)
         StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
 
       // Cast to pointer if the load needs a pointer type.
-      if (LoadedTy->getScalarType()->isPointerTy())
+      if (LoadedTy->isPtrOrPtrVectorTy())
         StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
     }
 
@@ -86,7 +85,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
          "canCoerceMustAliasedValueToLoad fail");
 
   // Convert source pointers to integers, which can be manipulated.
-  if (StoredValTy->getScalarType()->isPointerTy()) {
+  if (StoredValTy->isPtrOrPtrVectorTy()) {
     StoredValTy = DL.getIntPtrType(StoredValTy);
     StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
   }
@@ -112,7 +111,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
 
   if (LoadedTy != NewIntTy) {
     // If the result is a pointer, inttoptr.
-    if (LoadedTy->getScalarType()->isPointerTy())
+    if (LoadedTy->isPtrOrPtrVectorTy())
       StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
     else
       // Otherwise, bitcast.
@@ -316,7 +315,7 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
   // Compute which bits of the stored value are being used by the load.  Convert
   // to an integer type to start with.
-  if (SrcVal->getType()->getScalarType()->isPointerTy())
+  if (SrcVal->getType()->isPtrOrPtrVectorTy())
     SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
   if (!SrcVal->getType()->isIntegerTy())
     SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));