| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2015-06-09 19:06:30 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2015-06-09 19:06:30 +0000 |
| commit | 85d8b2bbe386bcfe669575d05b61482d7be07e5d (patch) | |
| tree | 1dc5e75ab222a9ead44c699eceafab7a6ca7b310 /lib/Transforms | |
| parent | 5a5ac124e1efaf208671f01c46edb15f29ed2a0b (diff) | |
Diffstat (limited to 'lib/Transforms')
30 files changed, 1454 insertions, 737 deletions
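Several of the hunks below (ArgumentPromotion.cpp, FunctionAttrs.cpp, ObjCARCContract.cpp, DeadStoreElimination.cpp) make the same mechanical change: memory locations are no longer obtained from the AliasAnalysis object via `AA.getLocation(...)` but from static helpers on `MemoryLocation`. A minimal sketch of the new call pattern (the wrapper function and header choices here are illustrative, not part of the commit):

```cpp
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative helper: does this load only read constant memory?
static bool loadReadsOnlyConstantMemory(AliasAnalysis &AA, LoadInst *LI) {
  // Old API: AliasAnalysis::Location Loc = AA.getLocation(LI);
  // New API: the location is built by a static helper on MemoryLocation.
  AliasAnalysis::Location Loc = MemoryLocation::get(LI);
  return AA.pointsToConstantMemory(Loc, /*OrLocal=*/true);
}
```

The same pattern covers memory intrinsics: the DeadStoreElimination.cpp hunk switches to `MemoryLocation::getForDest(MI)` for the written range of a memset/memcpy/memmove and `MemoryLocation::getForSource(MTI)` for the read side of a transfer.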
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 7b7672d0edfeb..c7c57ab564442 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -553,7 +553,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,      LoadInst *Load = Loads[i];      BasicBlock *BB = Load->getParent(); -    AliasAnalysis::Location Loc = AA.getLocation(Load); +    AliasAnalysis::Location Loc = MemoryLocation::get(Load);      if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc,          AliasAnalysis::Mod))        return false;  // Pointer is invalidated! @@ -774,7 +774,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {            Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);            Value *Idx = GetElementPtrInst::Create( -              STy, *AI, Idxs, (*AI)->getName() + "." + utostr(i), Call); +              STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);            // TODO: Tell AA about the new values?            Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));          } diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 92e384a340a73..ef8f42ffd6d43 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -232,20 +232,20 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {        } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {          // Ignore non-volatile loads from local memory. (Atomic is okay here.)          if (!LI->isVolatile()) { -          AliasAnalysis::Location Loc = AA->getLocation(LI); +          AliasAnalysis::Location Loc = MemoryLocation::get(LI);            if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))              continue;          }        } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {          // Ignore non-volatile stores to local memory. (Atomic is okay here.)          if (!SI->isVolatile()) { -          AliasAnalysis::Location Loc = AA->getLocation(SI); +          AliasAnalysis::Location Loc = MemoryLocation::get(SI);            if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))              continue;          }        } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {          // Ignore vaargs on local memory. -        AliasAnalysis::Location Loc = AA->getLocation(VI); +        AliasAnalysis::Location Loc = MemoryLocation::get(VI);          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))            continue;        } diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 91a5eefca17dd..052f1b4b13257 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -389,11 +389,21 @@ private:  };  class FunctionNode { -  AssertingVH<Function> F; +  mutable AssertingVH<Function> F;  public:    FunctionNode(Function *F) : F(F) {}    Function *getFunc() const { return F; } + +  /// Replace the reference to the function F by the function G, assuming their +  /// implementations are equal. 
+  void replaceBy(Function *G) const { +    assert(!(*this < FunctionNode(G)) && !(FunctionNode(G) < *this) && +           "The two functions must be equal"); + +    F = G; +  } +    void release() { F = 0; }    bool operator<(const FunctionNode &RHS) const {      return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; @@ -1122,6 +1132,9 @@ private:    /// Replace G with an alias to F. Deletes G.    void writeAlias(Function *F, Function *G); +  /// Replace function F with function G in the function tree. +  void replaceFunctionInTree(FnTreeType::iterator &IterToF, Function *G); +    /// The set of all distinct functions. Use the insert() and remove() methods    /// to modify it.    FnTreeType FnTree; @@ -1414,6 +1427,21 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {    ++NumFunctionsMerged;  } +/// Replace function F for function G in the map. +void MergeFunctions::replaceFunctionInTree(FnTreeType::iterator &IterToF, +                                           Function *G) { +  Function *F = IterToF->getFunc(); + +  // A total order is already guaranteed otherwise because we process strong +  // functions before weak functions. +  assert(((F->mayBeOverridden() && G->mayBeOverridden()) || +          (!F->mayBeOverridden() && !G->mayBeOverridden())) && +         "Only change functions if both are strong or both are weak"); +  (void)F; + +  IterToF->replaceBy(G); +} +  // Insert a ComparableFunction into the FnTree, or merge it away if equal to one  // that was already inserted.  bool MergeFunctions::insert(Function *NewFunction) { @@ -1439,6 +1467,22 @@ bool MergeFunctions::insert(Function *NewFunction) {      }    } +  // Impose a total order (by name) on the replacement of functions. This is +  // important when operating on more than one module independently to prevent +  // cycles of thunks calling each other when the modules are linked together. +  // +  // When one function is weak and the other is strong there is an order imposed +  // already. We process strong functions before weak functions. +  if ((OldF.getFunc()->mayBeOverridden() && NewFunction->mayBeOverridden()) || +      (!OldF.getFunc()->mayBeOverridden() && !NewFunction->mayBeOverridden())) +    if (OldF.getFunc()->getName() > NewFunction->getName()) { +      // Swap the two functions. +      Function *F = OldF.getFunc(); +      replaceFunctionInTree(Result.first, NewFunction); +      NewFunction = F; +      assert(OldF.getFunc() != F && "Must have swapped the functions."); +    } +    // Never thunk a strong function to a weak function.    
assert(!OldF.getFunc()->mayBeOverridden() || NewFunction->mayBeOverridden()); @@ -1465,7 +1509,7 @@ void MergeFunctions::remove(Function *F) {    if (Erased) {      DEBUG(dbgs() << "Removed " << F->getName()                   << " from set and deferred it.\n"); -    Deferred.push_back(F); +    Deferred.emplace_back(F);    }  } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2dafa58d305e7..f53eeef1dae60 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2149,6 +2149,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,        if (WillNotOverflowSignedAdd(LHS, RHS, OrigI))          return SetResult(Builder->CreateNSWAdd(LHS, RHS), Builder->getFalse(),                           true); +    break;    }    case OCF_UNSIGNED_SUB: @@ -2194,6 +2195,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,        if (WillNotOverflowSignedMul(LHS, RHS, OrigI))          return SetResult(Builder->CreateNSWMul(LHS, RHS), Builder->getFalse(),                           true); +    break;    }    return false; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 5aa59c69f39bb..e7a45330d9557 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -483,12 +483,17 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {    }    // Fold away bit casts of the loaded value by loading the desired type. +  // We can do this for BitCastInsts as well as casts from and to pointer types, +  // as long as those are noops (i.e., the source or dest type have the same +  // bitwidth as the target's pointers).    if (LI.hasOneUse()) -    if (auto *BC = dyn_cast<BitCastInst>(LI.user_back())) { -      LoadInst *NewLoad = combineLoadToNewType(IC, LI, BC->getDestTy()); -      BC->replaceAllUsesWith(NewLoad); -      IC.EraseInstFromFunction(*BC); -      return &LI; +    if (auto* CI = dyn_cast<CastInst>(LI.user_back())) { +      if (CI->isNoopCast(DL)) { +        LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy()); +        CI->replaceAllUsesWith(NewLoad); +        IC.EraseInstFromFunction(*CI); +        return &LI; +      }      }    // FIXME: We should also canonicalize loads of vectors when their elements are diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index d2fbcdd39915c..f51442a9f36d2 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -276,72 +276,6 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,    return nullptr;  } -/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is -/// replaced with RepOp. -static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, -                                     const TargetLibraryInfo *TLI, -                                     const DataLayout &DL, DominatorTree *DT, -                                     AssumptionCache *AC) { -  // Trivial replacement. -  if (V == Op) -    return RepOp; - -  Instruction *I = dyn_cast<Instruction>(V); -  if (!I) -    return nullptr; - -  // If this is a binary operator, try to simplify it with the replaced op. 
-  if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) { -    if (B->getOperand(0) == Op) -      return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), DL, TLI); -    if (B->getOperand(1) == Op) -      return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, DL, TLI); -  } - -  // Same for CmpInsts. -  if (CmpInst *C = dyn_cast<CmpInst>(I)) { -    if (C->getOperand(0) == Op) -      return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), DL, -                             TLI, DT, AC); -    if (C->getOperand(1) == Op) -      return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, DL, -                             TLI, DT, AC); -  } - -  // TODO: We could hand off more cases to instsimplify here. - -  // If all operands are constant after substituting Op for RepOp then we can -  // constant fold the instruction. -  if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) { -    // Build a list of all constant operands. -    SmallVector<Constant*, 8> ConstOps; -    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { -      if (I->getOperand(i) == Op) -        ConstOps.push_back(CRepOp); -      else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i))) -        ConstOps.push_back(COp); -      else -        break; -    } - -    // All operands were constants, fold it. -    if (ConstOps.size() == I->getNumOperands()) { -      if (CmpInst *C = dyn_cast<CmpInst>(I)) -        return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], -                                               ConstOps[1], DL, TLI); - -      if (LoadInst *LI = dyn_cast<LoadInst>(I)) -        if (!LI->isVolatile()) -          return ConstantFoldLoadFromConstPtr(ConstOps[0], DL); - -      return ConstantFoldInstOperands(I->getOpcode(), I->getType(), ConstOps, -                                      DL, TLI); -    } -  } - -  return nullptr; -} -  /// foldSelectICmpAndOr - We want to turn:  ///   (select (icmp eq (and X, C1), 0), Y, (or Y, C2))  /// into: @@ -477,14 +411,6 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,    // here, so make sure the select is the only user.    if (ICI->hasOneUse())      if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) { -      // X < MIN ? T : F  -->  F -      if ((Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT) -          && CI->isMinValue(Pred == ICmpInst::ICMP_SLT)) -        return ReplaceInstUsesWith(SI, FalseVal); -      // X > MAX ? T : F  -->  F -      else if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT) -               && CI->isMaxValue(Pred == ICmpInst::ICMP_SGT)) -        return ReplaceInstUsesWith(SI, FalseVal);        switch (Pred) {        default: break;        case ICmpInst::ICMP_ULT: @@ -598,33 +524,6 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,      }    } -  // If we have an equality comparison then we know the value in one of the -  // arms of the select. See if substituting this value into the arm and -  // simplifying the result yields the same value as the other arm. 
-  if (Pred == ICmpInst::ICMP_EQ) { -    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == -            TrueVal || -        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == -            TrueVal) -      return ReplaceInstUsesWith(SI, FalseVal); -    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == -            FalseVal || -        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == -            FalseVal) -      return ReplaceInstUsesWith(SI, FalseVal); -  } else if (Pred == ICmpInst::ICMP_NE) { -    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == -            FalseVal || -        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == -            FalseVal) -      return ReplaceInstUsesWith(SI, TrueVal); -    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == -            TrueVal || -        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == -            TrueVal) -      return ReplaceInstUsesWith(SI, TrueVal); -  } -    // NOTE: if we wanted to, this is where to detect integer MIN/MAX    if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) { @@ -639,7 +538,8 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,      }    } -  if (unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits()) { +  { +    unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType());      APInt MinSignedValue = APInt::getSignBit(BitWidth);      Value *X;      const APInt *Y, *C; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index be49cd1c436bc..9d602c6a9e22a 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1843,7 +1843,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,        case Instruction::BitCast:        case Instruction::GetElementPtr: -        Users.push_back(I); +        Users.emplace_back(I);          Worklist.push_back(I);          continue; @@ -1852,7 +1852,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,          // We can fold eq/ne comparisons with null to false/true, respectively.          
if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1)))            return false; -        Users.push_back(I); +        Users.emplace_back(I);          continue;        } @@ -1878,13 +1878,13 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,            case Intrinsic::lifetime_start:            case Intrinsic::lifetime_end:            case Intrinsic::objectsize: -            Users.push_back(I); +            Users.emplace_back(I);              continue;            }          }          if (isFreeCall(I, TLI)) { -          Users.push_back(I); +          Users.emplace_back(I);            continue;          }          return false; @@ -1893,7 +1893,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,          StoreInst *SI = cast<StoreInst>(I);          if (SI->isVolatile() || SI->getPointerOperand() != PI)            return false; -        Users.push_back(I); +        Users.emplace_back(I);          continue;        }        } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 939e04bb2a51f..25f78b0b2a267 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -106,14 +106,15 @@ static const char *const kAsanUnpoisonStackMemoryName =  static const char *const kAsanOptionDetectUAR =      "__asan_option_detect_stack_use_after_return"; +static const char *const kAsanAllocaPoison = +    "__asan_alloca_poison"; +static const char *const kAsanAllocasUnpoison = +    "__asan_allocas_unpoison"; +  // Accesses sizes are powers of two: 1, 2, 4, 8, 16.  static const size_t kNumberOfAccessSizes = 5;  static const unsigned kAllocaRzSize = 32; -static const unsigned kAsanAllocaLeftMagic = 0xcacacacaU; -static const unsigned kAsanAllocaRightMagic = 0xcbcbcbcbU; -static const unsigned kAsanAllocaPartialVal1 = 0xcbcbcb00U; -static const unsigned kAsanAllocaPartialVal2 = 0x000000cbU;  // Command-line flags. @@ -230,8 +231,6 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),  STATISTIC(NumInstrumentedReads, "Number of instrumented reads");  STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); -STATISTIC(NumInstrumentedDynamicAllocas, -          "Number of instrumented dynamic allocas");  STATISTIC(NumOptimizedAccessesToGlobalVar,            "Number of optimized accesses to global vars");  STATISTIC(NumOptimizedAccessesToStackVar, @@ -402,6 +401,12 @@ struct AddressSanitizer : public FunctionPass {    }    /// Check if we want (and can) handle this alloca.    bool isInterestingAlloca(AllocaInst &AI); + +  // Check if we have dynamic alloca. +  bool isDynamicAlloca(AllocaInst &AI) const { +    return AI.isArrayAllocation() || !AI.isStaticAlloca(); +  } +    /// If it is an interesting memory access, return the PointerOperand    /// and set IsWrite/Alignment. Otherwise return nullptr.    Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, @@ -517,6 +522,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],        *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];    Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; +  Function *AsanAllocaPoisonFunc, *AsanAllocasUnpoisonFunc;    // Stores a place and arguments of poisoning/unpoisoning call for alloca.    
struct AllocaPoisonCall { @@ -527,23 +533,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    };    SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec; -  // Stores left and right redzone shadow addresses for dynamic alloca -  // and pointer to alloca instruction itself. -  // LeftRzAddr is a shadow address for alloca left redzone. -  // RightRzAddr is a shadow address for alloca right redzone. -  struct DynamicAllocaCall { -    AllocaInst *AI; -    Value *LeftRzAddr; -    Value *RightRzAddr; -    bool Poison; -    explicit DynamicAllocaCall(AllocaInst *AI, Value *LeftRzAddr = nullptr, -                               Value *RightRzAddr = nullptr) -        : AI(AI), -          LeftRzAddr(LeftRzAddr), -          RightRzAddr(RightRzAddr), -          Poison(true) {} -  }; -  SmallVector<DynamicAllocaCall, 1> DynamicAllocaVec; +  SmallVector<AllocaInst *, 1> DynamicAllocaVec; +  SmallVector<IntrinsicInst *, 1> StackRestoreVec; +  AllocaInst *DynamicAllocaLayout = nullptr;    // Maps Value to an AllocaInst from which the Value is originated.    typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; @@ -586,41 +578,29 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    // Then unpoison everything back before the function returns.    void poisonStack(); +  void createDynamicAllocasInitStorage(); +    // ----------------------- Visitors.    /// \brief Collect all Ret instructions.    void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); } -  // Unpoison dynamic allocas redzones. -  void unpoisonDynamicAlloca(DynamicAllocaCall &AllocaCall) { -    if (!AllocaCall.Poison) return; -    for (auto Ret : RetVec) { -      IRBuilder<> IRBRet(Ret); -      PointerType *Int32PtrTy = PointerType::getUnqual(IRBRet.getInt32Ty()); -      Value *Zero = Constant::getNullValue(IRBRet.getInt32Ty()); -      Value *PartialRzAddr = IRBRet.CreateSub(AllocaCall.RightRzAddr, -                                              ConstantInt::get(IntptrTy, 4)); -      IRBRet.CreateStore( -          Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy)); -      IRBRet.CreateStore(Zero, -                         IRBRet.CreateIntToPtr(PartialRzAddr, Int32PtrTy)); -      IRBRet.CreateStore( -          Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy)); -    } +  void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, +                                        Value *SavedStack) { +    IRBuilder<> IRB(InstBefore); +    IRB.CreateCall(AsanAllocasUnpoisonFunc, +                    {IRB.CreateLoad(DynamicAllocaLayout), +                    IRB.CreatePtrToInt(SavedStack, IntptrTy)});    } -  // Right shift for BigEndian and left shift for LittleEndian. -  Value *shiftAllocaMagic(Value *Val, IRBuilder<> &IRB, Value *Shift) { -    auto &DL = F.getParent()->getDataLayout(); -    return DL.isLittleEndian() ? IRB.CreateShl(Val, Shift) -                               : IRB.CreateLShr(Val, Shift); -  } +  // Unpoison dynamic allocas redzones. +  void unpoisonDynamicAllocas() { +    for (auto &Ret : RetVec) +      unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout); -  // Compute PartialRzMagic for dynamic alloca call. Since we don't know the -  // size of requested memory until runtime, we should compute it dynamically. -  // If PartialSize is 0, PartialRzMagic would contain kAsanAllocaRightMagic, -  // otherwise it would contain the value that we will use to poison the -  // partial redzone for alloca call. 
-  Value *computePartialRzMagic(Value *PartialSize, IRBuilder<> &IRB); +    for (auto &StackRestoreInst : StackRestoreVec) +      unpoisonDynamicAllocasBeforeInst(StackRestoreInst, +                                       StackRestoreInst->getOperand(0)); +  }    // Deploy and poison redzones around dynamic alloca call. To do this, we    // should replace this call with another one with changed parameters and @@ -632,20 +612,15 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    //   addr = tmp + 32 (first 32 bytes are for the left redzone).    // Additional_size is added to make new memory allocation contain not only    // requested memory, but also left, partial and right redzones. -  // After that, we should poison redzones: -  // (1) Left redzone with kAsanAllocaLeftMagic. -  // (2) Partial redzone with the value, computed in runtime by -  //     computePartialRzMagic function. -  // (3) Right redzone with kAsanAllocaRightMagic. -  void handleDynamicAllocaCall(DynamicAllocaCall &AllocaCall); +  void handleDynamicAllocaCall(AllocaInst *AI);    /// \brief Collect Alloca instructions we want (and can) handle.    void visitAllocaInst(AllocaInst &AI) {      if (!ASan.isInterestingAlloca(AI)) return;      StackAlignment = std::max(StackAlignment, AI.getAlignment()); -    if (isDynamicAlloca(AI)) -      DynamicAllocaVec.push_back(DynamicAllocaCall(&AI)); +    if (ASan.isDynamicAlloca(AI)) +      DynamicAllocaVec.push_back(&AI);      else        AllocaVec.push_back(&AI);    } @@ -653,8 +628,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    /// \brief Collect lifetime intrinsic calls to check for use-after-scope    /// errors.    void visitIntrinsicInst(IntrinsicInst &II) { -    if (!ClCheckLifetime) return;      Intrinsic::ID ID = II.getIntrinsicID(); +    if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); +    if (!ClCheckLifetime) return;      if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)        return;      // Found lifetime intrinsic, add ASan instrumentation if necessary. @@ -690,9 +666,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {      return true;    } -  bool isDynamicAlloca(AllocaInst &AI) const { -    return AI.isArrayAllocation() || !AI.isStaticAlloca(); -  }    /// Finds alloca where the value comes from.    AllocaInst *findAllocaForValue(Value *V);    void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB, @@ -811,12 +784,14 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) {    if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())      return PreviouslySeenAllocaInfo->getSecond(); -  bool IsInteresting = (AI.getAllocatedType()->isSized() && -    // alloca() may be called with 0 size, ignore it. -    getAllocaSizeInBytes(&AI) > 0 && -    // We are only interested in allocas not promotable to registers. -    // Promotable allocas are common under -O0. -    (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI))); +  bool IsInteresting = +      (AI.getAllocatedType()->isSized() && +       // alloca() may be called with 0 size, ignore it. +       getAllocaSizeInBytes(&AI) > 0 && +       // We are only interested in allocas not promotable to registers. +       // Promotable allocas are common under -O0. 
+       (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || +        isDynamicAlloca(AI)));    ProcessedAllocas[&AI] = IsInteresting;    return IsInteresting; @@ -1158,6 +1133,18 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {    if (G->hasSection()) {      StringRef Section(G->getSection()); +    // Globals from llvm.metadata aren't emitted, do not instrument them. +    if (Section == "llvm.metadata") return false; + +    // Callbacks put into the CRT initializer/terminator sections +    // should not be instrumented. +    // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 +    // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx +    if (Section.startswith(".CRT")) { +      DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n"); +      return false; +    } +      if (TargetTriple.isOSBinFormatMachO()) {        StringRef ParsedSegment, ParsedSection;        unsigned TAA = 0, StubSize = 0; @@ -1165,8 +1152,8 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {        std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(            Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);        if (!ErrorCode.empty()) { -        report_fatal_error("Invalid section specifier '" + ParsedSection + -                           "': " + ErrorCode + "."); +        assert(false && "Invalid section specifier."); +        return false;        }        // Ignore the globals from the __OBJC section. The ObjC runtime assumes @@ -1196,18 +1183,6 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {          return false;        }      } - -    // Callbacks put into the CRT initializer/terminator sections -    // should not be instrumented. -    // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 -    // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx -    if (Section.startswith(".CRT")) { -      DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n"); -      return false; -    } - -    // Globals from llvm.metadata aren't emitted, do not instrument them. 
-    if (Section == "llvm.metadata") return false;    }    return true; @@ -1617,6 +1592,11 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {    AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(        M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),                              IntptrTy, IntptrTy, nullptr)); +  AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction( +      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); +  AsanAllocasUnpoisonFunc = +      checkSanitizerInterfaceFunction(M.getOrInsertFunction( +          kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));  }  void FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes, @@ -1712,15 +1692,24 @@ Value *FunctionStackPoisoner::createAllocaForLayout(    return IRB.CreatePointerCast(Alloca, IntptrTy);  } +void FunctionStackPoisoner::createDynamicAllocasInitStorage() { +  BasicBlock &FirstBB = *F.begin(); +  IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin())); +  DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); +  IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); +  DynamicAllocaLayout->setAlignment(32); +} +  void FunctionStackPoisoner::poisonStack() {    assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); -  if (ClInstrumentAllocas) { +  if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) {      // Handle dynamic allocas. -    for (auto &AllocaCall : DynamicAllocaVec) { -      handleDynamicAllocaCall(AllocaCall); -      unpoisonDynamicAlloca(AllocaCall); -    } +    createDynamicAllocasInitStorage(); +    for (auto &AI : DynamicAllocaVec) +      handleDynamicAllocaCall(AI); + +    unpoisonDynamicAllocas();    }    if (AllocaVec.size() == 0) return; @@ -1955,78 +1944,25 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {    return Res;  } -// Compute PartialRzMagic for dynamic alloca call. PartialRzMagic is -// constructed from two separate 32-bit numbers: PartialRzMagic = Val1 | Val2. -// (1) Val1 is resposible for forming base value for PartialRzMagic, containing -//     only 00 for fully addressable and 0xcb for fully poisoned bytes for each -//     8-byte chunk of user memory respectively. -// (2) Val2 forms the value for marking first poisoned byte in shadow memory -//     with appropriate value (0x01 - 0x07 or 0xcb if Padding % 8 == 0). - -// Shift = Padding & ~7; // the number of bits we need to shift to access first -//                          chunk in shadow memory, containing nonzero bytes. -// Example: -// Padding = 21                       Padding = 16 -// Shadow:  |00|00|05|cb|          Shadow:  |00|00|cb|cb| -//                ^                               ^ -//                |                               | -// Shift = 21 & ~7 = 16            Shift = 16 & ~7 = 16 -// -// Val1 = 0xcbcbcbcb << Shift; -// PartialBits = Padding ? 
Padding & 7 : 0xcb; -// Val2 = PartialBits << Shift; -// Result = Val1 | Val2; -Value *FunctionStackPoisoner::computePartialRzMagic(Value *PartialSize, -                                                    IRBuilder<> &IRB) { -  PartialSize = IRB.CreateIntCast(PartialSize, IRB.getInt32Ty(), false); -  Value *Shift = IRB.CreateAnd(PartialSize, IRB.getInt32(~7)); -  unsigned Val1Int = kAsanAllocaPartialVal1; -  unsigned Val2Int = kAsanAllocaPartialVal2; -  if (!F.getParent()->getDataLayout().isLittleEndian()) { -    Val1Int = sys::getSwappedBytes(Val1Int); -    Val2Int = sys::getSwappedBytes(Val2Int); -  } -  Value *Val1 = shiftAllocaMagic(IRB.getInt32(Val1Int), IRB, Shift); -  Value *PartialBits = IRB.CreateAnd(PartialSize, IRB.getInt32(7)); -  // For BigEndian get 0x000000YZ -> 0xYZ000000. -  if (F.getParent()->getDataLayout().isBigEndian()) -    PartialBits = IRB.CreateShl(PartialBits, IRB.getInt32(24)); -  Value *Val2 = IRB.getInt32(Val2Int); -  Value *Cond = -      IRB.CreateICmpNE(PartialBits, Constant::getNullValue(IRB.getInt32Ty())); -  Val2 = IRB.CreateSelect(Cond, shiftAllocaMagic(PartialBits, IRB, Shift), -                          shiftAllocaMagic(Val2, IRB, Shift)); -  return IRB.CreateOr(Val1, Val2); -} - -void FunctionStackPoisoner::handleDynamicAllocaCall( -    DynamicAllocaCall &AllocaCall) { -  AllocaInst *AI = AllocaCall.AI; -  if (!doesDominateAllExits(AI)) { -    // We do not yet handle complex allocas -    AllocaCall.Poison = false; -    return; -  } - +void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {    IRBuilder<> IRB(AI); -  PointerType *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());    const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());    const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;    Value *Zero = Constant::getNullValue(IntptrTy);    Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);    Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask); -  Value *NotAllocaRzMask = ConstantInt::get(IntptrTy, ~AllocaRedzoneMask);    // Since we need to extend alloca with additional memory to locate    // redzones, and OldSize is number of allocated blocks with    // ElementSize size, get allocated memory size in bytes by    // OldSize * ElementSize. -  unsigned ElementSize = +  const unsigned ElementSize =        F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType()); -  Value *OldSize = IRB.CreateMul(AI->getArraySize(), -                                 ConstantInt::get(IntptrTy, ElementSize)); +  Value *OldSize = +      IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false), +                    ConstantInt::get(IntptrTy, ElementSize));    // PartialSize = OldSize % 32    Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask); @@ -2054,43 +1990,20 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(    Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),                                      ConstantInt::get(IntptrTy, Align)); -  Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType()); - -  // LeftRzAddress = NewAddress - kAllocaRzSize -  Value *LeftRzAddress = IRB.CreateSub(NewAddress, AllocaRzSize); - -  // Poisoning left redzone. -  AllocaCall.LeftRzAddr = ASan.memToShadow(LeftRzAddress, IRB); -  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaLeftMagic), -                  IRB.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy)); +  // Insert __asan_alloca_poison call for new created alloca. 
+  IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize}); -  // PartialRzAligned = PartialRzAddr & ~AllocaRzMask -  Value *PartialRzAddr = IRB.CreateAdd(NewAddress, OldSize); -  Value *PartialRzAligned = IRB.CreateAnd(PartialRzAddr, NotAllocaRzMask); +  // Store the last alloca's address to DynamicAllocaLayout. We'll need this +  // for unpoisoning stuff. +  IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout); -  // Poisoning partial redzone. -  Value *PartialRzMagic = computePartialRzMagic(PartialSize, IRB); -  Value *PartialRzShadowAddr = ASan.memToShadow(PartialRzAligned, IRB); -  IRB.CreateStore(PartialRzMagic, -                  IRB.CreateIntToPtr(PartialRzShadowAddr, Int32PtrTy)); - -  // RightRzAddress -  //   =  (PartialRzAddr + AllocaRzMask) & ~AllocaRzMask -  Value *RightRzAddress = IRB.CreateAnd( -      IRB.CreateAdd(PartialRzAddr, AllocaRzMask), NotAllocaRzMask); - -  // Poisoning right redzone. -  AllocaCall.RightRzAddr = ASan.memToShadow(RightRzAddress, IRB); -  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaRightMagic), -                  IRB.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy)); +  Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType()); -  // Replace all uses of AddessReturnedByAlloca with NewAddress. +  // Replace all uses of AddessReturnedByAlloca with NewAddressPtr.    AI->replaceAllUsesWith(NewAddressPtr); -  // We are done. Erase old alloca and store left, partial and right redzones -  // shadow addresses for future unpoisoning. +  // We are done. Erase old alloca from parent.    AI->eraseFromParent(); -  NumInstrumentedDynamicAllocas++;  }  // isSafeAccess returns true if Addr is always inbounds with respect to its diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 610ff52ba9db0..05a9c8a5dfe54 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -146,8 +146,8 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {    IRBuilder<> Builder(Inc->getParent(), *Inc);    uint64_t Index = Inc->getIndex()->getZExtValue(); -  llvm::Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); -  llvm::Value *Count = Builder.CreateLoad(Addr, "pgocount"); +  Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); +  Value *Count = Builder.CreateLoad(Addr, "pgocount");    Count = Builder.CreateAdd(Count, Builder.getInt64(1));    Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr));    Inc->eraseFromParent(); @@ -196,14 +196,17 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {    if (It != RegionCounters.end())      return It->second; -  // Move the name variable to the right section. +  // Move the name variable to the right section. Make sure it is placed in the +  // same comdat as its associated function. Otherwise, we may get multiple +  // counters for the same function in certain cases. +  Function *Fn = Inc->getParent()->getParent();    Name->setSection(getNameSection());    Name->setAlignment(1); +  Name->setComdat(Fn->getComdat());    uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();    LLVMContext &Ctx = M->getContext();    ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); -  Function *Fn = Inc->getParent()->getParent();    // Create the counters variable.    
auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), @@ -212,9 +215,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {    Counters->setVisibility(Name->getVisibility());    Counters->setSection(getCountersSection());    Counters->setAlignment(8); -  // Place the counters in the same comdat section as its parent function. -  // Otherwise, we may get multiple counters for the same function in certain -  // cases.    Counters->setComdat(Fn->getComdat());    RegionCounters[Inc->getName()] = Counters; @@ -263,7 +263,7 @@ void InstrProfiling::emitRegistration() {    if (Options.NoRedZone)      RegisterF->addFnAttr(Attribute::NoRedZone); -  auto *RuntimeRegisterTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false); +  auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);    auto *RuntimeRegisterF =        Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,                         "__llvm_profile_register_function", M); @@ -310,7 +310,7 @@ void InstrProfiling::emitUses() {      return;    GlobalVariable *LLVMUsed = M->getGlobalVariable("llvm.used"); -  std::vector<Constant*> MergedVars; +  std::vector<Constant *> MergedVars;    if (LLVMUsed) {      // Collect the existing members of llvm.used.      ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); @@ -323,13 +323,13 @@ void InstrProfiling::emitUses() {    // Add uses for our data.    for (auto *Value : UsedVars)      MergedVars.push_back( -        ConstantExpr::getBitCast(cast<llvm::Constant>(Value), i8PTy)); +        ConstantExpr::getBitCast(cast<Constant>(Value), i8PTy));    // Recreate llvm.used.    ArrayType *ATy = ArrayType::get(i8PTy, MergedVars.size()); -  LLVMUsed = new llvm::GlobalVariable( -      *M, ATy, false, llvm::GlobalValue::AppendingLinkage, -      llvm::ConstantArray::get(ATy, MergedVars), "llvm.used"); +  LLVMUsed = +      new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, +                         ConstantArray::get(ATy, MergedVars), "llvm.used");    LLVMUsed->setSection("llvm.metadata");  } diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 2a3139f8705de..e7731ad5cd17b 100644 --- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -200,7 +200,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,    bool SawRelease = false;    // Get the location associated with Load. -  AliasAnalysis::Location Loc = AA->getLocation(Load); +  AliasAnalysis::Location Loc = MemoryLocation::get(Load);    // Walk down to find the store and the release, which may be in either order.    
for (auto I = std::next(BasicBlock::iterator(Load)), diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index d1302c6e22f49..79624b2e4c475 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -113,10 +113,11 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {        Value *Condition = SI->getCondition();        if (!Condition->getType()->isVectorTy()) { -        if (Constant *C = LVI->getConstantOnEdge(Condition, P->getIncomingBlock(i), BB, P)) { -          if (C == ConstantInt::getTrue(Condition->getType())) { +        if (Constant *C = LVI->getConstantOnEdge( +                Condition, P->getIncomingBlock(i), BB, P)) { +          if (C->isOneValue()) {              V = SI->getTrueValue(); -          } else { +          } else if (C->isZeroValue()) {              V = SI->getFalseValue();            }            // Once LVI learns to handle vector types, we could also add support diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 01952cf6e8b38..eb48a766a2cfe 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -197,11 +197,11 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {  static AliasAnalysis::Location  getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) -    return AA.getLocation(SI); +    return MemoryLocation::get(SI);    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {      // memcpy/memmove/memset. -    AliasAnalysis::Location Loc = AA.getLocationForDest(MI); +    AliasAnalysis::Location Loc = MemoryLocation::getForDest(MI);      return Loc;    } @@ -231,7 +231,7 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) {    // The only instructions that both read and write are the mem transfer    // instructions (memcpy/memmove).    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) -    return AA.getLocationForSource(MTI); +    return MemoryLocation::getForSource(MTI);    return AliasAnalysis::Location();  } @@ -815,11 +815,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) {      if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {        if (!L->isUnordered()) // Be conservative with atomic/volatile load          break; -      LoadedLoc = AA->getLocation(L); +      LoadedLoc = MemoryLocation::get(L);      } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { -      LoadedLoc = AA->getLocation(V); +      LoadedLoc = MemoryLocation::get(V);      } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { -      LoadedLoc = AA->getLocationForSource(MTI); +      LoadedLoc = MemoryLocation::getForSource(MTI);      } else if (!BBI->mayReadFromMemory()) {        // Instruction doesn't read memory.  Note that stores that weren't removed        // above will hit this case. 
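The MergeFunctions.cpp hunks earlier in this diff add a deterministic choice of which of two equivalent functions survives: when both are strong or both are weak, the pass keeps the one whose name sorts lower, so that running the pass on modules independently and then linking them cannot produce thunks that call each other in a cycle. A condensed sketch of that tie-breaking rule, using the same Function APIs the patch uses (the free-standing helper is hypothetical; the real check is inlined in MergeFunctions::insert):

```cpp
#include "llvm/IR/Function.h"
using namespace llvm;

// Hypothetical distillation of the swap condition added to MergeFunctions::insert.
static bool shouldSwapWinner(const Function *OldF, const Function *NewF) {
  // Strong-vs-weak pairs already have a fixed order (strong functions are
  // processed before weak ones), so only break ties by name when both
  // functions have the same linkage strength.
  bool BothWeak = OldF->mayBeOverridden() && NewF->mayBeOverridden();
  bool BothStrong = !OldF->mayBeOverridden() && !NewF->mayBeOverridden();
  return (BothWeak || BothStrong) && OldF->getName() > NewF->getName();
}
```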
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 600589c904c49..359a616c069d3 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -68,6 +68,22 @@ static cl::opt<bool> VerifyIndvars(  static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,    cl::desc("Reduce live induction variables.")); +enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl }; + +static cl::opt<ReplaceExitVal> ReplaceExitValue( +    "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), +    cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), +    cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"), +               clEnumValN(OnlyCheapRepl, "cheap", +                          "only replace exit value when the cost is cheap"), +               clEnumValN(AlwaysRepl, "always", +                          "always replace exit value whenever possible"), +               clEnumValEnd)); + +namespace { +struct RewritePhi; +} +  namespace {    class IndVarSimplify : public LoopPass {      LoopInfo                  *LI; @@ -112,6 +128,7 @@ namespace {      void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); +    bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);      void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);      Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, @@ -464,6 +481,21 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {      SE->forgetLoop(L);  } +namespace { +// Collect information about PHI nodes which can be transformed in +// RewriteLoopExitValues. +struct RewritePhi { +  PHINode *PN; +  unsigned Ith;  // Ith incoming value. +  Value *Val;    // Exit value after expansion. +  bool HighCost; // High Cost when expansion. +  bool SafePhi;  // LCSSASafePhiForRAUW. + +  RewritePhi(PHINode *P, unsigned I, Value *V, bool H, bool S) +      : PN(P), Ith(I), Val(V), HighCost(H), SafePhi(S) {} +}; +} +  //===----------------------------------------------------------------------===//  // RewriteLoopExitValues - Optimize IV users outside the loop.  // As a side effect, reduces the amount of IV processing within the loop. @@ -486,6 +518,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {    SmallVector<BasicBlock*, 8> ExitBlocks;    L->getUniqueExitBlocks(ExitBlocks); +  SmallVector<RewritePhi, 8> RewritePhiSet;    // Find all values that are computed inside the loop, but used outside of it.    // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan    // the exit blocks of the loop to find them. @@ -604,23 +637,44 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {            DeadInsts.push_back(ExitVal);            continue;          } -        Changed = true; -        ++NumReplaced; +        bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L); -        PN->setIncomingValue(i, ExitVal); +        // Collect all the candidate PHINodes to be rewritten. +        RewritePhiSet.push_back( +            RewritePhi(PN, i, ExitVal, HighCost, LCSSASafePhiForRAUW)); +      } +    } +  } -        // If this instruction is dead now, delete it. Don't do it now to avoid -        // invalidating iterators. 
-        if (isInstructionTriviallyDead(Inst, TLI)) -          DeadInsts.push_back(Inst); +  bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); -        // If we determined that this PHI is safe to replace even if an LCSSA -        // PHI, do so. -        if (LCSSASafePhiForRAUW) { -          PN->replaceAllUsesWith(ExitVal); -          PN->eraseFromParent(); -        } -      } +  // Transformation. +  for (const RewritePhi &Phi : RewritePhiSet) { +    PHINode *PN = Phi.PN; +    Value *ExitVal = Phi.Val; + +    // Only do the rewrite when the ExitValue can be expanded cheaply. +    // If LoopCanBeDel is true, rewrite exit value aggressively. +    if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) { +      DeadInsts.push_back(ExitVal); +      continue; +    } + +    Changed = true; +    ++NumReplaced; +    Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith)); +    PN->setIncomingValue(Phi.Ith, ExitVal); + +    // If this instruction is dead now, delete it. Don't do it now to avoid +    // invalidating iterators. +    if (isInstructionTriviallyDead(Inst, TLI)) +      DeadInsts.push_back(Inst); + +    // If we determined that this PHI is safe to replace even if an LCSSA +    // PHI, do so. +    if (Phi.SafePhi) { +      PN->replaceAllUsesWith(ExitVal); +      PN->eraseFromParent();      }    } @@ -629,6 +683,65 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {    Rewriter.clearInsertPoint();  } +/// CanLoopBeDeleted - Check whether it is possible to delete the loop after +/// rewriting exit value. If it is possible, ignore ReplaceExitValue and +/// do rewriting aggressively. +bool IndVarSimplify::CanLoopBeDeleted( +    Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { + +  BasicBlock *Preheader = L->getLoopPreheader(); +  // If there is no preheader, the loop will not be deleted. +  if (!Preheader) +    return false; + +  // In LoopDeletion pass Loop can be deleted when ExitingBlocks.size() > 1. +  // We obviate multiple ExitingBlocks case for simplicity. +  // TODO: If we see testcase with multiple ExitingBlocks can be deleted +  // after exit value rewriting, we can enhance the logic here. +  SmallVector<BasicBlock *, 4> ExitingBlocks; +  L->getExitingBlocks(ExitingBlocks); +  SmallVector<BasicBlock *, 8> ExitBlocks; +  L->getUniqueExitBlocks(ExitBlocks); +  if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1) +    return false; + +  BasicBlock *ExitBlock = ExitBlocks[0]; +  BasicBlock::iterator BI = ExitBlock->begin(); +  while (PHINode *P = dyn_cast<PHINode>(BI)) { +    Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]); + +    // If the Incoming value of P is found in RewritePhiSet, we know it +    // could be rewritten to use a loop invariant value in transformation +    // phase later. Skip it in the loop invariant check below. 
+    bool found = false; +    for (const RewritePhi &Phi : RewritePhiSet) { +      unsigned i = Phi.Ith; +      if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) { +        found = true; +        break; +      } +    } + +    Instruction *I; +    if (!found && (I = dyn_cast<Instruction>(Incoming))) +      if (!L->hasLoopInvariantOperands(I)) +        return false; + +    ++BI; +  } + +  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); +       LI != LE; ++LI) { +    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; +         ++BI) { +      if (BI->mayHaveSideEffects()) +        return false; +    } +  } + +  return true; +} +  //===----------------------------------------------------------------------===//  //  IV Widening - Extend the width of an IV to cover its widest uses.  //===----------------------------------------------------------------------===// @@ -989,7 +1102,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {          IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt());          Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());          UsePhi->replaceAllUsesWith(Trunc); -        DeadInsts.push_back(UsePhi); +        DeadInsts.emplace_back(UsePhi);          DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi                << " to " << *WidePhi << "\n");        } @@ -1022,7 +1135,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {              << " replaced by " << *DU.WideDef << "\n");        ++NumElimExt;        DU.NarrowUse->replaceAllUsesWith(NewDef); -      DeadInsts.push_back(DU.NarrowUse); +      DeadInsts.emplace_back(DU.NarrowUse);      }      // Now that the extend is gone, we want to expose it's uses for potential      // further simplification. We don't need to directly inform SimplifyIVUsers @@ -1075,7 +1188,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {    if (WideAddRec != SE->getSCEV(WideUse)) {      DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse            << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); -    DeadInsts.push_back(WideUse); +    DeadInsts.emplace_back(WideUse);      return nullptr;    } @@ -1172,7 +1285,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {      // WidenIVUse may have removed the def-use edge.      if (DU.NarrowDef->use_empty()) -      DeadInsts.push_back(DU.NarrowDef); +      DeadInsts.emplace_back(DU.NarrowDef);    }    return WidePhi;  } @@ -1867,7 +1980,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {    // loop into any instructions outside of the loop that use the final values of    // the current expressions.    // -  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) +  if (ReplaceExitValue != NeverRepl && +      !isa<SCEVCouldNotCompute>(BackedgeTakenCount))      RewriteLoopExitValues(L, Rewriter);    // Eliminate redundant IV cycles. 
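The IndVarSimplify.cpp hunks above restructure exit-value rewriting into two phases: candidate PHIs are first collected into RewritePhi records, then a single loop-wide check (CanLoopBeDeleted) decides whether even high-cost expansions should be applied, and only then is the IR mutated. A condensed sketch of that collect-then-commit shape; the struct and function names below are stand-ins for the pass's internals, not its real interface:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Stand-in for the pass's RewritePhi record.
struct RewriteCand {
  PHINode *PN;
  unsigned Ith;    // index of the incoming value to replace
  Value *ExitVal;  // loop-invariant replacement computed during analysis
  bool HighCost;   // SCEV expansion was judged expensive
};

// Apply the collected rewrites, mirroring the OnlyCheapRepl policy: expensive
// expansions are skipped unless rewriting lets the whole loop be deleted.
static bool applyExitValueRewrites(SmallVectorImpl<RewriteCand> &Cands,
                                   bool LoopCanBeDeleted, bool OnlyCheap) {
  bool Changed = false;
  for (RewriteCand &C : Cands) {
    if (OnlyCheap && C.HighCost && !LoopCanBeDeleted)
      continue;
    C.PN->setIncomingValue(C.Ith, C.ExitVal);
    Changed = true;
  }
  return Changed;
}
```

The new -replexitval option introduced in the same hunk (values never/cheap/always, defaulting to cheap) selects how aggressive this rewriting is.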
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 584c7aee7f1dd..4b59f3d2f6cc1 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -811,7 +811,7 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {        if (Instruction *U = dyn_cast<Instruction>(O)) {          O = nullptr;          if (U->use_empty()) -          DeadInsts.push_back(U); +          DeadInsts.emplace_back(U);        }      I->eraseFromParent(); @@ -2917,7 +2917,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,        IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");      }      Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper); -    DeadInsts.push_back(Inc.IVOperand); +    DeadInsts.emplace_back(Inc.IVOperand);    }    // If LSR created a new, wider phi, we may also replace its postinc. We only    // do this if we also found a wide value for the head of the chain. @@ -2939,7 +2939,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,          IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");        }        Phi->replaceUsesOfWith(PostIncV, IVOper); -      DeadInsts.push_back(PostIncV); +      DeadInsts.emplace_back(PostIncV);      }    }  } @@ -4594,7 +4594,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    // form, update the ICmp's other operand.    if (LU.Kind == LSRUse::ICmpZero) {      ICmpInst *CI = cast<ICmpInst>(LF.UserInst); -    DeadInsts.push_back(CI->getOperand(1)); +    DeadInsts.emplace_back(CI->getOperand(1));      assert(!F.BaseGV && "ICmp does not support folding a global value and "                             "a scale at the same time!");      if (F.Scale == -1) { @@ -4737,7 +4737,7 @@ void LSRInstance::Rewrite(const LSRFixup &LF,        LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);    } -  DeadInsts.push_back(LF.OperandValToReplace); +  DeadInsts.emplace_back(LF.OperandValToReplace);  }  /// ImplementSolution - Rewrite all the fixup locations with new values, diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index ccafd100ef0f4..4ccbfc953e0cf 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -38,25 +38,25 @@ using namespace llvm;  #define DEBUG_TYPE "loop-unroll"  static cl::opt<unsigned> -UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, -  cl::desc("The cut-off point for automatic loop unrolling")); +    UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, +                    cl::desc("The baseline cost threshold for loop unrolling")); + +static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( +    "unroll-percent-dynamic-cost-saved-threshold", cl::init(20), cl::Hidden, +    cl::desc("The percentage of estimated dynamic cost which must be saved by " +             "unrolling to allow unrolling up to the max threshold.")); + +static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( +    "unroll-dynamic-cost-savings-discount", cl::init(2000), cl::Hidden, +    cl::desc("This is the amount discounted from the total unroll cost when " +             "the unrolled form has a high dynamic cost savings (triggered by " +             "the '-unroll-perecent-dynamic-cost-saved-threshold' flag)."));  static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(      "unroll-max-iteration-count-to-analyze", cl::init(0), 
cl::Hidden,      cl::desc("Don't allow loop unrolling to simulate more than this number of"               "iterations when checking full unroll profitability")); -static cl::opt<unsigned> UnrollMinPercentOfOptimized( -    "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden, -    cl::desc("If complete unrolling could trigger further optimizations, and, " -             "by that, remove the given percent of instructions, perform the " -             "complete unroll even if it's beyond the threshold")); - -static cl::opt<unsigned> UnrollAbsoluteThreshold( -    "unroll-absolute-threshold", cl::init(2000), cl::Hidden, -    cl::desc("Don't unroll if the unrolled size is bigger than this threshold," -             " even if we can remove big portion of instructions later.")); -  static cl::opt<unsigned>  UnrollCount("unroll-count", cl::init(0), cl::Hidden,    cl::desc("Use this unroll count for all loops including those with " @@ -82,16 +82,18 @@ namespace {      static char ID; // Pass ID, replacement for typeid      LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {        CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); -      CurrentAbsoluteThreshold = UnrollAbsoluteThreshold; -      CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized; +      CurrentPercentDynamicCostSavedThreshold = +          UnrollPercentDynamicCostSavedThreshold; +      CurrentDynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;        CurrentCount = (C == -1) ? UnrollCount : unsigned(C);        CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;        CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;        UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); -      UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0); -      UserPercentOfOptimized = -          (UnrollMinPercentOfOptimized.getNumOccurrences() > 0); +      UserPercentDynamicCostSavedThreshold = +          (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0); +      UserDynamicCostSavingsDiscount = +          (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0);        UserAllowPartial = (P != -1) ||                           (UnrollAllowPartial.getNumOccurrences() > 0);        UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); @@ -115,18 +117,18 @@ namespace {      unsigned CurrentCount;      unsigned CurrentThreshold; -    unsigned CurrentAbsoluteThreshold; -    unsigned CurrentMinPercentOfOptimized; -    bool     CurrentAllowPartial; -    bool     CurrentRuntime; -    bool     UserCount;            // CurrentCount is user-specified. -    bool     UserThreshold;        // CurrentThreshold is user-specified. -    bool UserAbsoluteThreshold;    // CurrentAbsoluteThreshold is -                                   // user-specified. -    bool UserPercentOfOptimized;   // CurrentMinPercentOfOptimized is -                                   // user-specified. -    bool     UserAllowPartial;     // CurrentAllowPartial is user-specified. -    bool     UserRuntime;          // CurrentRuntime is user-specified. +    unsigned CurrentPercentDynamicCostSavedThreshold; +    unsigned CurrentDynamicCostSavingsDiscount; +    bool CurrentAllowPartial; +    bool CurrentRuntime; + +    // Flags for whether the 'current' settings are user-specified. 
+    bool UserCount; +    bool UserThreshold; +    bool UserPercentDynamicCostSavedThreshold; +    bool UserDynamicCostSavingsDiscount; +    bool UserAllowPartial; +    bool UserRuntime;      bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -156,8 +158,9 @@ namespace {      void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,                                   TargetTransformInfo::UnrollingPreferences &UP) {        UP.Threshold = CurrentThreshold; -      UP.AbsoluteThreshold = CurrentAbsoluteThreshold; -      UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized; +      UP.PercentDynamicCostSavedThreshold = +          CurrentPercentDynamicCostSavedThreshold; +      UP.DynamicCostSavingsDiscount = CurrentDynamicCostSavingsDiscount;        UP.OptSizeThreshold = OptSizeUnrollThreshold;        UP.PartialThreshold = CurrentThreshold;        UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; @@ -186,8 +189,8 @@ namespace {      void selectThresholds(const Loop *L, bool HasPragma,                            const TargetTransformInfo::UnrollingPreferences &UP,                            unsigned &Threshold, unsigned &PartialThreshold, -                          unsigned &AbsoluteThreshold, -                          unsigned &PercentOfOptimizedForCompleteUnroll) { +                          unsigned &PercentDynamicCostSavedThreshold, +                          unsigned &DynamicCostSavingsDiscount) {        // Determine the current unrolling threshold.  While this is        // normally set from UnrollThreshold, it is overridden to a        // smaller value if the current function is marked as @@ -195,11 +198,13 @@ namespace {        // specified.        Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;        PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; -      AbsoluteThreshold = UserAbsoluteThreshold ? CurrentAbsoluteThreshold -                                                : UP.AbsoluteThreshold; -      PercentOfOptimizedForCompleteUnroll = UserPercentOfOptimized -                                                ? CurrentMinPercentOfOptimized -                                                : UP.MinPercentOfOptimized; +      PercentDynamicCostSavedThreshold = +          UserPercentDynamicCostSavedThreshold +              ? CurrentPercentDynamicCostSavedThreshold +              : UP.PercentDynamicCostSavedThreshold; +      DynamicCostSavingsDiscount = UserDynamicCostSavingsDiscount +                                       ? CurrentDynamicCostSavingsDiscount +                                       : UP.DynamicCostSavingsDiscount;        if (!UserThreshold &&            L->getHeader()->getParent()->hasFnAttribute( @@ -220,9 +225,9 @@ namespace {        }      }      bool canUnrollCompletely(Loop *L, unsigned Threshold, -                             unsigned AbsoluteThreshold, uint64_t UnrolledSize, -                             unsigned NumberOfOptimizedInstructions, -                             unsigned PercentOfOptimizedForCompleteUnroll); +                             unsigned PercentDynamicCostSavedThreshold, +                             unsigned DynamicCostSavingsDiscount, +                             uint64_t UnrolledCost, uint64_t RolledDynamicCost);    };  } @@ -246,187 +251,6 @@ Pass *llvm::createSimpleLoopUnrollPass() {  }  namespace { -/// \brief SCEV expressions visitor used for finding expressions that would -/// become constants if the loop L is unrolled. 
-struct FindConstantPointers { -  /// \brief Shows whether the expression is ConstAddress+Constant or not. -  bool IndexIsConstant; - -  /// \brief Used for filtering out SCEV expressions with two or more AddRec -  /// subexpressions. -  /// -  /// Used to filter out complicated SCEV expressions, having several AddRec -  /// sub-expressions. We don't handle them, because unrolling one loop -  /// would help to replace only one of these inductions with a constant, and -  /// consequently, the expression would remain non-constant. -  bool HaveSeenAR; - -  /// \brief If the SCEV expression becomes ConstAddress+Constant, this value -  /// holds ConstAddress. Otherwise, it's nullptr. -  Value *BaseAddress; - -  /// \brief The loop, which we try to completely unroll. -  const Loop *L; - -  ScalarEvolution &SE; - -  FindConstantPointers(const Loop *L, ScalarEvolution &SE) -      : IndexIsConstant(true), HaveSeenAR(false), BaseAddress(nullptr), -        L(L), SE(SE) {} - -  /// Examine the given expression S and figure out, if it can be a part of an -  /// expression, that could become a constant after the loop is unrolled. -  /// The routine sets IndexIsConstant and HaveSeenAR according to the analysis -  /// results. -  /// \returns true if we need to examine subexpressions, and false otherwise. -  bool follow(const SCEV *S) { -    if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) { -      // We've reached the leaf node of SCEV, it's most probably just a -      // variable. -      // If it's the only one SCEV-subexpression, then it might be a base -      // address of an index expression. -      // If we've already recorded base address, then just give up on this SCEV -      // - it's too complicated. -      if (BaseAddress) { -        IndexIsConstant = false; -        return false; -      } -      BaseAddress = SC->getValue(); -      return false; -    } -    if (isa<SCEVConstant>(S)) -      return false; -    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { -      // If the current SCEV expression is AddRec, and its loop isn't the loop -      // we are about to unroll, then we won't get a constant address after -      // unrolling, and thus, won't be able to eliminate the load. -      if (AR->getLoop() != L) { -        IndexIsConstant = false; -        return false; -      } -      // We don't handle multiple AddRecs here, so give up in this case. -      if (HaveSeenAR) { -        IndexIsConstant = false; -        return false; -      } -      HaveSeenAR = true; -    } - -    // Continue traversal. -    return true; -  } -  bool isDone() const { return !IndexIsConstant; } -}; -} // End anonymous namespace. - -namespace { -/// \brief A cache of SCEV results used to optimize repeated queries to SCEV on -/// the same set of instructions. -/// -/// The primary cost this saves is the cost of checking the validity of a SCEV -/// every time it is looked up. However, in some cases we can provide a reduced -/// and especially useful model for an instruction based upon SCEV that is -/// non-trivial to compute but more useful to clients. -class SCEVCache { -public: -  /// \brief Struct to represent a GEP whose start and step are known fixed -  /// offsets from a base address due to SCEV's analysis. 
-  struct GEPDescriptor { -    Value *BaseAddr = nullptr; -    unsigned Start = 0; -    unsigned Step = 0; -  }; - -  Optional<GEPDescriptor> getGEPDescriptor(GetElementPtrInst *GEP); - -  SCEVCache(const Loop &L, ScalarEvolution &SE) : L(L), SE(SE) {} - -private: -  const Loop &L; -  ScalarEvolution &SE; - -  SmallDenseMap<GetElementPtrInst *, GEPDescriptor> GEPDescriptors; -}; -} // End anonymous namespace. - -/// \brief Get a simplified descriptor for a GEP instruction. -/// -/// Where possible, this produces a simplified descriptor for a GEP instruction -/// using SCEV analysis of the containing loop. If this isn't possible, it -/// returns an empty optional. -/// -/// The model is a base address, an initial offset, and a per-iteration step. -/// This fits very common patterns of GEPs inside loops and is something we can -/// use to simulate the behavior of a particular iteration of a loop. -/// -/// This is a cached interface. The first call may do non-trivial work to -/// compute the result, but all subsequent calls will return a fast answer -/// based on a cached result. This includes caching negative results. -Optional<SCEVCache::GEPDescriptor> -SCEVCache::getGEPDescriptor(GetElementPtrInst *GEP) { -  decltype(GEPDescriptors)::iterator It; -  bool Inserted; - -  std::tie(It, Inserted) = GEPDescriptors.insert({GEP, {}}); - -  if (!Inserted) { -    if (!It->second.BaseAddr) -      return None; - -    return It->second; -  } - -  // We've inserted a new record into the cache, so compute the GEP descriptor -  // if possible. -  Value *V = cast<Value>(GEP); -  if (!SE.isSCEVable(V->getType())) -    return None; -  const SCEV *S = SE.getSCEV(V); - -  // FIXME: It'd be nice if the worklist and set used by the -  // SCEVTraversal could be re-used between loop iterations, but the -  // interface doesn't support that. There is no way to clear the visited -  // sets between uses. -  FindConstantPointers Visitor(&L, SE); -  SCEVTraversal<FindConstantPointers> T(Visitor); - -  // Try to find (BaseAddress+Step+Offset) tuple. -  // If succeeded, save it to the cache - it might help in folding -  // loads. -  T.visitAll(S); -  if (!Visitor.IndexIsConstant || !Visitor.BaseAddress) -    return None; - -  const SCEV *BaseAddrSE = SE.getSCEV(Visitor.BaseAddress); -  if (BaseAddrSE->getType() != S->getType()) -    return None; -  const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE); -  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE); - -  if (!AR) -    return None; - -  const SCEVConstant *StepSE = -      dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)); -  const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()); -  if (!StepSE || !StartSE) -    return None; - -  // Check and skip caching if doing so would require lots of bits to -  // avoid overflow. -  APInt Start = StartSE->getValue()->getValue(); -  APInt Step = StepSE->getValue()->getValue(); -  if (Start.getActiveBits() > 32 || Step.getActiveBits() > 32) -    return None; - -  // We found a cacheable SCEV model for the GEP. -  It->second.BaseAddr = Visitor.BaseAddress; -  It->second.Start = Start.getLimitedValue(); -  It->second.Step = Step.getLimitedValue(); -  return It->second; -} - -namespace {  // This class is used to get an estimate of the optimization effects that we  // could get from complete loop unrolling. 
It comes from the fact that some  // loads might be replaced with concrete constant values and that could trigger @@ -446,17 +270,31 @@ namespace {  class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {    typedef InstVisitor<UnrolledInstAnalyzer, bool> Base;    friend class InstVisitor<UnrolledInstAnalyzer, bool>; +  struct SimplifiedAddress { +    Value *Base = nullptr; +    ConstantInt *Offset = nullptr; +  };  public:    UnrolledInstAnalyzer(unsigned Iteration,                         DenseMap<Value *, Constant *> &SimplifiedValues, -                       SCEVCache &SC) -      : Iteration(Iteration), SimplifiedValues(SimplifiedValues), SC(SC) {} +                       const Loop *L, ScalarEvolution &SE) +      : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { +      IterationNumber = SE.getConstant(APInt(64, Iteration)); +  }    // Allow access to the initial visit method.    using Base::visit;  private: +  /// \brief A cache of pointer bases and constant-folded offsets corresponding +  /// to GEP (or derived from GEP) instructions. +  /// +  /// In order to find the base pointer one needs to perform non-trivial +  /// traversal of the corresponding SCEV expression, so it's good to have the +  /// results saved. +  DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; +    /// \brief Number of currently simulated iteration.    ///    /// If an expression is ConstAddress+Constant, then the Constant is @@ -464,18 +302,71 @@ private:    /// SCEVGEPCache.    unsigned Iteration; -  // While we walk the loop instructions, we we build up and maintain a mapping -  // of simplified values specific to this iteration.  The idea is to propagate -  // any special information we have about loads that can be replaced with -  // constants after complete unrolling, and account for likely simplifications -  // post-unrolling. +  /// \brief SCEV expression corresponding to number of currently simulated +  /// iteration. +  const SCEV *IterationNumber; + +  /// \brief A Value->Constant map for keeping values that we managed to +  /// constant-fold on the given iteration. +  /// +  /// While we walk the loop instructions, we build up and maintain a mapping +  /// of simplified values specific to this iteration.  The idea is to propagate +  /// any special information we have about loads that can be replaced with +  /// constants after complete unrolling, and account for likely simplifications +  /// post-unrolling.    DenseMap<Value *, Constant *> &SimplifiedValues; -  // We use a cache to wrap all our SCEV queries. -  SCEVCache &SC; +  const Loop *L; +  ScalarEvolution &SE; + +  /// \brief Try to simplify instruction \param I using its SCEV expression. +  /// +  /// The idea is that some AddRec expressions become constants, which then +  /// could trigger folding of other instructions. However, that only happens +  /// for expressions whose start value is also constant, which isn't always the +  /// case. In another common and important case the start value is just some +  /// address (i.e. SCEVUnknown) - in this case we compute the offset and save +  /// it along with the base address instead. 
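To make the comment above concrete: for an affine add recurrence {Start,+,Step} over the loop being simulated, the value at a fixed iteration I is Start + Step*I. When Start is itself a constant the whole value folds to a constant; when Start is an unknown base address, only the byte offset from that base is known, which is what SimplifiedAddresses records. A standalone sketch of that arithmetic in plain C++, with invented values.

#include <cstdint>
#include <cstdio>

// Model of an add recurrence whose start is Base + StartOffset, where Base is
// an opaque pointer and StartOffset/Step are known constants (in bytes).
struct AddRec {
  int64_t StartOffset; // constant part of the start value
  int64_t Step;        // per-iteration stride
};

// Offset from the base address at a concrete iteration.
static int64_t offsetAtIteration(const AddRec &AR, uint64_t Iteration) {
  return AR.StartOffset + AR.Step * (int64_t)Iteration;
}

int main() {
  AddRec AR = {/*StartOffset=*/0, /*Step=*/4}; // e.g. &A[i] with 4-byte elements
  for (uint64_t Iter = 0; Iter < 4; ++Iter)
    std::printf("iteration %llu: A + %lld bytes\n",
                (unsigned long long)Iter, (long long)offsetAtIteration(AR, Iter));
  return 0;
}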
+  bool simplifyInstWithSCEV(Instruction *I) { +    if (!SE.isSCEVable(I->getType())) +      return false; + +    const SCEV *S = SE.getSCEV(I); +    if (auto *SC = dyn_cast<SCEVConstant>(S)) { +      SimplifiedValues[I] = SC->getValue(); +      return true; +    } + +    auto *AR = dyn_cast<SCEVAddRecExpr>(S); +    if (!AR) +      return false; + +    const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE); +    // Check if the AddRec expression becomes a constant. +    if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) { +      SimplifiedValues[I] = SC->getValue(); +      return true; +    } + +    // Check if the offset from the base address becomes a constant. +    auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S)); +    if (!Base) +      return false; +    auto *Offset = +        dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base)); +    if (!Offset) +      return false; +    SimplifiedAddress Address; +    Address.Base = Base->getValue(); +    Address.Offset = Offset->getValue(); +    SimplifiedAddresses[I] = Address; +    return true; +  }    /// Base case for the instruction visitor. -  bool visitInstruction(Instruction &I) { return false; }; +  bool visitInstruction(Instruction &I) { +    return simplifyInstWithSCEV(&I); +  }    /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. @@ -492,6 +383,7 @@ private:      if (!isa<Constant>(RHS))        if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))          RHS = SimpleRHS; +      Value *SimpleV = nullptr;      const DataLayout &DL = I.getModule()->getDataLayout();      if (auto FI = dyn_cast<FPMathOperator>(&I)) @@ -503,24 +395,21 @@ private:      if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))        SimplifiedValues[&I] = C; -    return SimpleV; +    if (SimpleV) +      return true; +    return Base::visitBinaryOperator(I);    }    /// Try to fold load I.    bool visitLoad(LoadInst &I) {      Value *AddrOp = I.getPointerOperand(); -    if (!isa<Constant>(AddrOp)) -      if (Constant *SimplifiedAddrOp = SimplifiedValues.lookup(AddrOp)) -        AddrOp = SimplifiedAddrOp; -    auto *GEP = dyn_cast<GetElementPtrInst>(AddrOp); -    if (!GEP) -      return false; -    auto OptionalGEPDesc = SC.getGEPDescriptor(GEP); -    if (!OptionalGEPDesc) +    auto AddressIt = SimplifiedAddresses.find(AddrOp); +    if (AddressIt == SimplifiedAddresses.end())        return false; +    ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; -    auto GV = dyn_cast<GlobalVariable>(OptionalGEPDesc->BaseAddr); +    auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);      // We're only interested in loads that can be completely folded to a      // constant.      if (!GV || !GV->hasInitializer()) @@ -531,13 +420,10 @@ private:      if (!CDS)        return false; -    // This calculation should never overflow because we bound Iteration quite -    // low and both the start and step are 32-bit integers. We use signed -    // integers so that UBSan will catch if a bug sneaks into the code.      
int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; -    int64_t Index = ((int64_t)OptionalGEPDesc->Start + -                     (int64_t)OptionalGEPDesc->Step * (int64_t)Iteration) / -                    ElemSize; +    assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && +           "Unexpectedly large index value."); +    int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize;      if (Index >= CDS->getNumElements()) {        // FIXME: For now we conservatively ignore out of bound accesses, but        // we're allowed to perform the optimization in this case. @@ -556,11 +442,12 @@ private:  namespace {  struct EstimatedUnrollCost { -  /// \brief Count the number of optimized instructions. -  unsigned NumberOfOptimizedInstructions; +  /// \brief The estimated cost after unrolling. +  unsigned UnrolledCost; -  /// \brief Count the total number of instructions. -  unsigned UnrolledLoopSize; +  /// \brief The estimated dynamic cost of executing the instructions in the +  /// rolled form. +  unsigned RolledDynamicCost;  };  } @@ -593,12 +480,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,    SmallSetVector<BasicBlock *, 16> BBWorklist;    DenseMap<Value *, Constant *> SimplifiedValues; -  // Use a cache to access SCEV expressions so that we don't pay the cost on -  // each iteration. This cache is lazily self-populating. -  SCEVCache SC(*L, SE); - -  unsigned NumberOfOptimizedInstructions = 0; -  unsigned UnrolledLoopSize = 0; +  // The estimated cost of the unrolled form of the loop. We try to estimate +  // this by simplifying as much as we can while computing the estimate. +  unsigned UnrolledCost = 0; +  // We also track the estimated dynamic (that is, actually executed) cost in +  // the rolled form. This helps identify cases when the savings from unrolling +  // aren't just exposing dead control flows, but actual reduced dynamic +  // instructions due to the simplifications which we expect to occur after +  // unrolling. +  unsigned RolledDynamicCost = 0;    // Simulate execution of each iteration of the loop counting instructions,    // which would be simplified. @@ -606,7 +496,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,    // we literally have to go through all loop's iterations.    for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {      SimplifiedValues.clear(); -    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SC); +    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE);      BBWorklist.clear();      BBWorklist.insert(L->getHeader()); @@ -618,17 +508,20 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,        // it.  We don't change the actual IR, just count optimization        // opportunities.        for (Instruction &I : *BB) { -        UnrolledLoopSize += TTI.getUserCost(&I); +        unsigned InstCost = TTI.getUserCost(&I);          // Visit the instruction to analyze its loop cost after unrolling, -        // and if the visitor returns true, then we can optimize this -        // instruction away. -        if (Analyzer.visit(I)) -          NumberOfOptimizedInstructions += TTI.getUserCost(&I); +        // and if the visitor returns false, include this instruction in the +        // unrolled cost. +        if (!Analyzer.visit(I)) +          UnrolledCost += InstCost; + +        // Also track this instructions expected cost when executing the rolled +        // loop form. 
+        RolledDynamicCost += InstCost;          // If unrolled body turns out to be too big, bail out. -        if (UnrolledLoopSize - NumberOfOptimizedInstructions > -            MaxUnrolledLoopSize) +        if (UnrolledCost > MaxUnrolledLoopSize)            return None;        } @@ -640,10 +533,10 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,      // If we found no optimization opportunities on the first iteration, we      // won't find them on later ones too. -    if (!NumberOfOptimizedInstructions) +    if (UnrolledCost == RolledDynamicCost)        return None;    } -  return {{NumberOfOptimizedInstructions, UnrolledLoopSize}}; +  return {{UnrolledCost, RolledDynamicCost}};  }  /// ApproximateLoopSize - Approximate the size of the loop. @@ -749,46 +642,56 @@ static void SetLoopAlreadyUnrolled(Loop *L) {    L->setLoopID(NewLoopID);  } -bool LoopUnroll::canUnrollCompletely( -    Loop *L, unsigned Threshold, unsigned AbsoluteThreshold, -    uint64_t UnrolledSize, unsigned NumberOfOptimizedInstructions, -    unsigned PercentOfOptimizedForCompleteUnroll) { +bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, +                                     unsigned PercentDynamicCostSavedThreshold, +                                     unsigned DynamicCostSavingsDiscount, +                                     uint64_t UnrolledCost, +                                     uint64_t RolledDynamicCost) {    if (Threshold == NoThreshold) {      DEBUG(dbgs() << "  Can fully unroll, because no threshold is set.\n");      return true;    } -  if (UnrolledSize <= Threshold) { -    DEBUG(dbgs() << "  Can fully unroll, because unrolled size: " -                 << UnrolledSize << "<" << Threshold << "\n"); +  if (UnrolledCost <= Threshold) { +    DEBUG(dbgs() << "  Can fully unroll, because unrolled cost: " +                 << UnrolledCost << "<" << Threshold << "\n");      return true;    } -  assert(UnrolledSize && "UnrolledSize can't be 0 at this point."); -  unsigned PercentOfOptimizedInstructions = -      (uint64_t)NumberOfOptimizedInstructions * 100ull / UnrolledSize; - -  if (UnrolledSize <= AbsoluteThreshold && -      PercentOfOptimizedInstructions >= PercentOfOptimizedForCompleteUnroll) { -    DEBUG(dbgs() << "  Can fully unroll, because unrolling will help removing " -                 << PercentOfOptimizedInstructions -                 << "% instructions (threshold: " -                 << PercentOfOptimizedForCompleteUnroll << "%)\n"); -    DEBUG(dbgs() << "  Unrolled size (" << UnrolledSize -                 << ") is less than the threshold (" << AbsoluteThreshold -                 << ").\n"); +  assert(UnrolledCost && "UnrolledCost can't be 0 at this point."); +  assert(RolledDynamicCost >= UnrolledCost && +         "Cannot have a higher unrolled cost than a rolled cost!"); + +  // Compute the percentage of the dynamic cost in the rolled form that is +  // saved when unrolled. If unrolling dramatically reduces the estimated +  // dynamic cost of the loop, we use a higher threshold to allow more +  // unrolling. 
+  unsigned PercentDynamicCostSaved = +      (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost; + +  if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold && +      (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= +          (int64_t)Threshold) { +    DEBUG(dbgs() << "  Can fully unroll, because unrolling will reduce the " +                    "expected dynamic cost by " << PercentDynamicCostSaved +                 << "% (threshold: " << PercentDynamicCostSavedThreshold +                 << "%)\n" +                 << "  and the unrolled cost (" << UnrolledCost +                 << ") is less than the max threshold (" +                 << DynamicCostSavingsDiscount << ").\n");      return true;    }    DEBUG(dbgs() << "  Too large to fully unroll:\n"); -  DEBUG(dbgs() << "    Unrolled size: " << UnrolledSize << "\n"); -  DEBUG(dbgs() << "    Estimated number of optimized instructions: " -               << NumberOfOptimizedInstructions << "\n"); -  DEBUG(dbgs() << "    Absolute threshold: " << AbsoluteThreshold << "\n"); -  DEBUG(dbgs() << "    Minimum percent of removed instructions: " -               << PercentOfOptimizedForCompleteUnroll << "\n"); -  DEBUG(dbgs() << "    Threshold for small loops: " << Threshold << "\n"); +  DEBUG(dbgs() << "    Threshold: " << Threshold << "\n"); +  DEBUG(dbgs() << "    Max threshold: " << DynamicCostSavingsDiscount << "\n"); +  DEBUG(dbgs() << "    Percent cost saved threshold: " +               << PercentDynamicCostSavedThreshold << "%\n"); +  DEBUG(dbgs() << "    Unrolled cost: " << UnrolledCost << "\n"); +  DEBUG(dbgs() << "    Rolled dynamic cost: " << RolledDynamicCost << "\n"); +  DEBUG(dbgs() << "    Percent cost saved: " << PercentDynamicCostSaved +               << "\n");    return false;  } @@ -899,9 +802,11 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {    }    unsigned Threshold, PartialThreshold; -  unsigned AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll; +  unsigned PercentDynamicCostSavedThreshold; +  unsigned DynamicCostSavingsDiscount;    selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, -                   AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll); +                   PercentDynamicCostSavedThreshold, +                   DynamicCostSavingsDiscount);    // Given Count, TripCount and thresholds determine the type of    // unrolling which is to be performed. @@ -910,20 +815,18 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {    if (TripCount && Count == TripCount) {      Unrolling = Partial;      // If the loop is really small, we don't need to run an expensive analysis. -    if (canUnrollCompletely( -            L, Threshold, AbsoluteThreshold, -            UnrolledSize, 0, 100)) { +    if (canUnrollCompletely(L, Threshold, 100, DynamicCostSavingsDiscount, +                            UnrolledSize, UnrolledSize)) {        Unrolling = Full;      } else {        // The loop isn't that small, but we still can fully unroll it if that        // helps to remove a significant number of instructions.        // To check that, run additional analysis on the loop. 
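A worked example of the decision rule above, as a standalone plain C++ sketch (the numbers are invented; only the formula mirrors the patch): a loop whose rolled dynamic cost is 4000 but whose simulated unrolled cost is 1200 saves 70% of the dynamic cost, so it may be fully unrolled as long as the unrolled cost minus the savings discount still fits under the base threshold.

#include <cstdint>
#include <cstdio>

// Mirrors the check in canUnrollCompletely: large dynamic-cost savings buy a
// discount against the size threshold. Assumes RolledDynamicCost >=
// UnrolledCost, as the pass asserts.
static bool canFullyUnroll(uint64_t UnrolledCost, uint64_t RolledDynamicCost,
                           unsigned Threshold, unsigned PercentSavedThreshold,
                           unsigned SavingsDiscount) {
  if (UnrolledCost <= Threshold)
    return true;
  unsigned PercentSaved =
      (RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost;
  return PercentSaved >= PercentSavedThreshold &&
         (int64_t)UnrolledCost - (int64_t)SavingsDiscount <= (int64_t)Threshold;
}

int main() {
  // Defaults from the new flags: threshold 150, 20% savings, 2000 discount.
  bool OK = canFullyUnroll(/*UnrolledCost=*/1200, /*RolledDynamicCost=*/4000,
                           /*Threshold=*/150, /*PercentSavedThreshold=*/20,
                           /*SavingsDiscount=*/2000);
  std::printf("fully unroll? %s\n", OK ? "yes" : "no"); // yes: 70% saved, 1200-2000 <= 150
  return 0;
}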
-      if (Optional<EstimatedUnrollCost> Cost = -              analyzeLoopUnrollCost(L, TripCount, *SE, TTI, AbsoluteThreshold)) -        if (canUnrollCompletely(L, Threshold, AbsoluteThreshold, -                                Cost->UnrolledLoopSize, -                                Cost->NumberOfOptimizedInstructions, -                                PercentOfOptimizedForCompleteUnroll)) { +      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( +              L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) +        if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, +                                DynamicCostSavingsDiscount, Cost->UnrolledCost, +                                Cost->RolledDynamicCost)) {            Unrolling = Full;          }      } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 66d6ac6f3a099..2bdf670f67e39 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -510,7 +510,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {          // Check that nothing touches the dest of the "copy" between          // the call and the store.          AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); -        AliasAnalysis::Location StoreLoc = AA.getLocation(SI); +        AliasAnalysis::Location StoreLoc = MemoryLocation::get(SI);          for (BasicBlock::iterator I = --BasicBlock::iterator(SI),                                    E = C; I != E; --I) {            if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { @@ -802,9 +802,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {    //    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy. -  MemDepResult SourceDep = -    MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), -                                 false, M, M->getParent()); +  MemDepResult SourceDep = MD->getPointerDependencyFrom( +      MemoryLocation::getForSource(MDep), false, M, M->getParent());    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; @@ -812,7 +811,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {    // source and dest might overlap.  We still want to eliminate the intermediate    // value, but we have to generate a memmove instead of memcpy.    bool UseMemMove = false; -  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) +  if (!AA.isNoAlias(MemoryLocation::getForDest(M), +                    MemoryLocation::getForSource(MDep)))      UseMemMove = true;    // If all checks passed, then we can transform M. @@ -860,9 +860,8 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,      return false;    // Check that there are no other dependencies on the memset destination. 
-  MemDepResult DstDepInfo = -      MD->getPointerDependencyFrom(AliasAnalysis::getLocationForDest(MemSet), -                                   false, MemCpy, MemCpy->getParent()); +  MemDepResult DstDepInfo = MD->getPointerDependencyFrom( +      MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent());    if (DstDepInfo.getInst() != MemSet)      return false; @@ -998,7 +997,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {      }    } -  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); +  AliasAnalysis::Location SrcLoc = MemoryLocation::getForSource(M);    MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,                                                           M, M->getParent()); @@ -1047,7 +1046,8 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {      return false;    // See if the pointers alias. -  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) +  if (!AA.isNoAlias(MemoryLocation::getForDest(M), +                    MemoryLocation::getForSource(M)))      return false;    DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); @@ -1121,8 +1121,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy.    MemDepResult SourceDep = -    MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), -                                 false, CS.getInstruction(), MDep->getParent()); +      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, +                                   CS.getInstruction(), MDep->getParent());    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 611a941b0b213..776dfb4d487ff 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -241,7 +241,7 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {  bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,                                                         const Instruction& End,                                                        LoadInst* LI) { -  AliasAnalysis::Location Loc = AA->getLocation(LI); +  AliasAnalysis::Location Loc = MemoryLocation::get(LI);    return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod);  } @@ -266,8 +266,8 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,      LoadInst *Load1 = dyn_cast<LoadInst>(Inst);      BasicBlock *BB0 = Load0->getParent(); -    AliasAnalysis::Location Loc0 = AA->getLocation(Load0); -    AliasAnalysis::Location Loc1 = AA->getLocation(Load1); +    AliasAnalysis::Location Loc0 = MemoryLocation::get(Load0); +    AliasAnalysis::Location Loc1 = MemoryLocation::get(Load1);      if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) &&          !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) &&          !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) { @@ -425,8 +425,8 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,      StoreInst *Store1 = cast<StoreInst>(Inst); -    AliasAnalysis::Location Loc0 = AA->getLocation(Store0); -    AliasAnalysis::Location Loc1 = AA->getLocation(Store1); +    AliasAnalysis::Location Loc0 = MemoryLocation::get(Store0); +    AliasAnalysis::Location Loc1 = 
MemoryLocation::get(Store1);      if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&        !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),                                   BB1->back(), Loc1) && diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index 5b370e04088fc..4cf68b00da0a7 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -234,6 +234,7 @@ bool NaryReassociate::doOneIteration(Function &F) {      BasicBlock *BB = Node->getBlock();      for (auto I = BB->begin(); I != BB->end(); ++I) {        if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { +        const SCEV *OldSCEV = SE->getSCEV(I);          if (Instruction *NewI = tryReassociate(I)) {            Changed = true;            SE->forgetValue(I); @@ -243,7 +244,28 @@ bool NaryReassociate::doOneIteration(Function &F) {          }          // Add the rewritten instruction to SeenExprs; the original instruction          // is deleted. -        SeenExprs[SE->getSCEV(I)].push_back(I); +        const SCEV *NewSCEV = SE->getSCEV(I); +        SeenExprs[NewSCEV].push_back(I); +        // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) +        // is equivalent to I. However, ScalarEvolution::getSCEV may +        // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose +        // we reassociate +        //   I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4 +        // to +        //   NewI = &a[sext(i)] + sext(j). +        // +        // ScalarEvolution computes +        //   getSCEV(I)    = a + 4 * sext(i + j) +        //   getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j) +        // which are different SCEVs. +        // +        // To alleviate this issue of ScalarEvolution not always capturing +        // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can +        // map both SCEV before and after tryReassociate(I) to I. +        // +        // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll. 
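The reason getSCEV(I) and getSCEV(NewI) can differ comes down to sign extension not distributing over a possibly-wrapping add: sext(i +nsw j) may be rewritten as sext(i) + sext(j) only because of the nsw flag, and ScalarEvolution may drop that flag. A standalone C++ demonstration of the underlying arithmetic, using 8-bit values chosen purely for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  int8_t i = 100, j = 100;

  // Without a no-signed-wrap guarantee, the narrow add wraps first
  // (two's complement), and only then gets sign-extended.
  int32_t SextOfSum = (int32_t)(int8_t)(i + j); // (int8_t)200 == -56
  // Extending each operand first gives the mathematically exact sum.
  int32_t SumOfSext = (int32_t)i + (int32_t)j;  // 200

  std::printf("sext(i + j) = %d, sext(i) + sext(j) = %d\n", SextOfSum, SumOfSext);
  return 0;
}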
+        if (NewSCEV != OldSCEV) +          SeenExprs[OldSCEV].push_back(I);        }      }    } @@ -295,8 +317,10 @@ static bool isGEPFoldable(GetElementPtrInst *GEP,        BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);      }    } + +  unsigned AddrSpace = GEP->getPointerAddressSpace();    return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, -                                    BaseOffset, HasBaseReg, Scale); +                                    BaseOffset, HasBaseReg, Scale, AddrSpace);  }  Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) { diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index 3e7deeba9f213..9ecaf102574a3 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -496,7 +496,7 @@ template <typename T> static void unique_unsorted(std::vector<T> &vec) {    }  } -static std::string GCSafepointPollName("gc.safepoint_poll"); +static const char *const GCSafepointPollName = "gc.safepoint_poll";  static bool isGCSafepointPoll(Function &F) {    return F.getName().equals(GCSafepointPollName); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index b677523d70328..6c66b58729e9a 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -733,7 +733,7 @@ static bool LinearizeExprTree(BinaryOperator *I,    if (Ops.empty()) {      Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());      assert(Identity && "Associative operation without identity!"); -    Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); +    Ops.emplace_back(Identity, APInt(Bitwidth, 1));    }    return Changed; @@ -1966,38 +1966,35 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {    if (!I->hasOneUse() || I->getType()->isVectorTy())      return nullptr; -  // Must be a mul, fmul, or fdiv instruction. +  // Must be a fmul or fdiv instruction.    unsigned Opcode = I->getOpcode(); -  if (Opcode != Instruction::Mul && Opcode != Instruction::FMul && -      Opcode != Instruction::FDiv) +  if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)      return nullptr; -  // Must have at least one constant operand. -  Constant *C0 = dyn_cast<Constant>(I->getOperand(0)); -  Constant *C1 = dyn_cast<Constant>(I->getOperand(1)); -  if (!C0 && !C1) +  auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); +  auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); + +  // Both operands are constant, let it get constant folded away. +  if (C0 && C1)      return nullptr; -  // Must be a negative ConstantInt or ConstantFP. -  Constant *C = C0 ? C0 : C1; -  unsigned ConstIdx = C0 ? 0 : 1; -  if (auto *CI = dyn_cast<ConstantInt>(C)) { -    if (!CI->isNegative() || CI->isMinValue(true)) -      return nullptr; -  } else if (auto *CF = dyn_cast<ConstantFP>(C)) { -    if (!CF->isNegative()) -      return nullptr; -  } else +  ConstantFP *CF = C0 ? C0 : C1; + +  // Must have one constant operand. +  if (!CF) +    return nullptr; + +  // Must be a negative ConstantFP. +  if (!CF->isNegative())      return nullptr;    // User must be a binary operator with one or more uses.    
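For the canonicalization above, a concrete instance of the pattern it now targets (floating point only): an fmul by a negative constant feeding an fadd is rewritten by flipping the constant's sign and turning the fadd into an fsub. A standalone sketch checking that the two forms agree on sample values; the constants are invented for the example.

#include <cstdio>

// Original form:   x + (-3.0 * y)
static double beforeForm(double x, double y) { return x + (-3.0 * y); }
// Canonical form:  x - (3.0 * y)   (the sign is moved off the constant)
static double afterForm(double x, double y) { return x - (3.0 * y); }

int main() {
  double x = 5.0, y = 2.0;
  std::printf("before = %f, after = %f\n", beforeForm(x, y), afterForm(x, y)); // both -1.0
  return 0;
}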
Instruction *User = I->user_back(); -  if (!isa<BinaryOperator>(User) || !User->getNumUses()) +  if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))      return nullptr;    unsigned UserOpcode = User->getOpcode(); -  if (UserOpcode != Instruction::Add && UserOpcode != Instruction::FAdd && -      UserOpcode != Instruction::Sub && UserOpcode != Instruction::FSub) +  if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)      return nullptr;    // Subtraction is not commutative. Explicitly, the following transform is @@ -2006,14 +2003,9 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {      return nullptr;    // Change the sign of the constant. -  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) -    I->setOperand(ConstIdx, ConstantInt::get(CI->getContext(), -CI->getValue())); -  else { -    ConstantFP *CF = cast<ConstantFP>(C); -    APFloat Val = CF->getValueAPF(); -    Val.changeSign(); -    I->setOperand(ConstIdx, ConstantFP::get(CF->getContext(), Val)); -  } +  APFloat Val = CF->getValueAPF(); +  Val.changeSign(); +  I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));    // Canonicalize I to RHS to simplify the next bit of logic. E.g.,    // ((-Const*y) + x) -> (x + (-Const*y)). @@ -2023,15 +2015,9 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {    Value *Op0 = User->getOperand(0);    Value *Op1 = User->getOperand(1);    BinaryOperator *NI; -  switch(UserOpcode) { +  switch (UserOpcode) {    default:      llvm_unreachable("Unexpected Opcode!"); -  case Instruction::Add: -    NI = BinaryOperator::CreateSub(Op0, Op1); -    break; -  case Instruction::Sub: -    NI = BinaryOperator::CreateAdd(Op0, Op1); -    break;    case Instruction::FAdd:      NI = BinaryOperator::CreateFSub(Op0, Op1);      NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 6cf765a8438ce..6f6ba72c6e6f7 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -30,6 +30,7 @@  #include "llvm/IR/Intrinsics.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Module.h" +#include "llvm/IR/MDBuilder.h"  #include "llvm/IR/Statepoint.h"  #include "llvm/IR/Value.h"  #include "llvm/IR/Verifier.h" @@ -74,13 +75,27 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",                                                    cl::Hidden);  namespace { -struct RewriteStatepointsForGC : public FunctionPass { +struct RewriteStatepointsForGC : public ModulePass {    static char ID; // Pass identification, replacement for typeid -  RewriteStatepointsForGC() : FunctionPass(ID) { +  RewriteStatepointsForGC() : ModulePass(ID) {      initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());    } -  bool runOnFunction(Function &F) override; +  bool runOnFunction(Function &F); +  bool runOnModule(Module &M) override { +    bool Changed = false; +    for (Function &F : M) +      Changed |= runOnFunction(F); + +    if (Changed) { +      // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn +      // returns true for at least one function in the module.  Since at least +      // one function changed, we know that the precondition is satisfied. 
+      stripDereferenceabilityInfo(M); +    } + +    return Changed; +  }    void getAnalysisUsage(AnalysisUsage &AU) const override {      // We add and rewrite a bunch of instructions, but don't really do much @@ -88,12 +103,26 @@ struct RewriteStatepointsForGC : public FunctionPass {      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>();    } + +  /// The IR fed into RewriteStatepointsForGC may have had attributes implying +  /// dereferenceability that are no longer valid/correct after +  /// RewriteStatepointsForGC has run.  This is because semantically, after +  /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire +  /// heap.  stripDereferenceabilityInfo (conservatively) restores correctness +  /// by erasing all attributes in the module that externally imply +  /// dereferenceability. +  /// +  void stripDereferenceabilityInfo(Module &M); + +  // Helpers for stripDereferenceabilityInfo +  void stripDereferenceabilityInfoFromBody(Function &F); +  void stripDereferenceabilityInfoFromPrototype(Function &F);  };  } // namespace  char RewriteStatepointsForGC::ID = 0; -FunctionPass *llvm::createRewriteStatepointsForGCPass() { +ModulePass *llvm::createRewriteStatepointsForGCPass() {    return new RewriteStatepointsForGC();  } @@ -1031,14 +1060,11 @@ static void recomputeLiveInValues(  // goes through the statepoint.  We might need to split an edge to make this  // possible.  static BasicBlock * -normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, Pass *P) { -  DominatorTree *DT = nullptr; -  if (auto *DTP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) -    DT = &DTP->getDomTree(); - +normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, +                            DominatorTree &DT) {    BasicBlock *Ret = BB;    if (!BB->getUniquePredecessor()) { -    Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, DT); +    Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT);    }    // Now that 'ret' has unique predecessor we can safely remove all phi nodes @@ -2016,9 +2042,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,        continue;      InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());      normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(), -                                P); +                                DT);      normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(), -                                P); +                                DT);    }    // A list of dummy calls added to the IR to keep various values obviously @@ -2197,6 +2223,72 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,    return !records.empty();  } +// Handles both return values and arguments for Functions and CallSites. 
+template <typename AttrHolder> +static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, +                                   unsigned Index) { +  AttrBuilder R; +  if (AH.getDereferenceableBytes(Index)) +    R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, +                                  AH.getDereferenceableBytes(Index))); +  if (AH.getDereferenceableOrNullBytes(Index)) +    R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, +                                  AH.getDereferenceableOrNullBytes(Index))); + +  if (!R.empty()) +    AH.setAttributes(AH.getAttributes().removeAttributes( +        Ctx, Index, AttributeSet::get(Ctx, Index, R))); +} + +void +RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) { +  LLVMContext &Ctx = F.getContext(); + +  for (Argument &A : F.args()) +    if (isa<PointerType>(A.getType())) +      RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1); + +  if (isa<PointerType>(F.getReturnType())) +    RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); +} + +void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { +  if (F.empty()) +    return; + +  LLVMContext &Ctx = F.getContext(); +  MDBuilder Builder(Ctx); + +  for (Instruction &I : inst_range(F)) { +    if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { +      assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); +      bool IsImmutableTBAA = +          MD->getNumOperands() == 4 && +          mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1; + +      if (!IsImmutableTBAA) +        continue; // no work to do, MD_tbaa is already marked mutable + +      MDNode *Base = cast<MDNode>(MD->getOperand(0)); +      MDNode *Access = cast<MDNode>(MD->getOperand(1)); +      uint64_t Offset = +          mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue(); + +      MDNode *MutableTBAA = +          Builder.createTBAAStructTagNode(Base, Access, Offset); +      I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); +    } + +    if (CallSite CS = CallSite(&I)) { +      for (int i = 0, e = CS.arg_size(); i != e; i++) +        if (isa<PointerType>(CS.getArgument(i)->getType())) +          RemoveDerefAttrAtIndex(Ctx, CS, i + 1); +      if (isa<PointerType>(CS.getType())) +        RemoveDerefAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); +    } +  } +} +  /// Returns true if this function should be rewritten by this pass.  The main  /// point of this function is as an extension point for custom logic.  static bool shouldRewriteStatepointsIn(Function &F) { @@ -2211,6 +2303,19 @@ static bool shouldRewriteStatepointsIn(Function &F) {      return false;  } +void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) { +#ifndef NDEBUG +  assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) && +         "precondition!"); +#endif + +  for (Function &F : M) +    stripDereferenceabilityInfoFromPrototype(F); + +  for (Function &F : M) +    stripDereferenceabilityInfoFromBody(F); +} +  bool RewriteStatepointsForGC::runOnFunction(Function &F) {    // Nothing to do for declarations.    if (F.isDeclaration() || F.empty()) @@ -2221,7 +2326,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {    if (!shouldRewriteStatepointsIn(F))      return false; -  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();    // Gather all the statepoints which need rewritten.  
Be careful to only    // consider those in reachable code since we need to ask dominance queries diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 3a782d159dab9..4a875311881a5 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -852,9 +852,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {      TargetTransformInfo &TTI =          getAnalysis<TargetTransformInfoWrapperPass>().getTTI(              *GEP->getParent()->getParent()); +    unsigned AddrSpace = GEP->getPointerAddressSpace();      if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),                                     /*BaseGV=*/nullptr, AccumulativeByteOffset, -                                   /*HasBaseReg=*/true, /*Scale=*/0)) { +                                   /*HasBaseReg=*/true, /*Scale=*/0, +                                   AddrSpace)) {        return Changed;      }    } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 8566cd9736d3f..f0e3ffdb95acd 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -193,11 +193,18 @@ namespace {  struct CFGSimplifyPass : public FunctionPass {    static char ID; // Pass identification, replacement for typeid    unsigned BonusInstThreshold; -  CFGSimplifyPass(int T = -1) : FunctionPass(ID) { +  std::function<bool(const Function &)> PredicateFtor; + +  CFGSimplifyPass(int T = -1, +                  std::function<bool(const Function &)> Ftor = nullptr) +      : FunctionPass(ID), PredicateFtor(Ftor) {      BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);      initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());    }    bool runOnFunction(Function &F) override { +    if (PredicateFtor && !PredicateFtor(F)) +      return false; +      if (skipOptnoneFunction(F))        return false; @@ -224,7 +231,9 @@ INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,                      false)  // Public interface to the CFGSimplification pass -FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { -  return new CFGSimplifyPass(Threshold); +FunctionPass * +llvm::createCFGSimplificationPass(int Threshold, +                                  std::function<bool(const Function &)> Ftor) { +  return new CFGSimplifyPass(Threshold, Ftor);  } diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index b169d5612f002..078c6a921a089 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -163,7 +163,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,    }    if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { -    AliasAnalysis::Location Loc = AA->getLocation(L); +    AliasAnalysis::Location Loc = MemoryLocation::get(L);      for (Instruction *S : Stores)        if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod)          return false; @@ -172,6 +172,12 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,    if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))      return false; +  // Convergent operations can only be moved to control equivalent blocks. 
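The new std::function parameter threaded through createCFGSimplificationPass lets a client restrict simplification to functions accepted by a caller-supplied predicate. A minimal standalone sketch of that gating pattern in plain C++; the Function type and the predicate are stand-ins for illustration, not LLVM's API.

#include <cstdio>
#include <functional>
#include <string>

struct Function { std::string Name; }; // stand-in for llvm::Function

struct CFGSimplifyPassSketch {
  std::function<bool(const Function &)> PredicateFtor;

  explicit CFGSimplifyPassSketch(std::function<bool(const Function &)> Ftor = nullptr)
      : PredicateFtor(std::move(Ftor)) {}

  bool runOnFunction(const Function &F) {
    // Bail out early when a predicate is installed and rejects this function.
    if (PredicateFtor && !PredicateFtor(F))
      return false;
    std::printf("simplifying %s\n", F.Name.c_str());
    return true;
  }
};

int main() {
  // Hypothetical policy: only touch functions whose names start with "gc.".
  CFGSimplifyPassSketch Pass([](const Function &F) {
    return F.Name.compare(0, 3, "gc.") == 0;
  });
  Function A{"gc.safepoint_poll"}, B{"main"};
  Pass.runOnFunction(A); // runs
  Pass.runOnFunction(B); // skipped by the predicate
  return 0;
}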
+  if (auto CS = CallSite(Inst)) { +    if (CS.hasFnAttr(Attribute::Convergent)) +      return false; +  } +    return true;  } diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index a5890c0adb067..5f25e6b2cb6f9 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -491,6 +491,9 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,    const DataLayout &DL = Phi->getModule()->getDataLayout();    int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType)); +  if (!Size) +    return false; +    int64_t CVSize = CV->getSExtValue();    if (CVSize % Size)      return false; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 623dbc9f42ce2..a87f8504bfb5e 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -636,7 +636,7 @@ void PromoteMem2Reg::run() {    // and inserting the phi nodes we marked as necessary    //    std::vector<RenamePassData> RenamePassWorkList; -  RenamePassWorkList.push_back(RenamePassData(F.begin(), nullptr, Values)); +  RenamePassWorkList.emplace_back(F.begin(), nullptr, std::move(Values));    do {      RenamePassData RPD;      RPD.swap(RenamePassWorkList.back()); @@ -973,7 +973,7 @@ NextIteration:    for (; I != E; ++I)      if (VisitedSuccs.insert(*I).second) -      Worklist.push_back(RenamePassData(*I, Pred, IncomingVals)); +      Worklist.emplace_back(*I, Pred, IncomingVals);    goto NextIteration;  } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 3757a801d645e..ab30aa17c76bc 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -141,7 +141,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)    ++NumElimOperand;    Changed = true;    if (IVOperand->use_empty()) -    DeadInsts.push_back(IVOperand); +    DeadInsts.emplace_back(IVOperand);    return IVSrc;  } @@ -178,7 +178,7 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');    ++NumElimCmp;    Changed = true; -  DeadInsts.push_back(ICmp); +  DeadInsts.emplace_back(ICmp);  }  /// SimplifyIVUsers helper for eliminating useless @@ -229,7 +229,7 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,    DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');    ++NumElimRem;    Changed = true; -  DeadInsts.push_back(Rem); +  DeadInsts.emplace_back(Rem);  }  /// Eliminate an operation that consumes a simple IV and has @@ -260,7 +260,7 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,    UseInst->replaceAllUsesWith(IVOperand);    ++NumElimIdentity;    Changed = true; -  DeadInsts.push_back(UseInst); +  DeadInsts.emplace_back(UseInst);    return true;  } @@ -386,7 +386,7 @@ Instruction *SimplifyIndvar::splitOverflowIntrinsic(Instruction *IVUser,           "Bad add instruction created from overflow intrinsic.");    AddVal->replaceAllUsesWith(AddInst); -  DeadInsts.push_back(AddVal); +  DeadInsts.emplace_back(AddVal);    return AddInst;  } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index cac80accc32f4..8c72641da9e7a 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -400,8 +400,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy 
&VMap,    }    if (auto *AI = dyn_cast<AllocaInst>(I))      AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType())); -  if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) +  if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {      GEP->setSourceElementType(          TypeMapper->remapType(GEP->getSourceElementType())); +    GEP->setResultElementType( +        TypeMapper->remapType(GEP->getResultElementType())); +  }    I->mutateType(TypeMapper->remapType(I->getType()));  } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 011fd0f6fa888..95c9381985ab7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@  // Variable uniformity checks are inspired by:  //  Karrenberg, R. and Hack, S. Whole Function Vectorization.  // +// The interleaved access vectorization is based on the paper: +//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved +//  Data for SIMD +//  // Other ideas/concepts are from:  //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.  // @@ -134,6 +138,16 @@ static cl::opt<bool> EnableMemAccessVersioning(      "enable-mem-access-versioning", cl::init(true), cl::Hidden,      cl::desc("Enable symblic stride memory access versioning")); +static cl::opt<bool> EnableInterleavedMemAccesses( +    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, +    cl::desc("Enable vectorization on interleaved memory accesses in a loop")); + +/// Maximum factor for an interleaved memory access. +static cl::opt<unsigned> MaxInterleaveGroupFactor( +    "max-interleave-group-factor", cl::Hidden, +    cl::desc("Maximum factor for an interleaved access group (default = 8)"), +    cl::init(8)); +  /// We don't unroll loops with a known constant trip count below this number.  static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +365,9 @@ protected:    /// broadcast them into a vector.    VectorParts &getVectorValue(Value *V); +  /// Try to vectorize the interleaved access group that \p Instr belongs to. +  void vectorizeInterleaveGroup(Instruction *Instr); +    /// Generate a shuffle sequence that will reverse the vector Vec.    virtual Value *reverseVector(Value *Vec); @@ -545,6 +562,219 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F        propagateMetadata(I, From);  } +/// \brief The group of interleaved loads/stores sharing the same stride and +/// close to each other. +/// +/// Each member in this group has an index starting from 0, and the largest +/// index should be less than interleaved factor, which is equal to the absolute +/// value of the access's stride. +/// +/// E.g. An interleaved load group of factor 4: +///        for (unsigned i = 0; i < 1024; i+=4) { +///          a = A[i];                           // Member of index 0 +///          b = A[i+1];                         // Member of index 1 +///          d = A[i+3];                         // Member of index 3 +///          ... +///        } +/// +///      An interleaved store group of factor 4: +///        for (unsigned i = 0; i < 1024; i+=4) { +///          ... 
+///          A[i]   = a;                         // Member of index 0 +///          A[i+1] = b;                         // Member of index 1 +///          A[i+2] = c;                         // Member of index 2 +///          A[i+3] = d;                         // Member of index 3 +///        } +/// +/// Note: the interleaved load group could have gaps (missing members), but +/// the interleaved store group doesn't allow gaps. +class InterleaveGroup { +public: +  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) +      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { +    assert(Align && "The alignment should be non-zero"); + +    Factor = std::abs(Stride); +    assert(Factor > 1 && "Invalid interleave factor"); + +    Reverse = Stride < 0; +    Members[0] = Instr; +  } + +  bool isReverse() const { return Reverse; } +  unsigned getFactor() const { return Factor; } +  unsigned getAlignment() const { return Align; } +  unsigned getNumMembers() const { return Members.size(); } + +  /// \brief Try to insert a new member \p Instr with index \p Index and +  /// alignment \p NewAlign. The index is related to the leader and it could be +  /// negative if it is the new leader. +  /// +  /// \returns false if the instruction doesn't belong to the group. +  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { +    assert(NewAlign && "The new member's alignment should be non-zero"); + +    int Key = Index + SmallestKey; + +    // Skip if there is already a member with the same index. +    if (Members.count(Key)) +      return false; + +    if (Key > LargestKey) { +      // The largest index is always less than the interleave factor. +      if (Index >= static_cast<int>(Factor)) +        return false; + +      LargestKey = Key; +    } else if (Key < SmallestKey) { +      // The largest index is always less than the interleave factor. +      if (LargestKey - Key >= static_cast<int>(Factor)) +        return false; + +      SmallestKey = Key; +    } + +    // It's always safe to select the minimum alignment. +    Align = std::min(Align, NewAlign); +    Members[Key] = Instr; +    return true; +  } + +  /// \brief Get the member with the given index \p Index +  /// +  /// \returns nullptr if contains no such member. +  Instruction *getMember(unsigned Index) const { +    int Key = SmallestKey + Index; +    if (!Members.count(Key)) +      return nullptr; + +    return Members.find(Key)->second; +  } + +  /// \brief Get the index for the given member. Unlike the key in the member +  /// map, the index starts from 0. +  unsigned getIndex(Instruction *Instr) const { +    for (auto I : Members) +      if (I.second == Instr) +        return I.first - SmallestKey; + +    llvm_unreachable("InterleaveGroup contains no such member"); +  } + +  Instruction *getInsertPos() const { return InsertPos; } +  void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + +private: +  unsigned Factor; // Interleave Factor. +  bool Reverse; +  unsigned Align; +  DenseMap<int, Instruction *> Members; +  int SmallestKey; +  int LargestKey; + +  // To avoid breaking dependences, vectorized instructions of an interleave +  // group should be inserted at either the first load or the last store in +  // program order. +  // +  // E.g. 
%even = load i32             // Insert Position +  //      %add = add i32 %even         // Use of %even +  //      %odd = load i32 +  // +  //      store i32 %even +  //      %odd = add i32               // Def of %odd +  //      store i32 %odd               // Insert Position +  Instruction *InsertPos; +}; + +/// \brief Drive the analysis of interleaved memory accesses in the loop. +/// +/// Use this class to analyze interleaved accesses only when we can vectorize +/// a loop. Otherwise it's meaningless to do analysis as the vectorization +/// on interleaved accesses is unsafe. +/// +/// The analysis collects interleave groups and records the relationships +/// between the member and the group in a map. +class InterleavedAccessInfo { +public: +  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) +      : SE(SE), TheLoop(L), DT(DT) {} + +  ~InterleavedAccessInfo() { +    SmallSet<InterleaveGroup *, 4> DelSet; +    // Avoid releasing a pointer twice. +    for (auto &I : InterleaveGroupMap) +      DelSet.insert(I.second); +    for (auto *Ptr : DelSet) +      delete Ptr; +  } + +  /// \brief Analyze the interleaved accesses and collect them in interleave +  /// groups. Substitute symbolic strides using \p Strides. +  void analyzeInterleaving(const ValueToValueMap &Strides); + +  /// \brief Check if \p Instr belongs to any interleave group. +  bool isInterleaved(Instruction *Instr) const { +    return InterleaveGroupMap.count(Instr); +  } + +  /// \brief Get the interleave group that \p Instr belongs to. +  /// +  /// \returns nullptr if doesn't have such group. +  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { +    if (InterleaveGroupMap.count(Instr)) +      return InterleaveGroupMap.find(Instr)->second; +    return nullptr; +  } + +private: +  ScalarEvolution *SE; +  Loop *TheLoop; +  DominatorTree *DT; + +  /// Holds the relationships between the members and the interleave group. +  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; + +  /// \brief The descriptor for a strided memory access. +  struct StrideDescriptor { +    StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, +                     unsigned Align) +        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + +    StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} + +    int Stride; // The access's stride. It is negative for a reverse access. +    const SCEV *Scev; // The scalar expression of this access +    unsigned Size;    // The size of the memory object. +    unsigned Align;   // The alignment of this access. +  }; + +  /// \brief Create a new interleave group with the given instruction \p Instr, +  /// stride \p Stride and alignment \p Align. +  /// +  /// \returns the newly created interleave group. +  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, +                                         unsigned Align) { +    assert(!InterleaveGroupMap.count(Instr) && +           "Already in an interleaved access group"); +    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); +    return InterleaveGroupMap[Instr]; +  } + +  /// \brief Release the group and remove all the relationships. +  void releaseGroup(InterleaveGroup *Group) { +    for (unsigned i = 0; i < Group->getFactor(); i++) +      if (Instruction *Member = Group->getMember(i)) +        InterleaveGroupMap.erase(Member); + +    delete Group; +  } + +  /// \brief Collect all the accesses with a constant stride in program order. 
+  void collectConstStridedAccesses( +      MapVector<Instruction *, StrideDescriptor> &StrideAccesses, +      const ValueToValueMap &Strides); +}; +  /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and  /// to what vectorization factor.  /// This class does not look at the profitability of vectorization, only the @@ -565,8 +795,8 @@ public:                              Function *F, const TargetTransformInfo *TTI,                              LoopAccessAnalysis *LAA)        : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), -        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), -        WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} +        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), +        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}    /// This enum represents the kinds of inductions that we support.    enum InductionKind { @@ -697,6 +927,16 @@ public:      return LAI;    } +  /// \brief Check if \p Instr belongs to any interleaved access group. +  bool isAccessInterleaved(Instruction *Instr) { +    return InterleaveInfo.isInterleaved(Instr); +  } + +  /// \brief Get the interleaved access group that \p Instr belongs to. +  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { +    return InterleaveInfo.getInterleaveGroup(Instr); +  } +    unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }    bool hasStride(Value *V) { return StrideSet.count(V); } @@ -792,6 +1032,10 @@ private:    // null until canVectorizeMemory sets it up.    const LoopAccessInfo *LAI; +  /// The interleave access information contains groups of interleaved accesses +  /// with the same stride and close to each other. +  InterleavedAccessInfo InterleaveInfo; +    //  ---  vectorization state --- //    /// Holds the integer induction variable. This is the counter of the @@ -1657,6 +1901,251 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {                                       "reverse");  } +// Get a mask to interleave \p NumVec vectors into a wide vector. +// I.e.  <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +//      <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, +                                    unsigned NumVec) { +  SmallVector<Constant *, 16> Mask; +  for (unsigned i = 0; i < VF; i++) +    for (unsigned j = 0; j < NumVec; j++) +      Mask.push_back(Builder.getInt32(j * VF + i)); + +  return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.e.  <Start, Start + Stride, ..., Start + Stride*(VF-1)> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, +                                unsigned Stride, unsigned VF) { +  SmallVector<Constant *, 16> Mask; +  for (unsigned i = 0; i < VF; i++) +    Mask.push_back(Builder.getInt32(Start + i * Stride)); + +  return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consists of sequential integers +// starting from 0, The second part consists of UNDEFs. +// I.e. 
<0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, +                                   unsigned NumUndef) { +  SmallVector<Constant *, 16> Mask; +  for (unsigned i = 0; i < NumInt; i++) +    Mask.push_back(Builder.getInt32(i)); + +  Constant *Undef = UndefValue::get(Builder.getInt32Ty()); +  for (unsigned i = 0; i < NumUndef; i++) +    Mask.push_back(Undef); + +  return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2nd vector should +// not have more elements than the 1st vector. If the 2nd vector has less +// elements, extend it with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, +                                    Value *V2) { +  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); +  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); +  assert(VecTy1 && VecTy2 && +         VecTy1->getScalarType() == VecTy2->getScalarType() && +         "Expect two vectors with the same element type"); + +  unsigned NumElts1 = VecTy1->getNumElements(); +  unsigned NumElts2 = VecTy2->getNumElements(); +  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + +  if (NumElts1 > NumElts2) { +    // Extend with UNDEFs. +    Constant *ExtMask = +        getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); +    V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); +  } + +  Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); +  return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. +static Value *ConcatenateVectors(IRBuilder<> &Builder, +                                 ArrayRef<Value *> InputList) { +  unsigned NumVec = InputList.size(); +  assert(NumVec > 1 && "Should be at least two vectors"); + +  SmallVector<Value *, 8> ResList; +  ResList.append(InputList.begin(), InputList.end()); +  do { +    SmallVector<Value *, 8> TmpList; +    for (unsigned i = 0; i < NumVec - 1; i += 2) { +      Value *V0 = ResList[i], *V1 = ResList[i + 1]; +      assert((V0->getType() == V1->getType() || i == NumVec - 2) && +             "Only the last vector may have a different type"); + +      TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); +    } + +    // Push the last vector if the total number of vectors is odd. +    if (NumVec % 2 != 0) +      TmpList.push_back(ResList[NumVec - 1]); + +    ResList = TmpList; +    NumVec = ResList.size(); +  } while (NumVec > 1); + +  return ResList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (factor = 3): +//   for (i = 0; i < N; i+=3) { +//     R = Pic[i];             // Member of index 0 +//     G = Pic[i+1];           // Member of index 1 +//     B = Pic[i+2];           // Member of index 2 +//     ... // do something to R, G, B +//   } +// To: +//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B +//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements +//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements +//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements +// +// Or translate following interleaved store group (factor = 3): +//   for (i = 0; i < N; i+=3) { +//     ... 
do something to R, G, B +//     Pic[i]   = R;           // Member of index 0 +//     Pic[i+1] = G;           // Member of index 1 +//     Pic[i+2] = B;           // Member of index 2 +//   } +// To: +//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements +//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { +  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); +  assert(Group && "Fail to get an interleaved access group."); + +  // Skip if current instruction is not the insert position. +  if (Instr != Group->getInsertPos()) +    return; + +  LoadInst *LI = dyn_cast<LoadInst>(Instr); +  StoreInst *SI = dyn_cast<StoreInst>(Instr); +  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + +  // Prepare for the vector type of the interleaved load/store. +  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); +  unsigned InterleaveFactor = Group->getFactor(); +  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); +  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + +  // Prepare for the new pointers. +  setDebugLocFromInst(Builder, Ptr); +  VectorParts &PtrParts = getVectorValue(Ptr); +  SmallVector<Value *, 2> NewPtrs; +  unsigned Index = Group->getIndex(Instr); +  for (unsigned Part = 0; Part < UF; Part++) { +    // Extract the pointer for current instruction from the pointer vector. A +    // reverse access uses the pointer in the last lane. +    Value *NewPtr = Builder.CreateExtractElement( +        PtrParts[Part], +        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + +    // Notice current instruction could be any index. Need to adjust the address +    // to the member of index 0. +    // +    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction) +    //       b = A[i];       // Member of index 0 +    // Current pointer is pointed to A[i+1], adjust it to A[i]. +    // +    // E.g.  A[i+1] = a;     // Member of index 1 +    //       A[i]   = b;     // Member of index 0 +    //       A[i+2] = c;     // Member of index 2 (Current instruction) +    // Current pointer is pointed to A[i+2], adjust it to A[i]. +    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + +    // Cast to the vector pointer type. +    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); +  } + +  setDebugLocFromInst(Builder, Instr); +  Value *UndefVec = UndefValue::get(VecTy); + +  // Vectorize the interleaved load group. +  if (LI) { +    for (unsigned Part = 0; Part < UF; Part++) { +      Instruction *NewLoadInstr = Builder.CreateAlignedLoad( +          NewPtrs[Part], Group->getAlignment(), "wide.vec"); + +      for (unsigned i = 0; i < InterleaveFactor; i++) { +        Instruction *Member = Group->getMember(i); + +        // Skip the gaps in the group. +        if (!Member) +          continue; + +        Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); +        Value *StridedVec = Builder.CreateShuffleVector( +            NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + +        // If this member has different type, cast the result type. 
+        if (Member->getType() != ScalarTy) { +          VectorType *OtherVTy = VectorType::get(Member->getType(), VF); +          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); +        } + +        VectorParts &Entry = WidenMap.get(Member); +        Entry[Part] = +            Group->isReverse() ? reverseVector(StridedVec) : StridedVec; +      } + +      propagateMetadata(NewLoadInstr, Instr); +    } +    return; +  } + +  // The sub vector type for current instruction. +  VectorType *SubVT = VectorType::get(ScalarTy, VF); + +  // Vectorize the interleaved store group. +  for (unsigned Part = 0; Part < UF; Part++) { +    // Collect the stored vector from each member. +    SmallVector<Value *, 4> StoredVecs; +    for (unsigned i = 0; i < InterleaveFactor; i++) { +      // Interleaved store group doesn't allow a gap, so each index has a member +      Instruction *Member = Group->getMember(i); +      assert(Member && "Fail to get a member from an interleaved store group"); + +      Value *StoredVec = +          getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part]; +      if (Group->isReverse()) +        StoredVec = reverseVector(StoredVec); + +      // If this member has different type, cast it to an unified type. +      if (StoredVec->getType() != SubVT) +        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + +      StoredVecs.push_back(StoredVec); +    } + +    // Concatenate all vectors into a wide vector. +    Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + +    // Interleave the elements in the wide vector. +    Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); +    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, +                                              "interleaved.vec"); + +    Instruction *NewStoreInstr = +        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); +    propagateMetadata(NewStoreInstr, Instr); +  } +} +  void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    // Attempt to issue a wide load.    LoadInst *LI = dyn_cast<LoadInst>(Instr); @@ -1664,6 +2153,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    assert((LI || SI) && "Invalid Load/Store instruction"); +  // Try to vectorize the interleave group if this access is interleaved. +  if (Legal->isAccessInterleaved(Instr)) +    return vectorizeInterleaveGroup(Instr); +    Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();    Type *DataTy = VectorType::get(ScalarDataTy, VF);    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); @@ -3408,6 +3901,10 @@ bool LoopVectorizationLegality::canVectorize() {           "")          <<"!\n"); +  // Analyze interleaved memory accesses. +  if (EnableInterleavedMemAccesses) +    InterleaveInfo.analyzeInterleaving(Strides); +    // Okay! We can vectorize. At this point we don't have any other mem analysis    // which may limit our maximum vectorization factor, so just return true with    // no restrictions. @@ -3923,6 +4420,166 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,    return true;  } +void InterleavedAccessInfo::collectConstStridedAccesses( +    MapVector<Instruction *, StrideDescriptor> &StrideAccesses, +    const ValueToValueMap &Strides) { +  // Holds load/store instructions in program order. 
+  SmallVector<Instruction *, 16> AccessList; + +  for (auto *BB : TheLoop->getBlocks()) { +    bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); + +    for (auto &I : *BB) { +      if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I)) +        continue; +      // FIXME: Currently we can't handle mixed accesses and predicated accesses +      if (IsPred) +        return; + +      AccessList.push_back(&I); +    } +  } + +  if (AccessList.empty()) +    return; + +  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); +  for (auto I : AccessList) { +    LoadInst *LI = dyn_cast<LoadInst>(I); +    StoreInst *SI = dyn_cast<StoreInst>(I); + +    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); +    int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + +    // The factor of the corresponding interleave group. +    unsigned Factor = std::abs(Stride); + +    // Ignore the access if the factor is too small or too large. +    if (Factor < 2 || Factor > MaxInterleaveGroupFactor) +      continue; + +    const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); +    PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); +    unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + +    // An alignment of 0 means target ABI alignment. +    unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); +    if (!Align) +      Align = DL.getABITypeAlignment(PtrTy->getElementType()); + +    StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); +  } +} + +// Analyze interleaved accesses and collect them into interleave groups. +// +// Notice that the vectorization on interleaved groups will change instruction +// orders and may break dependences. But the memory dependence check guarantees +// that there is no overlap between two pointers of different strides, element +// sizes or underlying bases. +// +// For pointers sharing the same stride, element size and underlying base, no +// need to worry about Read-After-Write dependences and Write-After-Read +// dependences. +// +// E.g. The RAW dependence:  A[i] = a; +//                           b = A[i]; +// This won't exist as it is a store-load forwarding conflict, which has +// already been checked and forbidden in the dependence check. +// +// E.g. The WAR dependence:  a = A[i];  // (1) +//                           A[i] = b;  // (2) +// The store group of (2) is always inserted at or below (2), and the load group +// of (1) is always inserted at or above (1). The dependence is safe. +void InterleavedAccessInfo::analyzeInterleaving( +    const ValueToValueMap &Strides) { +  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); + +  // Holds all the stride accesses. +  MapVector<Instruction *, StrideDescriptor> StrideAccesses; +  collectConstStridedAccesses(StrideAccesses, Strides); + +  if (StrideAccesses.empty()) +    return; + +  // Holds all interleaved store groups temporarily. +  SmallSetVector<InterleaveGroup *, 4> StoreGroups; + +  // Search the load-load/write-write pair B-A in bottom-up order and try to +  // insert B into the interleave group of A according to 3 rules: +  //   1. A and B have the same stride. +  //   2. A and B have the same memory object size. +  //   3. B belongs to the group according to the distance. +  // +  // The bottom-up order can avoid breaking the Write-After-Write dependences +  // between two pointers of the same base. +  // E.g.  
A[i]   = a;   (1) +  //       A[i]   = b;   (2) +  //       A[i+1] = c    (3) +  // We form the group (2)+(3) in front, so (1) has to form groups with accesses +  // above (1), which guarantees that (1) is always above (2). +  for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E; +       ++I) { +    Instruction *A = I->first; +    StrideDescriptor DesA = I->second; + +    InterleaveGroup *Group = getInterleaveGroup(A); +    if (!Group) { +      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); +      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); +    } + +    if (A->mayWriteToMemory()) +      StoreGroups.insert(Group); + +    for (auto II = std::next(I); II != E; ++II) { +      Instruction *B = II->first; +      StrideDescriptor DesB = II->second; + +      // Ignore if B is already in a group or B is a different memory operation. +      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory()) +        continue; + +      // Check the rule 1 and 2. +      if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) +        continue; + +      // Calculate the distance and prepare for the rule 3. +      const SCEVConstant *DistToA = +          dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); +      if (!DistToA) +        continue; + +      int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + +      // Skip if the distance is not multiple of size as they are not in the +      // same group. +      if (DistanceToA % static_cast<int>(DesA.Size)) +        continue; + +      // The index of B is the index of A plus the related index to A. +      int IndexB = +          Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size); + +      // Try to insert B into the group. +      if (Group->insertMember(B, IndexB, DesB.Align)) { +        DEBUG(dbgs() << "LV: Inserted:" << *B << '\n' +                     << "    into the interleave group with" << *A << '\n'); +        InterleaveGroupMap[B] = Group; + +        // Set the first load in program order as the insert position. +        if (B->mayReadFromMemory()) +          Group->setInsertPos(B); +      } +    } // Iteration on instruction B +  }   // Iteration on instruction A + +  // Remove interleaved store groups with gaps. +  for (InterleaveGroup *Group : StoreGroups) +    if (Group->getNumMembers() != Group->getFactor()) +      releaseGroup(Group); +} +  LoopVectorizationCostModel::VectorizationFactor  LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {    // Width 1 means no vectorize @@ -4575,6 +5232,46 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {        return TTI.getAddressComputationCost(VectorTy) +          TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); +    // For an interleaved access, calculate the total cost of the whole +    // interleave group. +    if (Legal->isAccessInterleaved(I)) { +      auto Group = Legal->getInterleavedAccessGroup(I); +      assert(Group && "Fail to get an interleaved access group."); + +      // Only calculate the cost once at the insert position. +      if (Group->getInsertPos() != I) +        return 0; + +      unsigned InterleaveFactor = Group->getFactor(); +      Type *WideVecTy = +          VectorType::get(VectorTy->getVectorElementType(), +                          VectorTy->getVectorNumElements() * InterleaveFactor); + +      // Holds the indices of existing members in an interleaved load group. 
+      // An interleaved store group doesn't need this as it doesn't allow gaps.
+      SmallVector<unsigned, 4> Indices;
+      if (LI) {
+        for (unsigned i = 0; i < InterleaveFactor; i++)
+          if (Group->getMember(i))
+            Indices.push_back(i);
+      }
+
+      // Calculate the cost of the whole interleaved group.
+      unsigned Cost = TTI.getInterleavedMemoryOpCost(
+          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+          Group->getAlignment(), AS);
+
+      if (Group->isReverse())
+        Cost +=
+            Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+
+      // FIXME: The interleaved load group with a huge gap could be even more
+      // expensive than scalar operations. Then we could ignore such a group and
+      // use scalar operations instead.
+      return Cost;
+    }
+
      // Scalarized loads/stores.
     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
     bool Reverse = ConsecutiveStride < 0;
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 504425eae406b..a3a45c80d8504 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -317,9 +317,9 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 /// \returns the AA location that is being access by the instruction.
 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return AA->getLocation(SI);
+    return MemoryLocation::get(SI);
   if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return AA->getLocation(LI);
+    return MemoryLocation::get(LI);
   return AliasAnalysis::Location();
 }
@@ -472,7 +472,7 @@ private:
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
-    VectorizableTree.push_back(TreeEntry());
+    VectorizableTree.emplace_back();
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
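
The shuffle-mask helpers added by this commit (getInterleavedMask, getStridedMask) encode the index arithmetic that interleaves per-member vectors into one wide vector and de-interleaves it again. Below is a minimal standalone sketch of that arithmetic, using plain std::vector<int> instead of LLVM ConstantVector; the function names interleavedMask and stridedMask are hypothetical stand-ins that only mirror the formulas documented in the patch comments.

// interleave_masks.cpp - illustrative sketch only, not part of the patch.
#include <cassert>
#include <cstdio>
#include <vector>

// Interleave mask for NumVec vectors of VF lanes each:
// <0, VF, 2*VF, ..., 1, VF+1, 2*VF+1, ...>
static std::vector<int> interleavedMask(unsigned VF, unsigned NumVec) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < NumVec; ++j)
      Mask.push_back(j * VF + i);
  return Mask;
}

// Strided (de-interleave) mask: <Start, Start+Stride, ..., Start+Stride*(VF-1)>
static std::vector<int> stridedMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back(Start + i * Stride);
  return Mask;
}

int main() {
  // Two 4-lane vectors interleave as <0, 4, 1, 5, 2, 6, 3, 7>,
  // matching the example given above getInterleavedMask.
  std::vector<int> IM = interleavedMask(4, 2);
  assert(IM[0] == 0 && IM[1] == 4 && IM[2] == 1 && IM[3] == 5);

  // De-interleaving the G channel (member index 1) of an RGB group at VF = 4
  // selects elements <1, 4, 7, 10> of the 12-element wide vector.
  for (int Idx : stridedMask(1, 3, 4))
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}

For the factor-3 RGB store example in vectorizeInterleaveGroup, interleavedMask(4, 3) reproduces the <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> mask shown in the comment, and stridedMask(i, 3, 4) for i = 0..2 recovers the three channels on the load side.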

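The grouping logic in analyzeInterleaving and InterleaveGroup::insertMember reduces to one test: an access B joins A's group when their constant distance is a multiple of the element size and the resulting index stays within the interleave factor. The following is a small self-contained model of that bookkeeping under those assumptions; the Group struct and its insert method are hypothetical simplifications, not LLVM types.

// group_index.cpp - illustrative model only, not LLVM code.
#include <cassert>
#include <map>

struct Group {
  int Factor;                 // Interleave factor, i.e. |stride| of the accesses.
  std::map<int, int> Members; // key -> access id
  int SmallestKey = 0;
  int LargestKey = 0;

  // Mirrors the bounds checks performed by InterleaveGroup::insertMember.
  bool insert(int Id, int Index) {
    int Key = Index + SmallestKey;
    if (Members.count(Key))
      return false;                 // Slot already occupied.
    if (Key > LargestKey) {
      if (Index >= Factor)
        return false;               // Index must stay below the factor.
      LargestKey = Key;
    } else if (Key < SmallestKey) {
      if (LargestKey - Key >= Factor)
        return false;               // Group would span more than Factor slots.
      SmallestKey = Key;
    }
    Members[Key] = Id;
    return true;
  }
};

int main() {
  // A factor-4 group seeded with the leader (e.g. A[i]) at index 0.
  Group G{4, {{0, 0}}};

  // An access 8 bytes past the leader with 4-byte elements gets index 2 (A[i+2]).
  int Distance = 8, Size = 4;
  assert(Distance % Size == 0 && "otherwise the accesses are not in the same group");
  bool Joined = G.insert(/*Id=*/1, /*Index=*/0 + Distance / Size);
  assert(Joined);

  // A 16-byte distance would require index 4, which a factor-4 group rejects.
  bool Rejected = !G.insert(/*Id=*/2, 16 / Size);
  assert(Rejected);
  (void)Joined;
  (void)Rejected;
  return 0;
}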