Diffstat (limited to 'llvm/lib/Transforms'): 107 files changed, 5344 insertions, 3708 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 68a34bdcb1cd..1533e1805f17 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -176,11 +176,14 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { lowerCoroNoop(cast<IntrinsicInst>(&I)); break; case Intrinsic::coro_id: - // Mark a function that comes out of the frontend that has a coro.id - // with a coroutine attribute. if (auto *CII = cast<CoroIdInst>(&I)) { if (CII->getInfo().isPreSplit()) { - F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT); + assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) && + F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() == + UNPREPARED_FOR_SPLIT && + "The frontend uses Swtich-Resumed ABI should emit " + "\"coroutine.presplit\" attribute with value \"0\" for the " + "coroutine."); setCannotDuplicate(CII); CII->setCoroutineSelf(); CoroId = cast<CoroIdInst>(&I); @@ -190,6 +193,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { case Intrinsic::coro_id_retcon: case Intrinsic::coro_id_retcon_once: case Intrinsic::coro_id_async: + // TODO: Remove the line once we support it in the corresponding + // frontend. F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); break; case Intrinsic::coro_resume: diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index a0d12865bd3a..92acfb93057a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -587,7 +587,7 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F, } }); - if (!Shape.ReuseFrameSlot && !EnableReuseStorageInFrame) { + if (!Shape.OptimizeFrame && !EnableReuseStorageInFrame) { for (const auto &A : FrameData.Allocas) { AllocaInst *Alloca = A.Alloca; NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); @@ -808,7 +808,7 @@ static StringRef solveTypeName(Type *Ty) { if (Ty->isPointerTy()) { auto *PtrTy = cast<PointerType>(Ty); - Type *PointeeTy = PtrTy->getElementType(); + Type *PointeeTy = PtrTy->getPointerElementType(); auto Name = solveTypeName(PointeeTy); if (Name == "UnknownType") return "PointerType"; @@ -1659,7 +1659,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, &*Builder.GetInsertPoint()); // This dbg.declare is for the main function entry point. It // will be deleted in all coro-split functions. - coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.ReuseFrameSlot); + coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame); } } @@ -2278,7 +2278,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); auto ArgTy = cast<PointerType>(Arg.getType()); - auto ValueTy = ArgTy->getElementType(); + auto ValueTy = ArgTy->getPointerElementType(); // Reduce to the alloca case: @@ -2506,7 +2506,7 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape, void coro::salvageDebugInfo( SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache, - DbgVariableIntrinsic *DVI, bool ReuseFrameSlot) { + DbgVariableIntrinsic *DVI, bool OptimizeFrame) { Function *F = DVI->getFunction(); IRBuilder<> Builder(F->getContext()); auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); @@ -2558,7 +2558,7 @@ void coro::salvageDebugInfo( // // Avoid to create the alloca would be eliminated by optimization // passes and the corresponding dbg.declares would be invalid. 
- if (!ReuseFrameSlot && !EnableReuseStorageInFrame) + if (!OptimizeFrame && !EnableReuseStorageInFrame) if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) { auto &Cached = DbgPtrAllocaCache[Storage]; if (!Cached) { diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h index bf3d781ba43e..014938c15a0a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -599,6 +599,18 @@ public: } }; +/// This represents the llvm.coro.align instruction. +class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_align; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { enum { FrameArg, UnwindArg }; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 27ba8524f975..9a17068df3a9 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -36,6 +36,11 @@ void initializeCoroCleanupLegacyPass(PassRegistry &); // adds coroutine subfunctions to the SCC to be processed by IPO pipeline. // Async lowering similarily triggers a restart of the pipeline after it has // split the coroutine. +// +// FIXME: Refactor these attributes as LLVM attributes instead of string +// attributes since these attributes are already used outside LLVM's +// coroutine module. +// FIXME: Remove these values once we remove the Legacy PM. #define CORO_PRESPLIT_ATTR "coroutine.presplit" #define UNPREPARED_FOR_SPLIT "0" #define PREPARED_FOR_SPLIT "1" @@ -54,7 +59,7 @@ void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs, /// holding a pointer to the coroutine frame. void salvageDebugInfo( SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache, - DbgVariableIntrinsic *DVI, bool ReuseFrameSlot); + DbgVariableIntrinsic *DVI, bool OptimizeFrame); // Keeps data and helper functions for lowering coroutine intrinsics. struct LowererBase { @@ -99,6 +104,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { CoroBeginInst *CoroBegin; SmallVector<AnyCoroEndInst *, 4> CoroEnds; SmallVector<CoroSizeInst *, 2> CoroSizes; + SmallVector<CoroAlignInst *, 2> CoroAligns; SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends; SmallVector<CallInst*, 2> SwiftErrorOps; @@ -126,7 +132,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { BasicBlock *AllocaSpillBlock; /// This would only be true if optimization are enabled. 
- bool ReuseFrameSlot; + bool OptimizeFrame; struct SwitchLoweringStorage { SwitchInst *ResumeSwitch; @@ -272,8 +278,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape { void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; Shape() = default; - explicit Shape(Function &F, bool ReuseFrameSlot = false) - : ReuseFrameSlot(ReuseFrameSlot) { + explicit Shape(Function &F, bool OptimizeFrame = false) + : OptimizeFrame(OptimizeFrame) { buildFrom(F); } void buildFrom(Function &F); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 12c1829524ef..b5129809c6a6 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -617,7 +618,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, Value *CachedSlot = nullptr; auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * { if (CachedSlot) { - assert(CachedSlot->getType()->getPointerElementType() == ValueTy && + assert(cast<PointerType>(CachedSlot->getType()) + ->isOpaqueOrPointeeTypeMatches(ValueTy) && "multiple swifterror slots in function with different types"); return CachedSlot; } @@ -626,7 +628,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, for (auto &Arg : F.args()) { if (Arg.isSwiftError()) { CachedSlot = &Arg; - assert(Arg.getType()->getPointerElementType() == ValueTy && + assert(cast<PointerType>(Arg.getType()) + ->isOpaqueOrPointeeTypeMatches(ValueTy) && "swifterror argument does not have expected type"); return &Arg; } @@ -682,7 +685,7 @@ void CoroCloner::salvageDebugInfo() { if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) Worklist.push_back(DVI); for (DbgVariableIntrinsic *DVI : Worklist) - coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.ReuseFrameSlot); + coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.OptimizeFrame); // Remove all salvaged dbg.declare intrinsics that became // either unreachable or stale due to the CoroSplit transformation. @@ -835,7 +838,7 @@ Value *CoroCloner::deriveNewFramePointer() { static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex, uint64_t Size, Align Alignment) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::NonNull); ParamAttrs.addAttribute(Attribute::NoAlias); ParamAttrs.addAlignmentAttr(Alignment); @@ -845,14 +848,14 @@ static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context, static void addAsyncContextAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::SwiftAsync); Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs); } static void addSwiftSelfAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::SwiftSelf); Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs); } @@ -929,7 +932,7 @@ void CoroCloner::create() { case coro::ABI::Switch: // Bootstrap attributes by copying function attributes from the // original function. This should include optimization settings and so on. 
- NewAttrs = NewAttrs.addFnAttributes(Context, OrigAttrs.getFnAttrs()); + NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, OrigAttrs.getFnAttrs())); addFramePointerAttrs(NewAttrs, Context, 0, Shape.FrameSize, Shape.FrameAlign); @@ -952,7 +955,7 @@ void CoroCloner::create() { // Transfer the original function's attributes. auto FnAttrs = OrigF.getAttributes().getFnAttrs(); - NewAttrs = NewAttrs.addFnAttributes(Context, FnAttrs); + NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, FnAttrs)); break; } case coro::ABI::Retcon: @@ -1082,10 +1085,16 @@ static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) { Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct); } -static void replaceFrameSize(coro::Shape &Shape) { +static void replaceFrameSizeAndAlignment(coro::Shape &Shape) { if (Shape.ABI == coro::ABI::Async) updateAsyncFuncPointerContextSize(Shape); + for (CoroAlignInst *CA : Shape.CoroAligns) { + CA->replaceAllUsesWith( + ConstantInt::get(CA->getType(), Shape.FrameAlign.value())); + CA->eraseFromParent(); + } + if (Shape.CoroSizes.empty()) return; @@ -1197,10 +1206,34 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { DenseMap<Value *, Value *> ResolvedValues; BasicBlock *UnconditionalSucc = nullptr; + assert(InitialInst->getModule()); + const DataLayout &DL = InitialInst->getModule()->getDataLayout(); + + auto GetFirstValidInstruction = [](Instruction *I) { + while (I) { + // BitCastInst wouldn't generate actual code so that we could skip it. + if (isa<BitCastInst>(I) || I->isDebugOrPseudoInst() || + I->isLifetimeStartOrEnd()) + I = I->getNextNode(); + else if (isInstructionTriviallyDead(I)) + // Duing we are in the middle of the transformation, we need to erase + // the dead instruction manually. + I = &*I->eraseFromParent(); + else + break; + } + return I; + }; + + auto TryResolveConstant = [&ResolvedValues](Value *V) { + auto It = ResolvedValues.find(V); + if (It != ResolvedValues.end()) + V = It->second; + return dyn_cast<ConstantInt>(V); + }; Instruction *I = InitialInst; - while (I->isTerminator() || - (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) { + while (I->isTerminator() || isa<CmpInst>(I)) { if (isa<ReturnInst>(I)) { if (I != InitialInst) { // If InitialInst is an unconditional branch, @@ -1213,48 +1246,68 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { } if (auto *BR = dyn_cast<BranchInst>(I)) { if (BR->isUnconditional()) { - BasicBlock *BB = BR->getSuccessor(0); + BasicBlock *Succ = BR->getSuccessor(0); if (I == InitialInst) - UnconditionalSucc = BB; - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); + UnconditionalSucc = Succ; + scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); + I = GetFirstValidInstruction(Succ->getFirstNonPHIOrDbgOrLifetime()); continue; } - } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) { - auto *BR = dyn_cast<BranchInst>(I->getNextNode()); - if (BR && BR->isConditional() && CondCmp == BR->getCondition()) { - // If the case number of suspended switch instruction is reduced to - // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. - // And the comparsion looks like : %cond = icmp eq i8 %V, constant. 
- ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1)); - if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) { - Value *V = CondCmp->getOperand(0); - auto it = ResolvedValues.find(V); - if (it != ResolvedValues.end()) - V = it->second; - - if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) { - BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue()) - ? BR->getSuccessor(0) - : BR->getSuccessor(1); - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - } - } - } else if (auto *SI = dyn_cast<SwitchInst>(I)) { - Value *V = SI->getCondition(); - auto it = ResolvedValues.find(V); - if (it != ResolvedValues.end()) - V = it->second; - if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { - BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor(); - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); + + BasicBlock *BB = BR->getParent(); + // Handle the case the condition of the conditional branch is constant. + // e.g., + // + // br i1 false, label %cleanup, label %CoroEnd + // + // It is possible during the transformation. We could continue the + // simplifying in this case. + if (ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true)) { + // Handle this branch in next iteration. + I = BB->getTerminator(); continue; } + } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) { + // If the case number of suspended switch instruction is reduced to + // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. + auto *BR = dyn_cast<BranchInst>( + GetFirstValidInstruction(CondCmp->getNextNode())); + if (!BR || !BR->isConditional() || CondCmp != BR->getCondition()) + return false; + + // And the comparsion looks like : %cond = icmp eq i8 %V, constant. + // So we try to resolve constant for the first operand only since the + // second operand should be literal constant by design. + ConstantInt *Cond0 = TryResolveConstant(CondCmp->getOperand(0)); + auto *Cond1 = dyn_cast<ConstantInt>(CondCmp->getOperand(1)); + if (!Cond0 || !Cond1) + return false; + + // Both operands of the CmpInst are Constant. So that we could evaluate + // it immediately to get the destination. + auto *ConstResult = + dyn_cast_or_null<ConstantInt>(ConstantFoldCompareInstOperands( + CondCmp->getPredicate(), Cond0, Cond1, DL)); + if (!ConstResult) + return false; + + CondCmp->replaceAllUsesWith(ConstResult); + CondCmp->eraseFromParent(); + + // Handle this branch in next iteration. + I = BR; + continue; + } else if (auto *SI = dyn_cast<SwitchInst>(I)) { + ConstantInt *Cond = TryResolveConstant(SI->getCondition()); + if (!Cond) + return false; + + BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor(); + scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); + I = GetFirstValidInstruction(BB->getFirstNonPHIOrDbgOrLifetime()); + continue; } + return false; } return false; @@ -1826,20 +1879,20 @@ namespace { static coro::Shape splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones, - bool ReuseFrameSlot) { + bool OptimizeFrame) { PrettyStackTraceFunction prettyStackTrace(F); // The suspend-crossing algorithm in buildCoroutineFrame get tripped // up by uses in unreachable blocks, so remove them as a first pass. 
removeUnreachableBlocks(F); - coro::Shape Shape(F, ReuseFrameSlot); + coro::Shape Shape(F, OptimizeFrame); if (!Shape.CoroBegin) return Shape; simplifySuspendPoints(Shape); buildCoroutineFrame(F, Shape); - replaceFrameSize(Shape); + replaceFrameSizeAndAlignment(Shape); // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. @@ -2165,7 +2218,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, F.removeFnAttr(CORO_PRESPLIT_ATTR); SmallVector<Function *, 4> Clones; - const coro::Shape Shape = splitCoroutine(F, Clones, ReuseFrameSlot); + const coro::Shape Shape = splitCoroutine(F, Clones, OptimizeFrame); updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM); if (!Shape.CoroSuspends.empty()) { @@ -2198,13 +2251,13 @@ namespace { struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplitLegacy(bool ReuseFrameSlot = false) - : CallGraphSCCPass(ID), ReuseFrameSlot(ReuseFrameSlot) { + CoroSplitLegacy(bool OptimizeFrame = false) + : CallGraphSCCPass(ID), OptimizeFrame(OptimizeFrame) { initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); } bool Run = false; - bool ReuseFrameSlot; + bool OptimizeFrame; // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. @@ -2263,7 +2316,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass { F->removeFnAttr(CORO_PRESPLIT_ATTR); SmallVector<Function *, 4> Clones; - const coro::Shape Shape = splitCoroutine(*F, Clones, ReuseFrameSlot); + const coro::Shape Shape = splitCoroutine(*F, Clones, OptimizeFrame); updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC); if (Shape.ABI == coro::ABI::Async) { // Restart SCC passes. @@ -2300,6 +2353,6 @@ INITIALIZE_PASS_END( "Split coroutine into a set of functions driving its state machine", false, false) -Pass *llvm::createCoroSplitLegacyPass(bool ReuseFrameSlot) { - return new CoroSplitLegacy(ReuseFrameSlot); +Pass *llvm::createCoroSplitLegacyPass(bool OptimizeFrame) { + return new CoroSplitLegacy(OptimizeFrame); } diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index fba8b03e44ba..965a146c143f 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -123,6 +123,7 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, static bool isCoroutineIntrinsicName(StringRef Name) { // NOTE: Must be sorted! 
static const char *const CoroIntrinsics[] = { + "llvm.coro.align", "llvm.coro.alloc", "llvm.coro.async.context.alloc", "llvm.coro.async.context.dealloc", @@ -268,6 +269,9 @@ void coro::Shape::buildFrom(Function &F) { case Intrinsic::coro_size: CoroSizes.push_back(cast<CoroSizeInst>(II)); break; + case Intrinsic::coro_align: + CoroAligns.push_back(cast<CoroAlignInst>(II)); + break; case Intrinsic::coro_frame: CoroFrames.push_back(cast<CoroFrameInst>(II)); break; @@ -672,8 +676,11 @@ static void checkAsyncFuncPointer(const Instruction *I, Value *V) { if (!AsyncFuncPtrAddr) fail(I, "llvm.coro.id.async async function pointer not a global", V); - auto *StructTy = - cast<StructType>(AsyncFuncPtrAddr->getType()->getPointerElementType()); + if (AsyncFuncPtrAddr->getType()->isOpaquePointerTy()) + return; + + auto *StructTy = cast<StructType>( + AsyncFuncPtrAddr->getType()->getNonOpaquePointerElementType()); if (StructTy->isOpaque() || !StructTy->isPacked() || StructTy->getNumElements() != 2 || !StructTy->getElementType(0)->isIntegerTy(32) || @@ -697,14 +704,16 @@ void CoroIdAsyncInst::checkWellFormed() const { static void checkAsyncContextProjectFunction(const Instruction *I, Function *F) { auto *FunTy = cast<FunctionType>(F->getValueType()); - if (!FunTy->getReturnType()->isPointerTy() || - !FunTy->getReturnType()->getPointerElementType()->isIntegerTy(8)) + Type *Int8Ty = Type::getInt8Ty(F->getContext()); + auto *RetPtrTy = dyn_cast<PointerType>(FunTy->getReturnType()); + if (!RetPtrTy || !RetPtrTy->isOpaqueOrPointeeTypeMatches(Int8Ty)) fail(I, "llvm.coro.suspend.async resume function projection function must " "return an i8* type", F); if (FunTy->getNumParams() != 1 || !FunTy->getParamType(0)->isPointerTy() || - !FunTy->getParamType(0)->getPointerElementType()->isIntegerTy(8)) + !cast<PointerType>(FunTy->getParamType(0)) + ->isOpaqueOrPointeeTypeMatches(Int8Ty)) fail(I, "llvm.coro.suspend.async resume function projection function must " "take one i8* type as parameter", @@ -719,8 +728,7 @@ void CoroAsyncEndInst::checkWellFormed() const { auto *MustTailCallFunc = getMustTailCallFunction(); if (!MustTailCallFunc) return; - auto *FnTy = - cast<FunctionType>(MustTailCallFunc->getType()->getPointerElementType()); + auto *FnTy = MustTailCallFunc->getFunctionType(); if (FnTy->getNumParams() != (arg_size() - 3)) fail(this, "llvm.coro.end.async must tail call function argument type must " diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 01e724e22dcf..a6d9ce1033f3 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -54,13 +54,13 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, if (F.isPresplitCoroutine()) continue; - if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) && - isInlineViable(F).isSuccess()) { + if (!F.isDeclaration() && isInlineViable(F).isSuccess()) { Calls.clear(); for (User *U : F.users()) if (auto *CB = dyn_cast<CallBase>(U)) - if (CB->getCalledFunction() == &F) + if (CB->getCalledFunction() == &F && + CB->hasFnAttr(Attribute::AlwaysInline)) Calls.insert(CB); for (CallBase *CB : Calls) { @@ -92,10 +92,12 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, Changed = true; } - // Remember to try and delete this function afterward. This both avoids - // re-walking the rest of the module and avoids dealing with any iterator - // invalidation issues while deleting functions. 
- InlinedFunctions.push_back(&F); + if (F.hasFnAttribute(Attribute::AlwaysInline)) { + // Remember to try and delete this function afterward. This both avoids + // re-walking the rest of the module and avoids dealing with any + // iterator invalidation issues while deleting functions. + InlinedFunctions.push_back(&F); + } } } @@ -117,7 +119,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, if (!InlinedFunctions.empty()) { // Now we just have the comdat functions. Filter out the ones whose comdats // are not actually dead. - filterDeadComdatFunctions(M, InlinedFunctions); + filterDeadComdatFunctions(InlinedFunctions); // The remaining functions are actually dead. for (Function *F : InlinedFunctions) { M.getFunctionList().erase(F); diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 3a42a2cac928..ce3c5153bde2 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -196,8 +196,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, for (const auto &ArgIndex : ArgIndices) { // not allowed to dereference ->begin() if size() is 0 Params.push_back(GetElementPtrInst::getIndexedType( - cast<PointerType>(I->getType())->getElementType(), - ArgIndex.second)); + I->getType()->getPointerElementType(), ArgIndex.second)); ArgAttrVec.push_back(AttributeSet()); assert(Params.back()); } @@ -298,7 +297,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, Ops.push_back(ConstantInt::get(IdxTy, II)); // Keep track of the type we're currently indexing. if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) - ElTy = ElPTy->getElementType(); + ElTy = ElPTy->getPointerElementType(); else ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II); } @@ -928,7 +927,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, SmallPtrSet<Argument *, 8> ArgsToPromote; SmallPtrSet<Argument *, 8> ByValArgsToTransform; for (Argument *PtrArg : PointerArgs) { - Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); + Type *AgTy = PtrArg->getType()->getPointerElementType(); // Replace sret attribute with noalias. This reduces register pressure by // avoiding a register copy. 
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 7e729e57153c..12b8a0ef9d00 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ValueTracking.h" @@ -202,9 +203,12 @@ bool AA::isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA, return NoRecurseAA.isAssumedNoRecurse(); } -Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty) { +Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty, + const TargetLibraryInfo *TLI) { if (isa<AllocaInst>(Obj)) return UndefValue::get(&Ty); + if (isAllocationFn(&Obj, TLI)) + return getInitialValueOfAllocation(&cast<CallBase>(Obj), TLI, &Ty); auto *GV = dyn_cast<GlobalVariable>(&Obj); if (!GV || !GV->hasLocalLinkage()) return nullptr; @@ -316,7 +320,8 @@ bool AA::getPotentialCopiesOfStoredValue( dbgs() << "Underlying object is a valid nullptr, giving up.\n";); return false; } - if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) { + if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj) && + !isNoAliasCall(Obj)) { LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj << "\n";); return false; @@ -741,6 +746,7 @@ void IRPosition::verify() { assert((CBContext == nullptr) && "'call site argument' position must not have CallBaseContext!"); Use *U = getAsUsePtr(); + (void)U; // Silence unused variable warning. assert(U && "Expected use for a 'call site argument' position!"); assert(isa<CallBase>(U->getUser()) && "Expected call base user for a 'call site argument' position!"); @@ -999,10 +1005,11 @@ bool Attributor::isAssumedDead(const BasicBlock &BB, return false; } -bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, - const AbstractAttribute &QueryingAA, - const Value &V, bool CheckBBLivenessOnly, - DepClassTy LivenessDepClass) { +bool Attributor::checkForAllUses( + function_ref<bool(const Use &, bool &)> Pred, + const AbstractAttribute &QueryingAA, const Value &V, + bool CheckBBLivenessOnly, DepClassTy LivenessDepClass, + function_ref<bool(const Use &OldU, const Use &NewU)> EquivalentUseCB) { // Check the trivial case first as it catches void values. 
if (V.use_empty()) @@ -1053,8 +1060,15 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, << PotentialCopies.size() << " potential copies instead!\n"); for (Value *PotentialCopy : PotentialCopies) - for (const Use &U : PotentialCopy->uses()) - Worklist.push_back(&U); + for (const Use &CopyUse : PotentialCopy->uses()) { + if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) { + LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was " + "rejected by the equivalence call back: " + << *CopyUse << "!\n"); + return false; + } + Worklist.push_back(&CopyUse); + } continue; } } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index b977821bcaa6..76420783b2d1 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -417,12 +417,10 @@ const Value *stripAndAccumulateMinimalOffsets( AttributorAnalysis); } -static const Value *getMinimalBaseOfAccessPointerOperand( - Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I, - int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { - const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); - if (!Ptr) - return nullptr; +static const Value * +getMinimalBaseOfPointer(Attributor &A, const AbstractAttribute &QueryingAA, + const Value *Ptr, int64_t &BytesOffset, + const DataLayout &DL, bool AllowNonInbounds = false) { APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); const Value *Base = stripAndAccumulateMinimalOffsets( A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds); @@ -431,18 +429,6 @@ static const Value *getMinimalBaseOfAccessPointerOperand( return Base; } -static const Value * -getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, - const DataLayout &DL, - bool AllowNonInbounds = false) { - const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); - if (!Ptr) - return nullptr; - - return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, - AllowNonInbounds); -} - /// Clamp the information known for all returned values of a function /// (identified by \p QueryingAA) into \p S. template <typename AAType, typename StateType = typename AAType::StateType> @@ -810,14 +796,17 @@ struct AA::PointerInfo::OffsetAndSize : public std::pair<int64_t, int64_t> { int64_t getSize() const { return second; } static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); } + /// Return true if offset or size are unknown. + bool offsetOrSizeAreUnknown() const { + return getOffset() == OffsetAndSize::Unknown || + getSize() == OffsetAndSize::Unknown; + } + /// Return true if this offset and size pair might describe an address that /// overlaps with \p OAS. bool mayOverlap(const OffsetAndSize &OAS) const { // Any unknown value and we are giving up -> overlap. - if (OAS.getOffset() == OffsetAndSize::Unknown || - OAS.getSize() == OffsetAndSize::Unknown || - getOffset() == OffsetAndSize::Unknown || - getSize() == OffsetAndSize::Unknown) + if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown()) return true; // Check if one offset point is in the other interval [offset, offset+size]. 
@@ -1024,8 +1013,9 @@ protected: OffsetAndSize ItOAS = It.getFirst(); if (!OAS.mayOverlap(ItOAS)) continue; + bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown(); for (auto &Access : It.getSecond()) - if (!CB(Access, OAS == ItOAS)) + if (!CB(Access, IsExact)) return false; } return true; @@ -1161,27 +1151,34 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { return true; }; + const auto *TLI = getAnchorScope() + ? A.getInfoCache().getTargetLibraryInfoForFunction( + *getAnchorScope()) + : nullptr; auto UsePred = [&](const Use &U, bool &Follow) -> bool { Value *CurPtr = U.get(); User *Usr = U.getUser(); LLVM_DEBUG(dbgs() << "[AAPointerInfo] Analyze " << *CurPtr << " in " << *Usr << "\n"); - - OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; + assert(OffsetInfoMap.count(CurPtr) && + "The current pointer offset should have been seeded!"); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Usr)) { if (CE->isCast()) - return HandlePassthroughUser(Usr, PtrOI, Follow); + return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow); if (CE->isCompare()) return true; - if (!CE->isGEPWithNoNotionalOverIndexing()) { + if (!isa<GEPOperator>(CE)) { LLVM_DEBUG(dbgs() << "[AAPointerInfo] Unhandled constant user " << *CE << "\n"); return false; } } if (auto *GEP = dyn_cast<GEPOperator>(Usr)) { + // Note the order here, the Usr access might change the map, CurPtr is + // already in it though. OffsetInfo &UsrOI = OffsetInfoMap[Usr]; + OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; UsrOI = PtrOI; // TODO: Use range information. @@ -1205,19 +1202,22 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } UsrOI.Offset = PtrOI.Offset + DL.getIndexedOffsetInType( - CurPtr->getType()->getPointerElementType(), Indices); + GEP->getSourceElementType(), Indices); Follow = true; return true; } if (isa<CastInst>(Usr) || isa<SelectInst>(Usr)) - return HandlePassthroughUser(Usr, PtrOI, Follow); + return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow); // For PHIs we need to take care of the recurrence explicitly as the value // might change while we iterate through a loop. For now, we give up if // the PHI is not invariant. if (isa<PHINode>(Usr)) { - // Check if the PHI is invariant (so far). + // Note the order here, the Usr access might change the map, CurPtr is + // already in it though. OffsetInfo &UsrOI = OffsetInfoMap[Usr]; + OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; + // Check if the PHI is invariant (so far). 
if (UsrOI == PtrOI) return true; @@ -1257,8 +1257,8 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { if (auto *LoadI = dyn_cast<LoadInst>(Usr)) return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr, - AccessKind::AK_READ, PtrOI.Offset, Changed, - LoadI->getType()); + AccessKind::AK_READ, OffsetInfoMap[CurPtr].Offset, + Changed, LoadI->getType()); if (auto *StoreI = dyn_cast<StoreInst>(Usr)) { if (StoreI->getValueOperand() == CurPtr) { LLVM_DEBUG(dbgs() << "[AAPointerInfo] Escaping use in store " @@ -1269,18 +1269,21 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { Optional<Value *> Content = A.getAssumedSimplified( *StoreI->getValueOperand(), *this, UsedAssumedInformation); return handleAccess(A, *StoreI, *CurPtr, Content, AccessKind::AK_WRITE, - PtrOI.Offset, Changed, + OffsetInfoMap[CurPtr].Offset, Changed, StoreI->getValueOperand()->getType()); } if (auto *CB = dyn_cast<CallBase>(Usr)) { if (CB->isLifetimeStartOrEnd()) return true; + if (TLI && isFreeCall(CB, TLI)) + return true; if (CB->isArgOperand(&U)) { unsigned ArgNo = CB->getArgOperandNo(&U); const auto &CSArgPI = A.getAAFor<AAPointerInfo>( *this, IRPosition::callsite_argument(*CB, ArgNo), DepClassTy::REQUIRED); - Changed = translateAndAddCalleeState(A, CSArgPI, PtrOI.Offset, *CB) | + Changed = translateAndAddCalleeState( + A, CSArgPI, OffsetInfoMap[CurPtr].Offset, *CB) | Changed; return true; } @@ -1293,8 +1296,15 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n"); return false; }; + auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { + if (OffsetInfoMap.count(NewU)) + return OffsetInfoMap[NewU] == OffsetInfoMap[OldU]; + OffsetInfoMap[NewU] = OffsetInfoMap[OldU]; + return true; + }; if (!A.checkForAllUses(UsePred, *this, AssociatedValue, - /* CheckBBLivenessOnly */ true)) + /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL, + EquivalentUseCB)) return indicatePessimisticFixpoint(); LLVM_DEBUG({ @@ -2127,31 +2137,26 @@ static int64_t getKnownNonNullAndDerefBytesForUse( return DerefAA.getKnownDereferenceableBytes(); } + Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); + if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile()) + return 0; + int64_t Offset; const Value *Base = - getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL); - if (Base) { - if (Base == &AssociatedValue && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - int64_t DerefBytes = - (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset; - - IsNonNull |= !NullPointerIsDefined; - return std::max(int64_t(0), DerefBytes); - } + getMinimalBaseOfPointer(A, QueryingAA, Loc->Ptr, Offset, DL); + if (Base && Base == &AssociatedValue) { + int64_t DerefBytes = Loc->Size.getValue() + Offset; + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); } /// Corner case when an offset is 0. 
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL, - /*AllowNonInbounds*/ true); - if (Base) { - if (Offset == 0 && Base == &AssociatedValue && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - int64_t DerefBytes = - (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); - IsNonNull |= !NullPointerIsDefined; - return std::max(int64_t(0), DerefBytes); - } + Base = GetPointerBaseWithConstantOffset(Loc->Ptr, Offset, DL, + /*AllowNonInbounds*/ true); + if (Base && Base == &AssociatedValue && Offset == 0) { + int64_t DerefBytes = Loc->Size.getValue(); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); } return 0; @@ -2325,6 +2330,8 @@ struct AANoRecurseFunction final : AANoRecurseImpl { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { AANoRecurseImpl::initialize(A); + // TODO: We should build a call graph ourselves to enable this in the module + // pass as well. if (const Function *F = getAnchorScope()) if (A.getInfoCache().getSccSize(*F) != 1) indicatePessimisticFixpoint(); @@ -4057,17 +4064,15 @@ struct AADereferenceableImpl : AADereferenceable { if (!UseV->getType()->isPointerTy()) return; - Type *PtrTy = UseV->getType(); - const DataLayout &DL = A.getDataLayout(); + Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); + if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile()) + return; + int64_t Offset; - if (const Value *Base = getBasePointerOfAccessPointerOperand( - I, Offset, DL, /*AllowNonInbounds*/ true)) { - if (Base == &getAssociatedValue() && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType()); - State.addAccessedBytes(Offset, Size); - } - } + const Value *Base = GetPointerBaseWithConstantOffset( + Loc->Ptr, Offset, A.getDataLayout(), /*AllowNonInbounds*/ true); + if (Base && Base == &getAssociatedValue()) + State.addAccessedBytes(Offset, Loc->Size.getValue()); } /// See followUsesInMBEC @@ -5236,6 +5241,8 @@ struct AAValueSimplifyImpl : AAValueSimplify { if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L)) return false; + const auto *TLI = + A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction()); for (Value *Obj : Objects) { LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); if (isa<UndefValue>(Obj)) @@ -5250,9 +5257,7 @@ struct AAValueSimplifyImpl : AAValueSimplify { continue; return false; } - if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) - return false; - Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType()); + Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI); if (!InitialVal || !Union(*InitialVal)) return false; @@ -5745,13 +5750,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /// The call that allocates the memory. CallBase *const CB; - /// The kind of allocation. - const enum class AllocationKind { - MALLOC, - CALLOC, - ALIGNED_ALLOC, - } Kind; - /// The library function id for the allocation. 
LibFunc LibraryFunctionId = NotLibFunc; @@ -5808,20 +5806,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack { DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB}; return true; } - bool IsMalloc = isMallocLikeFn(CB, TLI); - bool IsAlignedAllocLike = !IsMalloc && isAlignedAllocLikeFn(CB, TLI); - bool IsCalloc = - !IsMalloc && !IsAlignedAllocLike && isCallocLikeFn(CB, TLI); - if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) - return true; - auto Kind = - IsMalloc ? AllocationInfo::AllocationKind::MALLOC - : (IsCalloc ? AllocationInfo::AllocationKind::CALLOC - : AllocationInfo::AllocationKind::ALIGNED_ALLOC); - - AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB, Kind}; - AllocationInfos[CB] = AI; - TLI->getLibFunc(*CB, AI->LibraryFunctionId); + // To do heap to stack, we need to know that the allocation itself is + // removable once uses are rewritten, and that we can initialize the + // alloca to the same pattern as the original allocation result. + if (isAllocationFn(CB, TLI) && isAllocRemovable(CB, TLI)) { + auto *I8Ty = Type::getInt8Ty(CB->getParent()->getContext()); + if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) { + AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB}; + AllocationInfos[CB] = AI; + TLI->getLibFunc(*CB, AI->LibraryFunctionId); + } + } return true; }; @@ -5917,21 +5912,22 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Optional<APInt> SizeAPI = getSize(A, *this, AI); if (SizeAPI.hasValue()) { Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI); - } else if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) { - auto *Num = AI.CB->getOperand(0); - auto *SizeT = AI.CB->getOperand(1); - IRBuilder<> B(AI.CB); - Size = B.CreateMul(Num, SizeT, "h2s.calloc.size"); - } else if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) { - Size = AI.CB->getOperand(1); } else { - Size = AI.CB->getOperand(0); + LLVMContext &Ctx = AI.CB->getContext(); + auto &DL = A.getInfoCache().getDL(); + ObjectSizeOpts Opts; + ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts); + SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB); + assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() && + cast<ConstantInt>(SizeOffsetPair.second)->isZero()); + Size = SizeOffsetPair.first; } Align Alignment(1); - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) { - Optional<APInt> AlignmentAPI = - getAPInt(A, *this, *AI.CB->getArgOperand(0)); + if (MaybeAlign RetAlign = AI.CB->getRetAlign()) + Alignment = max(Alignment, RetAlign); + if (Value *Align = getAllocAlignment(AI.CB, TLI)) { + Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align); assert(AlignmentAPI.hasValue() && "Expected an alignment during manifest!"); Alignment = @@ -5947,6 +5943,11 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc", Alloca->getNextNode()); + auto *I8Ty = Type::getInt8Ty(F->getContext()); + auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty); + assert(InitVal && + "Must be able to materialize initial memory state of allocation"); + A.changeValueAfterManifest(*AI.CB, *Alloca); if (auto *II = dyn_cast<InvokeInst>(AI.CB)) { @@ -5957,18 +5958,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack { A.deleteAfterManifest(*AI.CB); } - // Zero out the allocated memory if it was a calloc. 
- if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) { - auto *BI = new BitCastInst(Alloca, AI.CB->getType(), "calloc_bc", - Alloca->getNextNode()); - Value *Ops[] = { - BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size, - ConstantInt::get(Type::getInt1Ty(F->getContext()), false)}; - - Type *Tys[] = {BI->getType(), AI.CB->getOperand(0)->getType()}; - Module *M = F->getParent(); - Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); - CallInst::Create(Fn, Ops, "", BI->getNextNode()); + // Initialize the alloca with the same value as used by the allocation + // function. We can skip undef as the initial value of an alloc is + // undef, and the memset would simply end up being DSEd. + if (!isa<UndefValue>(InitVal)) { + IRBuilder<> Builder(Alloca->getNextNode()); + // TODO: Use alignment above if align!=1 + Builder.CreateMemSet(Alloca, InitVal, Size, None); } HasChanged = ChangeStatus::CHANGED; } @@ -5990,25 +5986,18 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Optional<APInt> getSize(Attributor &A, const AbstractAttribute &AA, AllocationInfo &AI) { + auto Mapper = [&](const Value *V) -> const Value * { + bool UsedAssumedInformation = false; + if (Optional<Constant *> SimpleV = + A.getAssumedConstant(*V, AA, UsedAssumedInformation)) + if (*SimpleV) + return *SimpleV; + return V; + }; - if (AI.Kind == AllocationInfo::AllocationKind::MALLOC) - return getAPInt(A, AA, *AI.CB->getArgOperand(0)); - - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) - // Only if the alignment is also constant we return a size. - return getAPInt(A, AA, *AI.CB->getArgOperand(0)).hasValue() - ? getAPInt(A, AA, *AI.CB->getArgOperand(1)) - : llvm::None; - - assert(AI.Kind == AllocationInfo::AllocationKind::CALLOC && - "Expected only callocs are left"); - Optional<APInt> Num = getAPInt(A, AA, *AI.CB->getArgOperand(0)); - Optional<APInt> Size = getAPInt(A, AA, *AI.CB->getArgOperand(1)); - if (!Num.hasValue() || !Size.hasValue()) - return llvm::None; - bool Overflow = false; - Size = Size.getValue().umul_ov(Num.getValue(), Overflow); - return Overflow ? llvm::None : Size; + const Function *F = getAnchorScope(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + return getAllocSize(AI.CB, TLI, Mapper); } /// Collection of all malloc-like calls in a function with associated @@ -6025,6 +6014,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { ChangeStatus Changed = ChangeStatus::UNCHANGED; const Function *F = getAnchorScope(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); const auto &LivenessAA = A.getAAFor<AAIsDead>(*this, IRPosition::function(*F), DepClassTy::NONE); @@ -6239,22 +6229,24 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { if (AI.Status == AllocationInfo::INVALID) continue; - if (MaxHeapToStackSize == -1) { - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) - if (!getAPInt(A, *this, *AI.CB->getArgOperand(0)).hasValue()) { - LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB - << "\n"); - AI.Status = AllocationInfo::INVALID; - Changed = ChangeStatus::CHANGED; - continue; - } - } else { + if (Value *Align = getAllocAlignment(AI.CB, TLI)) { + if (!getAPInt(A, *this, *Align)) { + // Can't generate an alloca which respects the required alignment + // on the allocation. 
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB + << "\n"); + AI.Status = AllocationInfo::INVALID; + Changed = ChangeStatus::CHANGED; + continue; + } + } + + if (MaxHeapToStackSize != -1) { Optional<APInt> Size = getSize(A, *this, AI); if (!Size.hasValue() || Size.getValue().ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ if (!Size.hasValue()) - dbgs() << "[H2S] Unknown allocation size (or alignment): " << *AI.CB - << "\n"; + dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n"; else dbgs() << "[H2S] Allocation size too large: " << *AI.CB << " vs. " << MaxHeapToStackSize << "\n"; @@ -6637,9 +6629,10 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { IRBuilder<NoFolder> IRB(IP); const DataLayout &DL = IP->getModule()->getDataLayout(); - if (Base->getType()->getPointerElementType() != PrivType) - Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(), - "", ACS.getInstruction()); + Type *PrivPtrType = PrivType->getPointerTo(); + if (Base->getType() != PrivPtrType) + Base = BitCastInst::CreateBitOrPointerCast(Base, PrivPtrType, "", + ACS.getInstruction()); // Traverse the type, build GEPs and loads. if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) { @@ -6781,7 +6774,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl { if (auto *AI = dyn_cast<AllocaInst>(Obj)) if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) if (CI->isOne()) - return Obj->getType()->getPointerElementType(); + return AI->getAllocatedType(); if (auto *Arg = dyn_cast<Argument>(Obj)) { auto &PrivArgAA = A.getAAFor<AAPrivatizablePtr>( *this, IRPosition::argument(*Arg), DepClassTy::REQUIRED); @@ -7675,7 +7668,6 @@ void AAMemoryLocationImpl::categorizePtrValue( for (Value *Obj : Objects) { // TODO: recognize the TBAA used for constant accesses. MemoryLocationsKind MLK = NO_LOCATIONS; - assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped."); if (isa<UndefValue>(Obj)) continue; if (isa<Argument>(Obj)) { @@ -8485,13 +8477,30 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { /* UseValueSimplify */ false)) return indicatePessimisticFixpoint(); - return clampStateAndIndicateChange(getState(), T); + // Ensure that long def-use chains can't cause circular reasoning either by + // introducing a cutoff below. + if (clampStateAndIndicateChange(getState(), T) == ChangeStatus::UNCHANGED) + return ChangeStatus::UNCHANGED; + if (++NumChanges > MaxNumChanges) { + LLVM_DEBUG(dbgs() << "[AAValueConstantRange] performed " << NumChanges + << " but only " << MaxNumChanges + << " are allowed to avoid cyclic reasoning."); + return indicatePessimisticFixpoint(); + } + return ChangeStatus::CHANGED; } /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(value_range) } + + /// Tracker to bail after too many widening steps of the constant range. + int NumChanges = 0; + + /// Upper bound for the number of allowed changes (=widening steps) for the + /// constant range before we give up. 
+ static constexpr int MaxNumChanges = 5; }; struct AAValueConstantRangeFunction : AAValueConstantRangeImpl { diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index fb9ab7954e36..2a6e38b0437f 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -287,7 +287,8 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) { SmallVector<unsigned, 8> UnusedArgs; bool Changed = false; - AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes(); + AttributeMask UBImplyingAttributes = + AttributeFuncs::getUBImplyingAttributes(); for (Argument &Arg : Fn.args()) { if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasPassPointeeByValueCopyAttr()) { @@ -838,7 +839,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttrBuilder RAttrs(PAL.getRetAttrs()); + AttrBuilder RAttrs(F->getContext(), PAL.getRetAttrs()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes @@ -889,7 +890,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. - AttrBuilder RAttrs(CallPAL.getRetAttrs()); + AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs()); RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -912,7 +913,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // this is not an expected case anyway ArgAttrVec.push_back(AttributeSet::get( F->getContext(), - AttrBuilder(Attrs).removeAttribute(Attribute::Returned))); + AttrBuilder(F->getContext(), Attrs).removeAttribute(Attribute::Returned))); } else { // Otherwise, use the original attributes. ArgAttrVec.push_back(Attrs); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 321d4a19a585..213a998d5bba 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -133,7 +133,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, if (AliasAnalysis::onlyReadsMemory(MRB)) return MAK_ReadOnly; - if (AliasAnalysis::doesNotReadMemory(MRB)) + if (AliasAnalysis::onlyWritesMemory(MRB)) return MAK_WriteOnly; // Conservatively assume it reads and writes to memory. @@ -295,13 +295,13 @@ static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, // No change. continue; - if (F->doesNotReadMemory() && WritesMemory) + if (F->onlyWritesMemory() && WritesMemory) continue; Changed.insert(F); // Clear out any existing attributes. - AttrBuilder AttrsToRemove; + AttributeMask AttrsToRemove; AttrsToRemove.addAttribute(Attribute::ReadOnly); AttrsToRemove.addAttribute(Attribute::ReadNone); AttrsToRemove.addAttribute(Attribute::WriteOnly); @@ -720,10 +720,16 @@ determinePointerAccessAttrs(Argument *A, // The accessors used on call site here do the right thing for calls and // invokes with operand bundles. 
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) - return Attribute::None; - if (!CB.doesNotAccessMemory(UseIndex)) + if (CB.doesNotAccessMemory(UseIndex)) { + /* nop */ + } else if (CB.onlyReadsMemory() || CB.onlyReadsMemory(UseIndex)) { IsRead = true; + } else if (CB.hasFnAttr(Attribute::WriteOnly) || + CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) { + IsWrite = true; + } else { + return Attribute::None; + } break; } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 2425646455bd..6c3cc3914337 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -6,15 +6,24 @@ // //===----------------------------------------------------------------------===// // -// This specialises functions with constant parameters (e.g. functions, -// globals). Constant parameters like function pointers and constant globals -// are propagated to the callee by specializing the function. +// This specialises functions with constant parameters. Constant parameters +// like function pointers and constant globals are propagated to the callee by +// specializing the function. The main benefit of this pass at the moment is +// that indirect calls are transformed into direct calls, which provides inline +// opportunities that the inliner would not have been able to achieve. That's +// why function specialisation is run before the inliner in the optimisation +// pipeline; that is by design. Otherwise, we would only benefit from constant +// passing, which is a valid use-case too, but hasn't been explored much in +// terms of performance uplifts, cost-model and compile-time impact. // // Current limitations: -// - It does not yet handle integer ranges. +// - It does not yet handle integer ranges. We do support "literal constants", +// but that's off by default under an option. // - Only 1 argument per function is specialised, -// - The cost-model could be further looked into, -// - We are not yet caching analysis results. +// - The cost-model could be further looked into (it mainly focuses on inlining +// benefits), +// - We are not yet caching analysis results, but profiling and checking where +// extra compile time is spent didn't suggest this to be a problem. // // Ideas: // - With a function specialization attribute for arguments, we could have @@ -30,8 +39,12 @@ // https://reviews.llvm.org/D106426 for details. Perhaps there is a // compile-time friendlier way to control/limit the number of specialisations // for recursive functions. -// - Don't transform the function if there is no function specialization -// happens. +// - Don't transform the function if function specialization does not trigger; +// the SCCPSolver may make IR changes. 
+// +// References: +// - 2021 LLVM Dev Mtg “Introducing function specialisation, and can we enable +// it by default?”, https://www.youtube.com/watch?v=zJiCjeXgV5Q // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b1f3ff15c97b..d3cac3efce86 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -303,11 +303,11 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, else if (auto *GEP = dyn_cast<GEPOperator>(U)) append_range(WorkList, GEP->users()); else if (auto *LI = dyn_cast<LoadInst>(U)) { - // A load from zeroinitializer is always zeroinitializer, regardless of - // any applied offset. + // A load from a uniform value is always the same, regardless of any + // applied offset. Type *Ty = LI->getType(); - if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) { - LI->replaceAllUsesWith(Constant::getNullValue(Ty)); + if (Constant *Res = ConstantFoldLoadFromUniformValue(Init, Ty)) { + LI->replaceAllUsesWith(Res); EraseFromParent(LI); continue; } @@ -337,107 +337,68 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, return Changed; } -static bool isSafeSROAElementUse(Value *V); - -/// Return true if the specified GEP is a safe user of a derived -/// expression from a global that we want to SROA. -static bool isSafeSROAGEP(User *U) { - // Check to see if this ConstantExpr GEP is SRA'able. In particular, we - // don't like < 3 operand CE's, and we don't like non-constant integer - // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some - // value of C. - if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) || - !cast<Constant>(U->getOperand(1))->isNullValue()) - return false; - - gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U); - ++GEPI; // Skip over the pointer index. - - // For all other level we require that the indices are constant and inrange. - // In particular, consider: A[0][i]. We cannot know that the user isn't doing - // invalid things like allowing i to index an out-of-range subscript that - // accesses A[1]. This can also happen between different members of a struct - // in llvm IR. - for (; GEPI != E; ++GEPI) { - if (GEPI.isStruct()) +/// Look at all uses of the global and determine which (offset, type) pairs it +/// can be split into. +static bool collectSRATypes(DenseMap<uint64_t, Type *> &Types, GlobalValue *GV, + const DataLayout &DL) { + SmallVector<Use *, 16> Worklist; + SmallPtrSet<Use *, 16> Visited; + auto AppendUses = [&](Value *V) { + for (Use &U : V->uses()) + if (Visited.insert(&U).second) + Worklist.push_back(&U); + }; + AppendUses(GV); + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + User *V = U->getUser(); + if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V)) { + AppendUses(V); continue; + } - ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand()); - if (!IdxVal || (GEPI.isBoundedSequential() && - IdxVal->getZExtValue() >= GEPI.getSequentialNumElements())) - return false; - } - - return llvm::all_of(U->users(), isSafeSROAElementUse); -} - -/// Return true if the specified instruction is a safe user of a derived -/// expression from a global that we want to SROA. -static bool isSafeSROAElementUse(Value *V) { - // We might have a dead and dangling constant hanging off of here. 
- if (Constant *C = dyn_cast<Constant>(V)) - return isSafeToDestroyConstant(C); - - Instruction *I = dyn_cast<Instruction>(V); - if (!I) return false; + if (auto *GEP = dyn_cast<GEPOperator>(V)) { + if (!GEP->hasAllConstantIndices()) + return false; + AppendUses(V); + continue; + } - // Loads are ok. - if (isa<LoadInst>(I)) return true; + if (Value *Ptr = getLoadStorePointerOperand(V)) { + // This is storing the global address into somewhere, not storing into + // the global. + if (isa<StoreInst>(V) && U->getOperandNo() == 0) + return false; - // Stores *to* the pointer are ok. - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->getOperand(0) != V; + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + if (Ptr != GV || Offset.getActiveBits() >= 64) + return false; - // Otherwise, it must be a GEP. Check it and its users are safe to SRA. - return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I); -} + // TODO: We currently require that all accesses at a given offset must + // use the same type. This could be relaxed. + Type *Ty = getLoadStoreType(V); + auto It = Types.try_emplace(Offset.getZExtValue(), Ty).first; + if (Ty != It->second) + return false; + continue; + } -/// Look at all uses of the global and decide whether it is safe for us to -/// perform this transformation. -static bool GlobalUsersSafeToSRA(GlobalValue *GV) { - for (User *U : GV->users()) { - // The user of the global must be a GEP Inst or a ConstantExpr GEP. - if (!isa<GetElementPtrInst>(U) && - (!isa<ConstantExpr>(U) || - cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr)) - return false; + // Ignore dead constant users. + if (auto *C = dyn_cast<Constant>(V)) { + if (!isSafeToDestroyConstant(C)) + return false; + continue; + } - // Check the gep and it's users are safe to SRA - if (!isSafeSROAGEP(U)) - return false; + // Unknown user. + return false; } return true; } -static bool IsSRASequential(Type *T) { - return isa<ArrayType>(T) || isa<VectorType>(T); -} -static uint64_t GetSRASequentialNumElements(Type *T) { - if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements(); - return cast<FixedVectorType>(T)->getNumElements(); -} -static Type *GetSRASequentialElementType(Type *T) { - if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getElementType(); - return cast<VectorType>(T)->getElementType(); -} -static bool CanDoGlobalSRA(GlobalVariable *GV) { - Constant *Init = GV->getInitializer(); - - if (isa<StructType>(Init->getType())) { - // nothing to check - } else if (IsSRASequential(Init->getType())) { - if (GetSRASequentialNumElements(Init->getType()) > 16 && - GV->hasNUsesOrMore(16)) - return false; // It's not worth it. - } else - return false; - - return GlobalUsersSafeToSRA(GV); -} - /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, @@ -468,161 +429,140 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, /// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { - // Make sure this global only has simple uses that we can SRA. - if (!CanDoGlobalSRA(GV)) + assert(GV->hasLocalLinkage()); + + // Collect types to split into. 
+ DenseMap<uint64_t, Type *> Types; + if (!collectSRATypes(Types, GV, DL) || Types.empty()) return nullptr; - assert(GV->hasLocalLinkage()); - Constant *Init = GV->getInitializer(); - Type *Ty = Init->getType(); - uint64_t VarSize = DL.getTypeSizeInBits(Ty); + // Make sure we don't SRA back to the same type. + if (Types.size() == 1 && Types.begin()->second == GV->getValueType()) + return nullptr; - std::map<unsigned, GlobalVariable *> NewGlobals; + // Don't perform SRA if we would have to split into many globals. + if (Types.size() > 16) + return nullptr; - // Get the alignment of the global, either explicit or target-specific. - Align StartAlignment = - DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType()); - - // Loop over all users and create replacement variables for used aggregate - // elements. - for (User *GEP : GV->users()) { - assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() == - Instruction::GetElementPtr) || - isa<GetElementPtrInst>(GEP)) && - "NonGEP CE's are not SRAable!"); - - // Ignore the 1th operand, which has to be zero or else the program is quite - // broken (undefined). Get the 2nd operand, which is the structure or array - // index. - unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - if (NewGlobals.count(ElementIdx) == 1) - continue; // we`ve already created replacement variable - assert(NewGlobals.count(ElementIdx) == 0); - - Type *ElTy = nullptr; - if (StructType *STy = dyn_cast<StructType>(Ty)) - ElTy = STy->getElementType(ElementIdx); - else - ElTy = GetSRASequentialElementType(Ty); - assert(ElTy); + // Sort by offset. + SmallVector<std::pair<uint64_t, Type *>, 16> TypesVector; + append_range(TypesVector, Types); + sort(TypesVector, + [](const auto &A, const auto &B) { return A.first < B.first; }); - Constant *In = Init->getAggregateElement(ElementIdx); - assert(In && "Couldn't get element of initializer?"); + // Check that the types are non-overlapping. + uint64_t Offset = 0; + for (const auto &Pair : TypesVector) { + // Overlaps with previous type. + if (Pair.first < Offset) + return nullptr; - GlobalVariable *NGV = new GlobalVariable( - ElTy, false, GlobalVariable::InternalLinkage, In, - GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - NewGlobals.insert(std::make_pair(ElementIdx, NGV)); - - if (StructType *STy = dyn_cast<StructType>(Ty)) { - const StructLayout &Layout = *DL.getStructLayout(STy); - - // Calculate the known alignment of the field. If the original aggregate - // had 256 byte alignment for example, something might depend on that: - // propagate info to each field. - uint64_t FieldOffset = Layout.getElementOffset(ElementIdx); - Align NewAlign = commonAlignment(StartAlignment, FieldOffset); - if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx))) - NGV->setAlignment(NewAlign); - - // Copy over the debug info for the variable. - uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); - uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize); - } else { - uint64_t EltSize = DL.getTypeAllocSize(ElTy); - Align EltAlign = DL.getABITypeAlign(ElTy); - uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); - - // Calculate the known alignment of the field. 
If the original aggregate - // had 256 byte alignment for example, something might depend on that: - // propagate info to each field. - Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx); - if (NewAlign > EltAlign) - NGV->setAlignment(NewAlign); - transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, - FragmentSizeInBits, VarSize); - } + Offset = Pair.first + DL.getTypeAllocSize(Pair.second); } - if (NewGlobals.empty()) + // Some accesses go beyond the end of the global, don't bother. + if (Offset > DL.getTypeAllocSize(GV->getValueType())) return nullptr; - Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); - for (auto NewGlobalVar : NewGlobals) - Globals.push_back(NewGlobalVar.second); + // Collect initializers for new globals. + Constant *OrigInit = GV->getInitializer(); + DenseMap<uint64_t, Constant *> Initializers; + for (const auto &Pair : Types) { + Constant *NewInit = ConstantFoldLoadFromConst(OrigInit, Pair.second, + APInt(64, Pair.first), DL); + if (!NewInit) { + LLVM_DEBUG(dbgs() << "Global SRA: Failed to evaluate initializer of " + << *GV << " with type " << *Pair.second << " at offset " + << Pair.first << "\n"); + return nullptr; + } + Initializers.insert({Pair.first, NewInit}); + } LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); - Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); + // Get the alignment of the global, either explicit or target-specific. + Align StartAlignment = + DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); + uint64_t VarSize = DL.getTypeSizeInBits(GV->getValueType()); + + // Create replacement globals. + DenseMap<uint64_t, GlobalVariable *> NewGlobals; + unsigned NameSuffix = 0; + for (auto &Pair : TypesVector) { + uint64_t Offset = Pair.first; + Type *Ty = Pair.second; + GlobalVariable *NGV = new GlobalVariable( + *GV->getParent(), Ty, false, GlobalVariable::InternalLinkage, + Initializers[Offset], GV->getName() + "." + Twine(NameSuffix++), GV, + GV->getThreadLocalMode(), GV->getAddressSpace()); + NGV->copyAttributesFrom(GV); + NewGlobals.insert({Offset, NGV}); + + // Calculate the known alignment of the field. If the original aggregate + // had 256 byte alignment for example, something might depend on that: + // propagate info to each field. + Align NewAlign = commonAlignment(StartAlignment, Offset); + if (NewAlign > DL.getABITypeAlign(Ty)) + NGV->setAlignment(NewAlign); + + // Copy over the debug info for the variable. + transferSRADebugInfo(GV, NGV, Offset * 8, DL.getTypeAllocSizeInBits(Ty), + VarSize); + } + + // Replace uses of the original global with uses of the new global. + SmallVector<Value *, 16> Worklist; + SmallPtrSet<Value *, 16> Visited; + SmallVector<WeakTrackingVH, 16> DeadInsts; + auto AppendUsers = [&](Value *V) { + for (User *U : V->users()) + if (Visited.insert(U).second) + Worklist.push_back(U); + }; + AppendUsers(GV); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V) || + isa<GEPOperator>(V)) { + AppendUsers(V); + if (isa<Instruction>(V)) + DeadInsts.push_back(V); + continue; + } - // Loop over all of the uses of the global, replacing the constantexpr geps, - // with smaller constantexpr geps or direct references. 
- while (!GV->use_empty()) { - User *GEP = GV->user_back(); - assert(((isa<ConstantExpr>(GEP) && - cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)|| - isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!"); - - // Ignore the 1th operand, which has to be zero or else the program is quite - // broken (undefined). Get the 2nd operand, which is the structure or array - // index. - unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - assert(NewGlobals.count(ElementIdx) == 1); - - Value *NewPtr = NewGlobals[ElementIdx]; - Type *NewTy = NewGlobals[ElementIdx]->getValueType(); - - // Form a shorter GEP if needed. - if (GEP->getNumOperands() > 3) { - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) { - SmallVector<Constant*, 8> Idxs; - Idxs.push_back(NullInt); - for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i) - Idxs.push_back(CE->getOperand(i)); - NewPtr = - ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs); + if (Value *Ptr = getLoadStorePointerOperand(V)) { + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + assert(Ptr == GV && "Load/store must be from/to global"); + GlobalVariable *NGV = NewGlobals[Offset.getZExtValue()]; + assert(NGV && "Must have replacement global for this offset"); + + // Update the pointer operand and recalculate alignment. + Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(V)); + Align NewAlign = + getOrEnforceKnownAlignment(NGV, PrefAlign, DL, cast<Instruction>(V)); + + if (auto *LI = dyn_cast<LoadInst>(V)) { + LI->setOperand(0, NGV); + LI->setAlignment(NewAlign); } else { - GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP); - SmallVector<Value*, 8> Idxs; - Idxs.push_back(NullInt); - for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) - Idxs.push_back(GEPI->getOperand(i)); - NewPtr = GetElementPtrInst::Create( - NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx), - GEPI); - } - } - GEP->replaceAllUsesWith(NewPtr); - - // We changed the pointer of any memory access user. Recalculate alignments. - for (User *U : NewPtr->users()) { - if (auto *Load = dyn_cast<LoadInst>(U)) { - Align PrefAlign = DL.getPrefTypeAlign(Load->getType()); - Align NewAlign = getOrEnforceKnownAlignment(Load->getPointerOperand(), - PrefAlign, DL, Load); - Load->setAlignment(NewAlign); - } - if (auto *Store = dyn_cast<StoreInst>(U)) { - Align PrefAlign = - DL.getPrefTypeAlign(Store->getValueOperand()->getType()); - Align NewAlign = getOrEnforceKnownAlignment(Store->getPointerOperand(), - PrefAlign, DL, Store); - Store->setAlignment(NewAlign); + auto *SI = cast<StoreInst>(V); + SI->setOperand(1, NGV); + SI->setAlignment(NewAlign); } + continue; } - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP)) - GEPI->eraseFromParent(); - else - cast<ConstantExpr>(GEP)->destroyConstant(); + assert(isa<Constant>(V) && isSafeToDestroyConstant(cast<Constant>(V)) && + "Other users can only be dead constants"); } - // Delete the old global, now that it is dead. - Globals.erase(GV); + // Delete old instructions and global. 
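To make the offset-based splitting above concrete, here is a hedged source-level sketch (hypothetical names, not part of this patch) of the kind of internal aggregate the rewritten SRAGlobal can now break apart purely from the (offset, access type) pairs of its loads and stores, independent of GEP structure:

    #include <cstdio>

    namespace {
    // Internal-linkage aggregate that is only accessed at fixed offsets.
    struct Counters {
      int Hits;     // accessed at offset 0 as a 32-bit integer
      double Total; // accessed at offset 8 (typical 64-bit layout) as a double
    };
    Counters G = {0, 0.0};
    } // namespace

    void record(double X) {
      ++G.Hits;     // load/store at offset 0
      G.Total += X; // load/store at offset 8
    }

    int main() {
      record(1.5);
      record(2.5);
      std::printf("%d %f\n", G.Hits, G.Total);
      return 0;
    }

Each accessed offset may then become its own replacement global (here, one i32 and one double), with its initializer folded out of the original aggregate initializer.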
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); + GV->removeDeadConstantUsers(); + GV->eraseFromParent(); ++NumSRA; assert(NewGlobals.size() > 0); @@ -677,7 +617,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, "Should be GlobalVariable"); // This and only this kind of non-signed ICmpInst is to be replaced with // the comparing of the value of the created global init bool later in - // optimizeGlobalAddressOfMalloc for the global variable. + // optimizeGlobalAddressOfAllocation for the global variable. } else { //cerr << "NONTRAPPING USE: " << *U; return false; @@ -895,29 +835,36 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, /// to actually DO the malloc. Instead, turn the malloc into a global, and any /// loads of GV as uses of the new global. static GlobalVariable * -OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, - ConstantInt *NElements, const DataLayout &DL, - TargetLibraryInfo *TLI) { +OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI, + uint64_t AllocSize, Constant *InitVal, + const DataLayout &DL, + TargetLibraryInfo *TLI) { LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n'); - Type *GlobalType; - if (NElements->getZExtValue() == 1) - GlobalType = AllocTy; - else - // If we have an array allocation, the global variable is of an array. - GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue()); + // Create global of type [AllocSize x i8]. + Type *GlobalType = ArrayType::get(Type::getInt8Ty(GV->getContext()), + AllocSize); - // Create the new global variable. The contents of the malloc'd memory is - // undefined, so initialize with an undef value. + // Create the new global variable. The contents of the allocated memory is + // undefined initially, so initialize with an undef value. GlobalVariable *NewGV = new GlobalVariable( *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage, UndefValue::get(GlobalType), GV->getName() + ".body", nullptr, GV->getThreadLocalMode()); - // If there are bitcast users of the malloc (which is typical, usually we have - // a malloc + bitcast) then replace them with uses of the new global. Update - // other users to use the global as well. + // Initialize the global at the point of the original call. Note that this + // is a different point from the initialization referred to below for the + // nullability handling. Sublety: We have not proven the original global was + // only initialized once. As such, we can not fold this into the initializer + // of the new global as may need to re-init the storage multiple times. + if (!isa<UndefValue>(InitVal)) { + IRBuilder<> Builder(CI->getNextNode()); + // TODO: Use alignment above if align!=1 + Builder.CreateMemSet(NewGV, InitVal, AllocSize, None); + } + + // Update users of the allocation to use the new global instead. BitCastInst *TheBC = nullptr; while (!CI->use_empty()) { Instruction *User = cast<Instruction>(CI->user_back()); @@ -1009,7 +956,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, } else GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool); - // Now the GV is dead, nuke it and the malloc.. + // Now the GV is dead, nuke it and the allocation.. 
GV->eraseFromParent(); CI->eraseFromParent(); @@ -1066,15 +1013,33 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI, return true; } -/// This function is called when we see a pointer global variable with a single -/// value stored it that is a malloc or cast of malloc. -static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, - Type *AllocTy, - AtomicOrdering Ordering, - const DataLayout &DL, - TargetLibraryInfo *TLI) { - // If this is a malloc of an abstract type, don't touch it. - if (!AllocTy->isSized()) +/// If we have a global that is only initialized with a fixed size allocation +/// try to transform the program to use global memory instead of heap +/// allocated memory. This eliminates dynamic allocation, avoids an indirection +/// accessing the data, and exposes the resultant global to further GlobalOpt. +static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV, + CallInst *CI, + AtomicOrdering Ordering, + const DataLayout &DL, + TargetLibraryInfo *TLI) { + if (!isAllocRemovable(CI, TLI)) + // Must be able to remove the call when we get done.. + return false; + + Type *Int8Ty = Type::getInt8Ty(CI->getFunction()->getContext()); + Constant *InitVal = getInitialValueOfAllocation(CI, TLI, Int8Ty); + if (!InitVal) + // Must be able to emit a memset for initialization + return false; + + uint64_t AllocSize; + if (!getObjectSize(CI, AllocSize, DL, TLI, ObjectSizeOpts())) + return false; + + // Restrict this transformation to only working on small allocations + // (2048 bytes currently), as we don't want to introduce a 16M global or + // something. + if (AllocSize >= 2048) return false; // We can't optimize this global unless all uses of it are *known* to be @@ -1093,25 +1058,8 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV)) return false; - // If we have a global that is only initialized with a fixed size malloc, - // transform the program to use global memory instead of malloc'd memory. - // This eliminates dynamic allocation, avoids an indirection accessing the - // data, and exposes the resultant global to further GlobalOpt. - // We cannot optimize the malloc if we cannot determine malloc array size. - Value *NElems = getMallocArraySize(CI, DL, TLI, true); - if (!NElems) - return false; - - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - // Restrict this transformation to only working on small allocations - // (2048 bytes currently), as we don't want to introduce a 16M global or - // something. - if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { - OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); - return true; - } - - return false; + OptimizeGlobalAddressOfAllocation(GV, CI, AllocSize, InitVal, DL, TLI); + return true; } // Try to optimize globals based on the knowledge that only one value (besides @@ -1140,12 +1088,12 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, // Optimize away any trapping uses of the loaded value. 
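For the allocation-to-global rewrite above, a hedged source-level sketch of the shape it looks for (hypothetical names, not taken from this patch): a local global that is only ever initialized from a single fixed-size allocation with a known initial value, small enough to fold into an [AllocSize x i8] global plus a memset at the original call site:

    #include <cstdlib>

    namespace {
    // Global only ever set from one fixed-size, zero-initialized allocation.
    int *Buffer = nullptr;
    } // namespace

    void init() {
      // 64 bytes from calloc: fixed size, known zero fill, and well under
      // the 2048-byte cap the pass imposes on this transformation.
      Buffer = static_cast<int *>(calloc(16, sizeof(int)));
    }

    int get(unsigned I) { return Buffer[I]; }

    int main() {
      init();
      return get(3);
    }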
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) { - auto *TLI = &GetTLI(*CI->getFunction()); - Type *MallocType = getMallocAllocatedType(CI, TLI); - if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, - Ordering, DL, TLI)) - return true; + } else if (isAllocationFn(StoredOnceVal, GetTLI)) { + if (auto *CI = dyn_cast<CallInst>(StoredOnceVal)) { + auto *TLI = &GetTLI(*CI->getFunction()); + if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, Ordering, DL, TLI)) + return true; + } } } @@ -1171,9 +1119,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // Walk the use list of the global seeing if all the uses are load or store. // If there is anything else, bail out. - for (User *U : GV->users()) + for (User *U : GV->users()) { if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; + if (getLoadStoreType(U) != GVElType) + return false; + } LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); @@ -1590,11 +1541,25 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // This is restricted to address spaces that allow globals to have // initializers. NVPTX, for example, does not support initializers for // shared memory (AS 3). - if (SOVConstant && SOVConstant->getType() == GV->getValueType() && - isa<UndefValue>(GV->getInitializer()) && + if (SOVConstant && isa<UndefValue>(GV->getInitializer()) && + DL.getTypeAllocSize(SOVConstant->getType()) == + DL.getTypeAllocSize(GV->getValueType()) && CanHaveNonUndefGlobalInitializer) { - // Change the initial value here. - GV->setInitializer(SOVConstant); + if (SOVConstant->getType() == GV->getValueType()) { + // Change the initializer in place. + GV->setInitializer(SOVConstant); + } else { + // Create a new global with adjusted type. + auto *NGV = new GlobalVariable( + *GV->getParent(), SOVConstant->getType(), GV->isConstant(), + GV->getLinkage(), SOVConstant, "", GV, GV->getThreadLocalMode(), + GV->getAddressSpace()); + NGV->takeName(GV); + NGV->copyAttributesFrom(GV); + GV->replaceAllUsesWith(ConstantExpr::getBitCast(NGV, GV->getType())); + GV->eraseFromParent(); + GV = NGV; + } // Clean up any obviously simplifiable users now. CleanupConstantGlobalUsers(GV, DL); @@ -2066,194 +2031,6 @@ OptimizeGlobalVars(Module &M, return Changed; } -/// Evaluate a piece of a constantexpr store into a global initializer. This -/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the -/// GEP operands of Addr [0, OpNo) have been stepped into. -static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, - ConstantExpr *Addr, unsigned OpNo) { - // Base case of the recursion. - if (OpNo == Addr->getNumOperands()) { - assert(Val->getType() == Init->getType() && "Type mismatch!"); - return Val; - } - - SmallVector<Constant*, 32> Elts; - if (StructType *STy = dyn_cast<StructType>(Init->getType())) { - // Break up the constant into its elements. - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - - // Replace the element that we are supposed to. - ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo)); - unsigned Idx = CU->getZExtValue(); - assert(Idx < STy->getNumElements() && "Struct index out of range!"); - Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1); - - // Return the modified struct. 
- return ConstantStruct::get(STy, Elts); - } - - ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo)); - uint64_t NumElts; - if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType())) - NumElts = ATy->getNumElements(); - else - NumElts = cast<FixedVectorType>(Init->getType())->getNumElements(); - - // Break up the array into elements. - for (uint64_t i = 0, e = NumElts; i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - - assert(CI->getZExtValue() < NumElts); - Elts[CI->getZExtValue()] = - EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1); - - if (Init->getType()->isArrayTy()) - return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts); - return ConstantVector::get(Elts); -} - -/// We have decided that Addr (which satisfies the predicate -/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. -static void CommitValueTo(Constant *Val, Constant *Addr) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { - assert(GV->hasInitializer()); - GV->setInitializer(Val); - return; - } - - ConstantExpr *CE = cast<ConstantExpr>(Addr); - GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); - GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2)); -} - -/// Given a map of address -> value, where addresses are expected to be some form -/// of either a global or a constant GEP, set the initializer for the address to -/// be the value. This performs mostly the same function as CommitValueTo() -/// and EvaluateStoreInto() but is optimized to be more efficient for the common -/// case where the set of addresses are GEPs sharing the same underlying global, -/// processing the GEPs in batches rather than individually. -/// -/// To give an example, consider the following C++ code adapted from the clang -/// regression tests: -/// struct S { -/// int n = 10; -/// int m = 2 * n; -/// S(int a) : n(a) {} -/// }; -/// -/// template<typename T> -/// struct U { -/// T *r = &q; -/// T q = 42; -/// U *p = this; -/// }; -/// -/// U<S> e; -/// -/// The global static constructor for 'e' will need to initialize 'r' and 'p' of -/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm' -/// members. This batch algorithm will simply use general CommitValueTo() method -/// to handle the complex nested S struct initialization of 'q', before -/// processing the outermost members in a single batch. Using CommitValueTo() to -/// handle member in the outer struct is inefficient when the struct/array is -/// very large as we end up creating and destroy constant arrays for each -/// initialization. -/// For the above case, we expect the following IR to be generated: -/// -/// %struct.U = type { %struct.S*, %struct.S, %struct.U* } -/// %struct.S = type { i32, i32 } -/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e, -/// i64 0, i32 1), -/// %struct.S { i32 42, i32 84 }, %struct.U* @e } -/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex -/// constant expression, while the other two elements of @e are "simple". 
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) { - SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs; - SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs; - SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs; - SimpleCEs.reserve(Mem.size()); - - for (const auto &I : Mem) { - if (auto *GV = dyn_cast<GlobalVariable>(I.first)) { - GVs.push_back(std::make_pair(GV, I.second)); - } else { - ConstantExpr *GEP = cast<ConstantExpr>(I.first); - // We don't handle the deeply recursive case using the batch method. - if (GEP->getNumOperands() > 3) - ComplexCEs.push_back(std::make_pair(GEP, I.second)); - else - SimpleCEs.push_back(std::make_pair(GEP, I.second)); - } - } - - // The algorithm below doesn't handle cases like nested structs, so use the - // slower fully general method if we have to. - for (auto ComplexCE : ComplexCEs) - CommitValueTo(ComplexCE.second, ComplexCE.first); - - for (auto GVPair : GVs) { - assert(GVPair.first->hasInitializer()); - GVPair.first->setInitializer(GVPair.second); - } - - if (SimpleCEs.empty()) - return; - - // We cache a single global's initializer elements in the case where the - // subsequent address/val pair uses the same one. This avoids throwing away and - // rebuilding the constant struct/vector/array just because one element is - // modified at a time. - SmallVector<Constant *, 32> Elts; - Elts.reserve(SimpleCEs.size()); - GlobalVariable *CurrentGV = nullptr; - - auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) { - Constant *Init = GV->getInitializer(); - Type *Ty = Init->getType(); - if (Update) { - if (CurrentGV) { - assert(CurrentGV && "Expected a GV to commit to!"); - Type *CurrentInitTy = CurrentGV->getInitializer()->getType(); - // We have a valid cache that needs to be committed. - if (StructType *STy = dyn_cast<StructType>(CurrentInitTy)) - CurrentGV->setInitializer(ConstantStruct::get(STy, Elts)); - else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy)) - CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts)); - else - CurrentGV->setInitializer(ConstantVector::get(Elts)); - } - if (CurrentGV == GV) - return; - // Need to clear and set up cache for new initializer. - CurrentGV = GV; - Elts.clear(); - unsigned NumElts; - if (auto *STy = dyn_cast<StructType>(Ty)) - NumElts = STy->getNumElements(); - else if (auto *ATy = dyn_cast<ArrayType>(Ty)) - NumElts = ATy->getNumElements(); - else - NumElts = cast<FixedVectorType>(Ty)->getNumElements(); - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - } - }; - - for (auto CEPair : SimpleCEs) { - ConstantExpr *GEP = CEPair.first; - Constant *Val = CEPair.second; - - GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0)); - commitAndSetupCache(GV, GV != CurrentGV); - ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2)); - Elts[CI->getZExtValue()] = Val; - } - // The last initializer in the list needs to be committed, others - // will be committed on a new initializer being processed. - commitAndSetupCache(CurrentGV, true); -} - /// Evaluate static constructors in the function, if we can. Return true if we /// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, @@ -2268,10 +2045,12 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, ++NumCtorsEvaluated; // We succeeded at evaluation: commit the result. 
+ auto NewInitializers = Eval.getMutatedInitializers(); LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" - << F->getName() << "' to " - << Eval.getMutatedMemory().size() << " stores.\n"); - BatchCommitValueTo(Eval.getMutatedMemory()); + << F->getName() << "' to " << NewInitializers.size() + << " stores.\n"); + for (const auto &Pair : NewInitializers) + Pair.first->setInitializer(Pair.second); for (GlobalVariable *GV : Eval.getInvariants()) GV->setConstant(true); } diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index b8a314c54f18..e064fbbef595 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -36,8 +36,14 @@ using namespace IRSimilarity; // A command flag to be used for debugging to exclude branches from similarity // matching and outlining. +namespace llvm { extern cl::opt<bool> DisableBranches; +// A command flag to be used for debugging to exclude indirect calls from similarity +// matching and outlining. +extern cl::opt<bool> DisableIndirectCalls; +} // namespace llvm + // Set to true if the user wants the ir outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. Since the outliner is confined to a single module (modulo LTO), @@ -104,6 +110,16 @@ struct OutlinableGroup { /// of the region. unsigned BranchesToOutside = 0; + /// Tracker counting backwards from the highest unsigned value possible to + /// avoid conflicting with the GVNs of assigned values. We start at -3 since + /// -2 and -1 are assigned by the DenseMap. + unsigned PHINodeGVNTracker = -3; + + DenseMap<unsigned, + std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>> + PHINodeGVNToGVNs; + DenseMap<hash_code, unsigned> GVNsToPHINodeGVN; + /// The number of instructions that will be outlined by extracting \ref /// Regions. InstructionCost Benefit = 0; @@ -169,6 +185,44 @@ Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other, return FoundValueOpt.getValueOr(nullptr); } +/// Rewrite the BranchInsts in the incoming blocks to \p PHIBlock that are found +/// in \p Included to branch to BasicBlock \p Replace if they currently branch +/// to the BasicBlock \p Find. This is used to fix up the incoming basic blocks +/// when PHINodes are included in outlined regions. +/// +/// \param PHIBlock - The BasicBlock containing the PHINodes that need to be +/// checked. +/// \param Find - The successor block to be replaced. +/// \param Replace - The new successor block to branch to. +/// \param Included - The set of blocks about to be outlined. +static void replaceTargetsFromPHINode(BasicBlock *PHIBlock, BasicBlock *Find, + BasicBlock *Replace, + DenseSet<BasicBlock *> &Included) { + for (PHINode &PN : PHIBlock->phis()) { + for (unsigned Idx = 0, PNEnd = PN.getNumIncomingValues(); Idx != PNEnd; + ++Idx) { + // Check if the incoming block is included in the set of blocks being + // outlined. + BasicBlock *Incoming = PN.getIncomingBlock(Idx); + if (!Included.contains(Incoming)) + continue; + + BranchInst *BI = dyn_cast<BranchInst>(Incoming->getTerminator()); + assert(BI && "Not a branch instruction?"); + // Look over the branching instructions into this block to see if we + // used to branch to Find in this outlined block. + for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ != End; + Succ++) { + // If we have found the block to replace, we do so here.
+ if (BI->getSuccessor(Succ) != Find) + continue; + BI->setSuccessor(Succ, Replace); + } + } + } +} + + void OutlinableRegion::splitCandidate() { assert(!CandidateSplit && "Candidate already split!"); @@ -199,6 +253,39 @@ void OutlinableRegion::splitCandidate() { StartBB = StartInst->getParent(); PrevBB = StartBB; + DenseSet<BasicBlock *> BBSet; + Candidate->getBasicBlocks(BBSet); + + // We iterate over the instructions in the region, if we find a PHINode, we + // check if there are predecessors outside of the region, if there are, + // we ignore this region since we are unable to handle the severing of the + // phi node right now. + BasicBlock::iterator It = StartInst->getIterator(); + while (PHINode *PN = dyn_cast<PHINode>(&*It)) { + unsigned NumPredsOutsideRegion = 0; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (!BBSet.contains(PN->getIncomingBlock(i))) + ++NumPredsOutsideRegion; + + if (NumPredsOutsideRegion > 1) + return; + + It++; + } + + // If the region starts with a PHINode, but is not the initial instruction of + // the BasicBlock, we ignore this region for now. + if (isa<PHINode>(StartInst) && StartInst != &*StartBB->begin()) + return; + + // If the region ends with a PHINode, but does not contain all of the phi node + // instructions of the region, we ignore it for now. + if (isa<PHINode>(BackInst)) { + EndBB = BackInst->getParent(); + if (BackInst != &*std::prev(EndBB->getFirstInsertionPt())) + return; + } + // The basic block gets split like so: // block: block: // inst1 inst1 @@ -225,12 +312,20 @@ void OutlinableRegion::splitCandidate() { FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB); FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB); - return; + } else { + EndBB = BackInst->getParent(); + EndsInBranch = true; + FollowBB = nullptr; } - EndBB = BackInst->getParent(); - EndsInBranch = true; - FollowBB = nullptr; + // Refind the basic block set. + BBSet.clear(); + Candidate->getBasicBlocks(BBSet); + // For the phi nodes in the new starting basic block of the region, we + // reassign the targets of the basic blocks branching instructions. + replaceTargetsFromPHINode(StartBB, PrevBB, StartBB, BBSet); + if (FollowBB) + replaceTargetsFromPHINode(FollowBB, EndBB, FollowBB, BBSet); } void OutlinableRegion::reattachCandidate() { @@ -252,15 +347,21 @@ void OutlinableRegion::reattachCandidate() { // inst4 assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); - // StartBB should only have one predecessor since we put an unconditional - // branch at the end of PrevBB when we split the BasicBlock. - PrevBB = StartBB->getSinglePredecessor(); - assert(PrevBB != nullptr && - "No Predecessor for the region start basic block!"); - assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); PrevBB->getTerminator()->eraseFromParent(); + // If we reattaching after outlining, we iterate over the phi nodes to + // the initial block, and reassign the branch instructions of the incoming + // blocks to the block we are remerging into. 
+ if (!ExtractedFunction) { + DenseSet<BasicBlock *> BBSet; + Candidate->getBasicBlocks(BBSet); + + replaceTargetsFromPHINode(StartBB, StartBB, PrevBB, BBSet); + if (!EndsInBranch) + replaceTargetsFromPHINode(FollowBB, FollowBB, EndBB, BBSet); + } + moveBBContents(*StartBB, *PrevBB); BasicBlock *PlacementBB = PrevBB; @@ -354,6 +455,24 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { return Benefit; } +/// Check the \p OutputMappings structure for value \p Input, if it exists +/// it has been used as an output for outlining, and has been renamed, and we +/// return the new value, otherwise, we return the same value. +/// +/// \param OutputMappings [in] - The mapping of values to their renamed value +/// after being used as an output for an outlined region. +/// \param Input [in] - The value to find the remapped value of, if it exists. +/// \return The remapped value if it has been renamed, and the same value if has +/// not. +static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings, + Value *Input) { + DenseMap<Value *, Value *>::const_iterator OutputMapping = + OutputMappings.find(Input); + if (OutputMapping != OutputMappings.end()) + return OutputMapping->second; + return Input; +} + /// Find whether \p Region matches the global value numbering to Constant /// mapping found so far. /// @@ -830,6 +949,209 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, Region.NumExtractedInputs = OriginalIndex; } +/// Check if the \p V has any uses outside of the region other than \p PN. +/// +/// \param V [in] - The value to check. +/// \param PHILoc [in] - The location in the PHINode of \p V. +/// \param PN [in] - The PHINode using \p V. +/// \param Exits [in] - The potential blocks we exit to from the outlined +/// region. +/// \param BlocksInRegion [in] - The basic blocks contained in the region. +/// \returns true if \p V has any use soutside its region other than \p PN. +static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN, + SmallPtrSet<BasicBlock *, 1> &Exits, + DenseSet<BasicBlock *> &BlocksInRegion) { + // We check to see if the value is used by the PHINode from some other + // predecessor not included in the region. If it is, we make sure + // to keep it as an output. + SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues()); + std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0); + if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) { + return (Idx != PHILoc && V == PN.getIncomingValue(Idx) && + !BlocksInRegion.contains(PN.getIncomingBlock(Idx))); + })) + return true; + + // Check if the value is used by any other instructions outside the region. + return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) { + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + return false; + + // If the use of the item is inside the region, we skip it. Uses + // inside the region give us useful information about how the item could be + // used as an output. + BasicBlock *Parent = I->getParent(); + if (BlocksInRegion.contains(Parent)) + return false; + + // If it's not a PHINode then we definitely know the use matters. This + // output value will not completely combined with another item in a PHINode + // as it is directly reference by another non-phi instruction + if (!isa<PHINode>(I)) + return true; + + // If we have a PHINode outside one of the exit locations, then it + // can be considered an outside use as well. 
If there is a PHINode + // contained in the Exit where this values use matters, it will be + // caught when we analyze that PHINode. + if (!Exits.contains(Parent)) + return true; + + return false; + }); +} + +/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be +/// considered outputs. A PHINodes is an output when more than one incoming +/// value has been marked by the CodeExtractor as an output. +/// +/// \param CurrentExitFromRegion [in] - The block to analyze. +/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the +/// region. +/// \param RegionBlocks [in] - The basic blocks in the region. +/// \param Outputs [in, out] - The existing outputs for the region, we may add +/// PHINodes to this as we find that they replace output values. +/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are +/// totally replaced by a PHINode. +/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used +/// in PHINodes, but have other uses, and should still be considered outputs. +static void analyzeExitPHIsForOutputUses( + BasicBlock *CurrentExitFromRegion, + SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion, + DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs, + DenseSet<Value *> &OutputsReplacedByPHINode, + DenseSet<Value *> &OutputsWithNonPhiUses) { + for (PHINode &PN : CurrentExitFromRegion->phis()) { + // Find all incoming values from the outlining region. + SmallVector<unsigned, 2> IncomingVals; + for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I) + if (RegionBlocks.contains(PN.getIncomingBlock(I))) + IncomingVals.push_back(I); + + // Do not process PHI if there are no predecessors from region. + unsigned NumIncomingVals = IncomingVals.size(); + if (NumIncomingVals == 0) + continue; + + // If there is one predecessor, we mark it as a value that needs to be kept + // as an output. + if (NumIncomingVals == 1) { + Value *V = PN.getIncomingValue(*IncomingVals.begin()); + OutputsWithNonPhiUses.insert(V); + OutputsReplacedByPHINode.erase(V); + continue; + } + + // This PHINode will be used as an output value, so we add it to our list. + Outputs.insert(&PN); + + // Not all of the incoming values should be ignored as other inputs and + // outputs may have uses in outlined region. If they have other uses + // outside of the single PHINode we should not skip over it. + for (unsigned Idx : IncomingVals) { + Value *V = PN.getIncomingValue(Idx); + if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) { + OutputsWithNonPhiUses.insert(V); + OutputsReplacedByPHINode.erase(V); + continue; + } + if (!OutputsWithNonPhiUses.contains(V)) + OutputsReplacedByPHINode.insert(V); + } + } +} + +// Represents the type for the unsigned number denoting the output number for +// phi node, along with the canonical number for the exit block. +using ArgLocWithBBCanon = std::pair<unsigned, unsigned>; +// The list of canonical numbers for the incoming values to a PHINode. +using CanonList = SmallVector<unsigned, 2>; +// The pair type representing the set of canonical values being combined in the +// PHINode, along with the location data for the PHINode. +using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>; + +/// Encode \p PND as an integer for easy lookup based on the argument location, +/// the parent BasicBlock canonical numbering, and the canonical numbering of +/// the values stored in the PHINode. +/// +/// \param PND - The data to hash. +/// \returns The hash code of \p PND. 
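As a hedged source-level picture of why exit-block PHINodes need this treatment (hypothetical code, not from this patch): when two similar regions feed the same join point, the value leaving each region is only visible through a PHINode in the shared successor, so it is that PHINode, rather than the individual definitions, that has to be modelled as the region's output:

    #include <cstdio>

    // Two structurally similar regions whose results meet at a join point.
    static int pick(int A, int B, bool TakeFirst) {
      int R;
      if (TakeFirst)
        R = A * 3 + 1; // candidate region one
      else
        R = B * 3 + 1; // candidate region two
      return R;        // in IR, R reaches the return through a PHI in the exit block
    }

    int main() {
      std::printf("%d\n", pick(2, 5, true));
      return 0;
    }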
+static hash_code encodePHINodeData(PHINodeData &PND) { + return llvm::hash_combine( + llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second), + llvm::hash_combine_range(PND.second.begin(), PND.second.end())); +} + +/// Create a special GVN for PHINodes that will be used outside of +/// the region. We create a hash code based on the Canonical number of the +/// parent BasicBlock, the canonical numbering of the values stored in the +/// PHINode and the aggregate argument location. This is used to find whether +/// this PHINode type has been given a canonical numbering already. If not, we +/// assign it a value and store it for later use. The value is returned to +/// identify different output schemes for the set of regions. +/// +/// \param Region - The region that \p PN is an output for. +/// \param PN - The PHINode we are analyzing. +/// \param AggArgIdx - The argument \p PN will be stored into. +/// \returns An optional holding the assigned canonical number, or None if +/// there is some attribute of the PHINode blocking it from being used. +static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region, + PHINode *PN, unsigned AggArgIdx) { + OutlinableGroup &Group = *Region.Parent; + IRSimilarityCandidate &Cand = *Region.Candidate; + BasicBlock *PHIBB = PN->getParent(); + CanonList PHIGVNs; + for (Value *Incoming : PN->incoming_values()) { + // If we cannot find a GVN, this means that the input to the PHINode is + // not included in the region we are trying to analyze, meaning, that if + // it was outlined, we would be adding an extra input. We ignore this + // case for now, and so ignore the region. + Optional<unsigned> OGVN = Cand.getGVN(Incoming); + if (!OGVN.hasValue()) { + Region.IgnoreRegion = true; + return None; + } + + // Collect the canonical numbers of the values in the PHINode. + unsigned GVN = OGVN.getValue(); + OGVN = Cand.getCanonicalNum(GVN); + assert(OGVN.hasValue() && "No GVN found for incoming value?"); + PHIGVNs.push_back(*OGVN); + } + + // Now that we have the GVNs for the incoming values, we are going to combine + // them with the GVN of the incoming bock, and the output location of the + // PHINode to generate a hash value representing this instance of the PHINode. + DenseMap<hash_code, unsigned>::iterator GVNToPHIIt; + DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt; + Optional<unsigned> BBGVN = Cand.getGVN(PHIBB); + assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!"); + + BBGVN = Cand.getCanonicalNum(BBGVN.getValue()); + assert(BBGVN.hasValue() && + "Could not find canonical number for the incoming block!"); + // Create a pair of the exit block canonical value, and the aggregate + // argument location, connected to the canonical numbers stored in the + // PHINode. + PHINodeData TemporaryPair = + std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs); + hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair); + + // Look for and create a new entry in our connection between canonical + // numbers for PHINodes, and the set of objects we just created. 
+ GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash); + if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) { + bool Inserted = false; + std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert( + std::make_pair(Group.PHINodeGVNTracker, TemporaryPair)); + std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert( + std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--)); + } + + return GVNToPHIIt->second; +} + /// Create a mapping of the output arguments for the \p Region to the output /// arguments of the overall outlined function. /// @@ -842,35 +1164,25 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, IRSimilarityCandidate &C = *Region.Candidate; SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - C.getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + C.getBasicBlocks(BlocksInRegion, BE); // Find the exits to the region. SmallPtrSet<BasicBlock *, 1> Exits; for (BasicBlock *Block : BE) for (BasicBlock *Succ : successors(Block)) - if (!BBSet.contains(Succ)) + if (!BlocksInRegion.contains(Succ)) Exits.insert(Succ); // After determining which blocks exit to PHINodes, we add these PHINodes to // the set of outputs to be processed. We also check the incoming values of // the PHINodes for whether they should no longer be considered outputs. - for (BasicBlock *ExitBB : Exits) { - for (PHINode &PN : ExitBB->phis()) { - // Find all incoming values from the outlining region. - SmallVector<unsigned, 2> IncomingVals; - for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) - if (BBSet.contains(PN.getIncomingBlock(Idx))) - IncomingVals.push_back(Idx); - - // Do not process PHI if there is one (or fewer) predecessor from region. - if (IncomingVals.size() <= 1) - continue; - - Region.IgnoreRegion = true; - return; - } - } + DenseSet<Value *> OutputsReplacedByPHINode; + DenseSet<Value *> OutputsWithNonPhiUses; + for (BasicBlock *ExitBB : Exits) + analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs, + OutputsReplacedByPHINode, + OutputsWithNonPhiUses); // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -893,9 +1205,13 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (OutputsReplacedByPHINode.contains(Output)) + continue; + + unsigned AggArgIdx = 0; for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -907,7 +1223,7 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + AggArgIdx = Jdx; break; } @@ -916,18 +1232,54 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, // function to handle this output and create a mapping to it. if (!TypeFound) { Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType())); - AggArgsUsed.insert(Group.ArgumentTypes.size() - 1); + // Mark the new pointer type as the last value in the aggregate argument + // list. 
+ unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1; + AggArgsUsed.insert(ArgTypeIdx); Region.ExtractedArgToAgg.insert( - std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); + std::make_pair(OriginalIndex, ArgTypeIdx)); Region.AggArgToExtracted.insert( - std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + std::make_pair(ArgTypeIdx, OriginalIndex)); + AggArgIdx = ArgTypeIdx; + } + + // TODO: Adapt to the extra input from the PHINode. + PHINode *PN = dyn_cast<PHINode>(Output); + + Optional<unsigned> GVN; + if (PN && !BlocksInRegion.contains(PN->getParent())) { + // Values outside the region can be combined into PHINode when we + // have multiple exits. We collect both of these into a list to identify + // which values are being used in the PHINode. Each list identifies a + // different PHINode, and a different output. We store the PHINode as it's + // own canonical value. These canonical values are also dependent on the + // output argument it is saved to. + + // If two PHINodes have the same canonical values, but different aggregate + // argument locations, then they will have distinct Canonical Values. + GVN = getGVNForPHINode(Region, PN, AggArgIdx); + if (!GVN.hasValue()) + return; + } else { + // If we do not have a PHINode we use the global value numbering for the + // output value, to find the canonical number to add to the set of stored + // values. + GVN = C.getGVN(Output); + GVN = C.getCanonicalNum(*GVN); } - stable_sort(Region.GVNStores); + // Each region has a potentially unique set of outputs. We save which + // values are output in a list of canonical values so we can differentiate + // among the different store schemes. + Region.GVNStores.push_back(*GVN); + OriginalIndex++; TypeIndex++; } + + // We sort the stored values to make sure that we are not affected by analysis + // order when determining what combination of items were stored. + stable_sort(Region.GVNStores); } void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, @@ -1063,6 +1415,214 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { return Call; } +/// Find or create a BasicBlock in the outlined function containing PhiBlocks +/// for \p RetVal. +/// +/// \param Group - The OutlinableGroup containing the information about the +/// overall outlined function. +/// \param RetVal - The return value or exit option that we are currently +/// evaluating. +/// \returns The found or newly created BasicBlock to contain the needed +/// PHINodes to be used as outputs. +static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) { + DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal, + ReturnBlockForRetVal; + PhiBlockForRetVal = Group.PHIBlocks.find(RetVal); + ReturnBlockForRetVal = Group.EndBBs.find(RetVal); + assert(ReturnBlockForRetVal != Group.EndBBs.end() && + "Could not find output value!"); + BasicBlock *ReturnBB = ReturnBlockForRetVal->second; + + // Find if a PHIBlock exists for this return value already. If it is + // the first time we are analyzing this, we will not, so we record it. + PhiBlockForRetVal = Group.PHIBlocks.find(RetVal); + if (PhiBlockForRetVal != Group.PHIBlocks.end()) + return PhiBlockForRetVal->second; + + // If we did not find a block, we create one, and insert it into the + // overall function and record it. 
+ bool Inserted = false; + BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block", + ReturnBB->getParent()); + std::tie(PhiBlockForRetVal, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + + // We find the predecessors of the return block in the newly created outlined + // function in order to point them to the new PHIBlock rather than the already + // existing return block. + SmallVector<BranchInst *, 2> BranchesToChange; + for (BasicBlock *Pred : predecessors(ReturnBB)) + BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator())); + + // Now we mark the branch instructions found, and change the references of the + // return block to the newly created PHIBlock. + for (BranchInst *BI : BranchesToChange) + for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) { + if (BI->getSuccessor(Succ) != ReturnBB) + continue; + BI->setSuccessor(Succ, PHIBlock); + } + + BranchInst::Create(ReturnBB, PHIBlock); + + return PhiBlockForRetVal->second; +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has already been replaced with a call to the overall, aggregate +/// function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. +static Value * +getPassedArgumentInAlreadyOutlinedFunction(const Argument *A, + const OutlinableRegion &Region) { + // If we don't need to adjust the argument number at all (since the call + // has already been replaced by a call to the overall outlined function) + // we can just get the specified argument. + return Region.Call->getArgOperand(A->getArgNo()); +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has only been replaced by the call to the aggregate function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. +static Value * +getPassedArgumentAndAdjustArgumentLocation(const Argument *A, + const OutlinableRegion &Region) { + unsigned ArgNum = A->getArgNo(); + + // If it is a constant, we can look at our mapping from when we created + // the outputs to figure out what the constant value is. + if (Region.AggArgToConstant.count(ArgNum)) + return Region.AggArgToConstant.find(ArgNum)->second; + + // If it is not a constant, and we are not looking at the overall function, we + // need to adjust which argument we are looking at. + ArgNum = Region.AggArgToExtracted.find(ArgNum)->second; + return Region.Call->getArgOperand(ArgNum); +} + +/// Find the canonical numbering for the incoming Values into the PHINode \p PN. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \param CanonNums [out] - The canonical numbering for the incoming values to +/// \p PN. +/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call +/// of \p Region rather than the overall function's call. 
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+                    const DenseMap<Value *, Value *> &OutputMappings,
+                    DenseSet<unsigned> &CanonNums,
+                    bool ReplacedWithOutlinedCall = true) {
+  // Iterate over the incoming values.
+  for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+    Value *IVal = PN->getIncomingValue(Idx);
+    // If we have an argument as incoming value, we need to grab the passed
+    // value from the call itself.
+    if (Argument *A = dyn_cast<Argument>(IVal)) {
+      if (ReplacedWithOutlinedCall)
+        IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region);
+      else
+        IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region);
+    }
+
+    // Get the original value if it has been replaced by an output value.
+    IVal = findOutputMapping(OutputMappings, IVal);
+
+    // Find and add the canonical number for the incoming value.
+    Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+    assert(GVN.hasValue() && "No GVN for incoming value");
+    Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+    assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+    CanonNums.insert(*CanonNum);
+  }
+}
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from the
+/// outlined region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+                       BasicBlock *OverallPhiBlock,
+                       const DenseMap<Value *, Value *> &OutputMappings) {
+  OutlinableGroup &Group = *Region.Parent;
+
+  DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have not yet merged this
+  // region into the overall function. We make sure to reassign the argument
+  // numbering since it is possible that the argument ordering is different
+  // between the functions.
+  findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+                      /* ReplacedWithOutlinedCall = */ false);
+
+  OutlinableRegion *FirstRegion = Group.Regions[0];
+  DenseSet<unsigned> CurrentCanonNums;
+  // Find the canonical numbering for each PHINode; if it matches, we replace
+  // the uses of the PHINode we are searching for with the found PHINode.
+  for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+    CurrentCanonNums.clear();
+    findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+                        /* ReplacedWithOutlinedCall = */ true);
+
+    if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+          return CurrentCanonNums.contains(CanonNum);
+        }))
+      return &CurrPN;
+  }
+
+  // If we've made it here, it means we weren't able to replace the PHINode, so
+  // we must insert it ourselves.
+  PHINode *NewPN = cast<PHINode>(PN.clone());
+  NewPN->insertBefore(&*OverallPhiBlock->begin());
+  for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+       Idx++) {
+    Value *IncomingVal = NewPN->getIncomingValue(Idx);
+    BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+    // Find the corresponding basic block in the overall function for the
+    // incoming block.
+    Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+    assert(FirstNonPHI && "Incoming block is empty?");
+    Value *CorrespondingVal =
+        Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+    assert(CorrespondingVal && "Value is nullptr?");
+    BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+    NewPN->setIncomingBlock(Idx, BlockToUse);
+
+    // If we have an argument, we make sure we replace it using the argument
+    // from the correct function.
+    if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+      Value *Val = Group.OutlinedFunction->getArg(A->getArgNo());
+      NewPN->setIncomingValue(Idx, Val);
+      continue;
+    }
+
+    // Find the corresponding value in the overall function.
+    IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+    Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+    assert(Val && "Value is nullptr?");
+    NewPN->setIncomingValue(Idx, Val);
+  }
+  return NewPN;
+}
+
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
@@ -1075,6 +1635,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 static void
 replaceArgumentUses(OutlinableRegion &Region,
                     DenseMap<Value *, BasicBlock *> &OutputBBs,
+                    const DenseMap<Value *, Value *> &OutputMappings,
                     bool FirstFunction = false) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1144,12 +1705,47 @@ replaceArgumentUses(OutlinableRegion &Region,
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
                         << *OutputBB << "\n");

-      if (FirstFunction)
+      // If this is storing a PHINode, we must make sure it is included in the
+      // overall function.
+      if (!isa<PHINode>(ValueOperand) ||
+          Region.Candidate->getGVN(ValueOperand).hasValue()) {
+        if (FirstFunction)
+          continue;
+        Value *CorrVal =
+            Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+        assert(CorrVal && "Value is nullptr?");
+        NewI->setOperand(0, CorrVal);
+        continue;
+      }
+      PHINode *PN = cast<PHINode>(SI->getValueOperand());
+      // If it has a value, it was not split by the code extractor, which
+      // is what we are looking for.
+      if (Region.Candidate->getGVN(PN).hasValue())
         continue;
-      Value *CorrVal =
-          Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
-      assert(CorrVal && "Value is nullptr?");
-      NewI->setOperand(0, CorrVal);
+
+      // We record the parent block for the PHINode in the Region so that
+      // we can exclude it from checks later on.
+      Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about merging
+      // this with any other block in the overall outlined function, so we can
+      // just continue.
+      if (FirstFunction) {
+        BasicBlock *PHIBlock = PN->getParent();
+        Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+        continue;
+      }
+
+      // We look for the aggregate block that contains the PHINodes leading into
+      // this exit path. If we can't find one, we create one.
+      BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+      // For our PHINode, we find the combined canonical numbering, and
+      // attempt to find a matching PHINode in the overall PHIBlock. If we
+      // cannot, we copy the PHINode and move it into this new block.
+      PHINode *NewPN =
+          findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+      NewI->setOperand(0, NewPN);
     }

     // If we added an edge for basic blocks without a predecessor, we remove it
@@ -1390,7 +1986,12 @@ void createSwitchStatement(
     Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
   // We only need the switch statement if there is more than one store
-  // combination.
+  // combination, or there is more than one set of output blocks. The first
+  // will occur when we store different sets of values for two different
+  // regions. The second will occur when we have two outputs that are combined
+  // in a PHINode outside of the region in one outlined instance, and are used
+  // separately in another. This will create the same set of OutputGVNs, but
+  // will generate two different output schemes.
   if (OG.OutputGVNCombinations.size() > 1) {
     Function *AggFunc = OG.OutlinedFunction;
     // Create a final block for each different return block.
@@ -1433,8 +2034,14 @@ void createSwitchStatement(
     return;
   }

+  assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
   // If there needs to be stores, move them from the output blocks to their
-  // corresponding ending block.
+  // corresponding ending block. We do not check that OutputGVNCombinations has
+  // exactly one element here, since that could just be the case where there
+  // are 0 outputs. Instead, we check whether there is more than one set of
+  // output blocks, since that is the only case where we would have to move
+  // the stores and erase the extraneous blocks.
   if (OutputStoreBBs.size() == 1) {
     LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
                       << *OG.OutlinedFunction << "\n");
@@ -1466,10 +2073,13 @@ void createSwitchStatement(
 /// set of stores needed for the different functions.
 /// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
+/// \param [in] OutputMappings - The mapping of output values from the outlined
+/// region to their original values.
 static void fillOverallFunction(
     Module &M, OutlinableGroup &CurrentGroup,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
-    std::vector<Function *> &FuncsToRemove) {
+    std::vector<Function *> &FuncsToRemove,
+    const DenseMap<Value *, Value *> &OutputMappings) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];

   // Move first extracted function's instructions into new function.
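For orientation, the matching rule implemented by findCanonNumsForPHI and findOrCreatePHIInBlock above can be restated in isolation: summarize a PHINode by the set of canonical numbers of its incoming values, reuse an existing PHINode in the overall PHI block whenever that PHINode's set covers the candidate's, and clone the candidate into the block otherwise. The following is only a self-contained sketch of that rule using plain standard-library containers; SketchPHI and findMatchingPHI are invented stand-ins, not anything from the patch.

// Minimal sketch, assuming each PHI is already reduced to its canonical-number set.
#include <cstdio>
#include <set>
#include <vector>

using CanonSet = std::set<unsigned>;

struct SketchPHI {
  int Id;         // stands in for a PHINode already in the overall PHI block
  CanonSet Canon; // canonical numbers of its incoming values
};

// Returns the Id of a PHI whose canonical set covers Candidate, or -1 when no
// such PHI exists and the caller has to clone the candidate into the block.
static int findMatchingPHI(const std::vector<SketchPHI> &Block,
                           const CanonSet &Candidate) {
  for (const SketchPHI &PN : Block) {
    bool Covers = true;
    for (unsigned C : Candidate)
      if (!PN.Canon.count(C)) {
        Covers = false;
        break;
      }
    if (Covers)
      return PN.Id;
  }
  return -1;
}

int main() {
  std::vector<SketchPHI> Block = {{0, {3, 7}}, {1, {2, 5, 9}}};
  std::printf("{2,5} -> %d\n", findMatchingPHI(Block, {2, 5})); // reuses PHI 1
  std::printf("{4,7} -> %d\n", findMatchingPHI(Block, {4, 7})); // -1: clone
  return 0;
}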
@@ -1489,7 +2099,7 @@ static void fillOverallFunction( CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBBs, true); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true); replaceConstants(*CurrentOS); // We first identify if any output blocks are empty, if they are we remove @@ -1523,7 +2133,8 @@ void IROutliner::deduplicateExtractedSections( OutlinableRegion *CurrentOS; - fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove, + OutputMappings); std::vector<Value *> SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { @@ -1537,8 +2148,7 @@ void IROutliner::deduplicateExtractedSections( createAndInsertBasicBlocks( CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, "output_block_" + Twine(static_cast<unsigned>(Idx))); - - replaceArgumentUses(*CurrentOS, NewBBs); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); @@ -1637,7 +2247,7 @@ void IROutliner::pruneIncompatibleRegions( if (FirstCandidate.getLength() == 2) { if (isa<CallInst>(FirstCandidate.front()->Inst) && isa<BranchInst>(FirstCandidate.back()->Inst)) - return; + return; } unsigned CurrentEndIdx = 0; @@ -1706,6 +2316,34 @@ IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) { return RegionBenefit; } +/// For the \p OutputCanon number passed in find the value represented by this +/// canonical number. If it is from a PHINode, we pick the first incoming +/// value and return that Value instead. +/// +/// \param Region - The OutlinableRegion to get the Value from. +/// \param OutputCanon - The canonical number to find the Value from. +/// \returns The Value represented by a canonical number \p OutputCanon in \p +/// Region. +static Value *findOutputValueInRegion(OutlinableRegion &Region, + unsigned OutputCanon) { + OutlinableGroup &CurrentGroup = *Region.Parent; + // If the value is greater than the value in the tracker, we have a + // PHINode and will instead use one of the incoming values to find the + // type. + if (OutputCanon > CurrentGroup.PHINodeGVNTracker) { + auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon); + assert(It != CurrentGroup.PHINodeGVNToGVNs.end() && + "Could not find GVN set for PHINode number!"); + assert(It->second.second.size() > 0 && "PHINode does not have any values!"); + OutputCanon = *It->second.second.begin(); + } + Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon); + assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?"); + Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN); + assert(OV.hasValue() && "Could not find value for GVN?"); + return *OV; +} + InstructionCost IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { InstructionCost OverallCost = 0; @@ -1713,10 +2351,8 @@ IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent()); // Each output incurs a load after the call, so we add that to the cost. 
- for (unsigned OutputGVN : Region->GVNStores) { - Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : Region->GVNStores) { + Value *V = findOutputValueInRegion(*Region, OutputCanon); InstructionCost LoadCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1745,6 +2381,7 @@ static InstructionCost findCostForOutputBlocks(Module &M, InstructionCost OutputCost = 0; unsigned NumOutputBranches = 0; + OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0]; IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; DenseSet<BasicBlock *> CandidateBlocks; Candidate.getBasicBlocks(CandidateBlocks); @@ -1770,10 +2407,8 @@ static InstructionCost findCostForOutputBlocks(Module &M, for (const ArrayRef<unsigned> &OutputUse : CurrentGroup.OutputGVNCombinations) { - for (unsigned GVN : OutputUse) { - Optional<Value *> OV = Candidate.fromGVN(GVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : OutputUse) { + Value *V = findOutputValueInRegion(FirstRegion, OutputCanon); InstructionCost StoreCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1974,6 +2609,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. InstructionClassifier.EnableBranches = !DisableBranches; + InstructionClassifier.EnableIndirectCalls = !DisableIndirectCalls; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -2033,8 +2669,8 @@ unsigned IROutliner::doOutline(Module &M) { continue; SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); @@ -2144,8 +2780,8 @@ unsigned IROutliner::doOutline(Module &M) { OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 4e3689f09536..49babc24cb82 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -92,6 +92,11 @@ static cl::opt<bool> DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +/// A flag for test, so we can print the content of the advisor when running it +/// as part of the default (e.g. -O3) pipeline. 
+static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing", + cl::init(false), cl::Hidden); + extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats; static cl::opt<std::string> CGSCCInlineReplayFile( @@ -660,7 +665,7 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG, } if (!DeadFunctionsInComdats.empty()) { // Filter out the functions whose comdats remain alive. - filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats); + filterDeadComdatFunctions(DeadFunctionsInComdats); // Remove the rest. for (Function *F : DeadFunctionsInComdats) RemoveCGN(CG[F]); @@ -741,7 +746,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M); Advisor.onPassEntry(); - auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); }); + auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(&InitialC); }); // We use a single common worklist for calls across the entire SCC. We // process these in-order and append new calls introduced during inlining to @@ -823,6 +828,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // defer deleting these to make it easier to handle the call graph updates. SmallVector<Function *, 4> DeadFunctions; + // Track potentially dead non-local functions with comdats to see if they can + // be deleted as a batch after inlining. + SmallVector<Function *, 4> DeadFunctionsInComdats; + // Loop forward over all of the calls. while (!Calls->empty()) { // We expect the calls to typically be batched with sequences of calls that @@ -935,16 +944,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // Merge the attributes based on the inlining. AttributeFuncs::mergeAttributesForInlining(F, Callee); - // For local functions, check whether this makes the callee trivially - // dead. In that case, we can drop the body of the function eagerly - // which may reduce the number of callers of other functions to one, - // changing inline cost thresholds. + // For local functions or discardable functions without comdats, check + // whether this makes the callee trivially dead. In that case, we can drop + // the body of the function eagerly which may reduce the number of callers + // of other functions to one, changing inline cost thresholds. Non-local + // discardable functions with comdats are checked later on. bool CalleeWasDeleted = false; - if (Callee.hasLocalLinkage()) { - // To check this we also need to nuke any dead constant uses (perhaps - // made dead by this operation on other functions). - Callee.removeDeadConstantUsers(); - if (Callee.use_empty() && !CG.isLibFunction(Callee)) { + if (Callee.isDiscardableIfUnused() && Callee.hasZeroLiveUses() && + !CG.isLibFunction(Callee)) { + if (Callee.hasLocalLinkage() || !Callee.hasComdat()) { Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { return Call.first->getCaller() == &Callee; }); @@ -957,6 +965,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, "Cannot put cause a function to become dead twice!"); DeadFunctions.push_back(&Callee); CalleeWasDeleted = true; + } else { + DeadFunctionsInComdats.push_back(&Callee); } } if (CalleeWasDeleted) @@ -1019,6 +1029,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, FAM.invalidate(F, PreservedAnalyses::none()); } + // We must ensure that we only delete functions with comdats if every function + // in the comdat is going to be deleted. 
+ if (!DeadFunctionsInComdats.empty()) { + filterDeadComdatFunctions(DeadFunctionsInComdats); + for (auto *Callee : DeadFunctionsInComdats) + Callee->dropAllReferences(); + DeadFunctions.append(DeadFunctionsInComdats); + } + // Now that we've finished inlining all of the calls across this SCC, delete // all of the trivially dead functions, updating the call graph and the CGSCC // pass manager in the process. @@ -1045,14 +1064,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, UR.UpdatedC = nullptr; // And delete the actual function from the module. - // The Advisor may use Function pointers to efficiently index various - // internal maps, e.g. for memoization. Function cleanup passes like - // argument promotion create new functions. It is possible for a new - // function to be allocated at the address of a deleted function. We could - // index using names, but that's inefficient. Alternatively, we let the - // Advisor free the functions when it sees fit. - DeadF->getBasicBlockList().clear(); - M.getFunctionList().remove(DeadF); + M.getFunctionList().erase(DeadF); ++NumDeleted; } @@ -1073,8 +1085,7 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params, bool MandatoryFirst, InliningAdvisorMode Mode, unsigned MaxDevirtIterations) - : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations), - PM(), MPM() { + : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations) { // Run the inliner first. The theory is that we are walking bottom-up and so // the callees have already been fully optimized, and we want to inline them // into the callers so that our optimizations can reflect that. @@ -1118,7 +1129,8 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, // Discard the InlineAdvisor, a subsequent inlining session should construct // its own. auto PA = PreservedAnalyses::all(); - PA.abandon<InlineAdvisorAnalysis>(); + if (!KeepAdvisorForPrinting) + PA.abandon<InlineAdvisorAnalysis>(); return PA; } diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp index ebf080e87c3b..d515303e4911 100644 --- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp +++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -335,14 +335,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, FAM.clear(*DeadF, DeadF->getName()); // And delete the actual function from the module. - // The Advisor may use Function pointers to efficiently index various - // internal maps, e.g. for memoization. Function cleanup passes like - // argument promotion create new functions. It is possible for a new - // function to be allocated at the address of a deleted function. We could - // index using names, but that's inefficient. Alternatively, we let the - // Advisor free the functions when it sees fit. 
- DeadF->getBasicBlockList().clear(); - M.getFunctionList().remove(DeadF); + M.getFunctionList().erase(DeadF); ++NumDeleted; } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f289e3ecc979..68f33410c602 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -153,14 +154,6 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]"; namespace { -enum class AddressSpace : unsigned { - Generic = 0, - Global = 1, - Shared = 3, - Constant = 4, - Local = 5, -}; - struct AAHeapToShared; struct AAICVTracker; @@ -170,7 +163,7 @@ struct AAICVTracker; struct OMPInformationCache : public InformationCache { OMPInformationCache(Module &M, AnalysisGetter &AG, BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, - SmallPtrSetImpl<Kernel> &Kernels) + KernelSet &Kernels) : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), Kernels(Kernels) { @@ -424,6 +417,12 @@ struct OMPInformationCache : public InformationCache { recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); } + // Helper function to inherit the calling convention of the function callee. + void setCallingConvention(FunctionCallee Callee, CallInst *CI) { + if (Function *Fn = dyn_cast<Function>(Callee.getCallee())) + CI->setCallingConv(Fn->getCallingConv()); + } + /// Helper to initialize all runtime function information for those defined /// in OpenMPKinds.def. void initializeRuntimeFunctions() { @@ -485,7 +484,7 @@ struct OMPInformationCache : public InformationCache { } /// Collection of known kernels (\see Kernel) in the module. - SmallPtrSetImpl<Kernel> &Kernels; + KernelSet &Kernels; /// Collection of known OpenMP runtime functions.. DenseSet<const Function *> RTLFunctions; @@ -1013,7 +1012,8 @@ private: // into a single parallel region is contained in a single basic block // without any other instructions. We use the OpenMPIRBuilder to outline // that block and call the resulting function via __kmpc_fork_call. - auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { + auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs, + BasicBlock *BB) { // TODO: Change the interface to allow single CIs expanded, e.g, to // include an outer loop. assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); @@ -1075,8 +1075,7 @@ private: BranchInst::Create(AfterBB, AfterIP.getBlock()); // Perform the actual outlining. - OMPInfoCache.OMPBuilder.finalize(OriginalFn, - /* AllowExtractorSinking */ true); + OMPInfoCache.OMPBuilder.finalize(OriginalFn); Function *OutlinedFn = MergableCIs.front()->getCaller(); @@ -1538,6 +1537,7 @@ private: CallInst *IssueCallsite = CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); + OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite); RuntimeCall.eraseFromParent(); // Add "wait" runtime call declaration: @@ -1550,7 +1550,9 @@ private: OffloadArray::DeviceIDArgNum), // device_id. Handle // handle to wait on. 
}; - CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); + CallInst *WaitCallsite = CallInst::Create( + WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); + OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); return true; } @@ -1597,8 +1599,10 @@ private: &F.getEntryBlock(), F.getEntryBlock().begin())); // Create a fallback location if non was found. // TODO: Use the debug locations of the calls instead. - Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); - Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); + uint32_t SrcLocStrSize; + Constant *Loc = + OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize); } return Ident; } @@ -2171,7 +2175,7 @@ struct AAICVTrackerFunction : public AAICVTracker { }; auto CallCheck = [&](Instruction &I) { - Optional<Value *> ReplVal = getValueForCall(A, &I, ICV); + Optional<Value *> ReplVal = getValueForCall(A, I, ICV); if (ReplVal.hasValue() && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) HasChanged = ChangeStatus::CHANGED; @@ -2197,12 +2201,12 @@ struct AAICVTrackerFunction : public AAICVTracker { return HasChanged; } - /// Hepler to check if \p I is a call and get the value for it if it is + /// Helper to check if \p I is a call and get the value for it if it is /// unique. - Optional<Value *> getValueForCall(Attributor &A, const Instruction *I, + Optional<Value *> getValueForCall(Attributor &A, const Instruction &I, InternalControlVar &ICV) const { - const auto *CB = dyn_cast<CallBase>(I); + const auto *CB = dyn_cast<CallBase>(&I); if (!CB || CB->hasFnAttr("no_openmp") || CB->hasFnAttr("no_openmp_routines")) return None; @@ -2218,8 +2222,8 @@ struct AAICVTrackerFunction : public AAICVTracker { if (CalledFunction == GetterRFI.Declaration) return None; if (CalledFunction == SetterRFI.Declaration) { - if (ICVReplacementValuesMap[ICV].count(I)) - return ICVReplacementValuesMap[ICV].lookup(I); + if (ICVReplacementValuesMap[ICV].count(&I)) + return ICVReplacementValuesMap[ICV].lookup(&I); return nullptr; } @@ -2231,8 +2235,11 @@ struct AAICVTrackerFunction : public AAICVTracker { const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); - if (ICVTrackingAA.isAssumedTracked()) - return ICVTrackingAA.getUniqueReplacementValue(ICV); + if (ICVTrackingAA.isAssumedTracked()) { + Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); + if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache))) + return URV; + } // If we don't know, assume it changes. return nullptr; @@ -2284,7 +2291,7 @@ struct AAICVTrackerFunction : public AAICVTracker { break; } - Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV); + Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV); if (!NewReplVal.hasValue()) continue; @@ -2548,7 +2555,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { } /// Set of basic blocks that are executed by a single thread. - DenseSet<const BasicBlock *> SingleThreadedBBs; + SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs; /// Total number of basic blocks in this function. 
long unsigned NumBBs; @@ -2572,7 +2579,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { if (!A.checkForAllCallSites(PredForCallSite, *this, /* RequiresAllCallSites */ true, AllCallSitesKnown)) - SingleThreadedBBs.erase(&F->getEntryBlock()); + SingleThreadedBBs.remove(&F->getEntryBlock()); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; @@ -2637,7 +2644,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { for (auto *BB : RPOT) { if (!MergePredecessorStates(BB)) - SingleThreadedBBs.erase(BB); + SingleThreadedBBs.remove(BB); } return (NumSingleThreadedBBs == SingleThreadedBBs.size()) @@ -2759,7 +2766,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { if (FreeCalls.size() != 1) continue; - ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); + auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0)); LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() @@ -2772,7 +2779,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); auto *SharedMem = new GlobalVariable( *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, - UndefValue::get(Int8ArrTy), CB->getName(), nullptr, + UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, GlobalValue::NotThreadLocal, static_cast<unsigned>(AddressSpace::Shared)); auto *NewBuffer = @@ -2786,7 +2793,10 @@ struct AAHeapToSharedFunction : public AAHeapToShared { }; A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark); - SharedMem->setAlignment(MaybeAlign(32)); + MaybeAlign Alignment = CB->getRetAlign(); + assert(Alignment && + "HeapToShared on allocation without alignment attribute"); + SharedMem->setAlignment(MaybeAlign(Alignment)); A.changeValueAfterManifest(*CB, *NewBuffer); A.deleteAfterManifest(*CB); @@ -2813,7 +2823,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { if (CallBase *CB = dyn_cast<CallBase>(U)) if (!isa<ConstantInt>(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) - MallocCalls.erase(CB); + MallocCalls.remove(CB); } findPotentialRemovedFreeCalls(A); @@ -2825,7 +2835,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { } /// Collection of all malloc calls in a function. - SmallPtrSet<CallBase *, 4> MallocCalls; + SmallSetVector<CallBase *, 4> MallocCalls; /// Collection of potentially removed free calls in a function. 
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls; }; @@ -2962,7 +2972,7 @@ struct AAKernelInfoFunction : AAKernelInfo { A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); UsedAssumedInformation = !isAtFixpoint(); auto *FalseVal = - ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0); + ConstantInt::getBool(IRP.getAnchorValue().getContext(), false); return FalseVal; }; @@ -3225,8 +3235,11 @@ struct AAKernelInfoFunction : AAKernelInfo { OpenMPIRBuilder::LocationDescription Loc( InsertPointTy(ParentBB, ParentBB->end()), DL); OMPInfoCache.OMPBuilder.updateToLocation(Loc); - auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + auto *SrcLocStr = + OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = + OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL); // Add check for Tid in RegionCheckTidBB @@ -3237,8 +3250,10 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee HardwareTidFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_hardware_thread_id_in_block); - Value *Tid = + CallInst *Tid = OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {}); + Tid->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(HardwareTidFn, Tid); Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid); OMPInfoCache.OMPBuilder.Builder .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB) @@ -3251,14 +3266,18 @@ struct AAKernelInfoFunction : AAKernelInfo { M, OMPRTL___kmpc_barrier_simple_spmd); OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy( RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt())); - OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}) - ->setDebugLoc(DL); + CallInst *Barrier = + OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}); + Barrier->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); // Second barrier ensures workers have read broadcast values. - if (HasBroadcastValues) - CallInst::Create(BarrierFn, {Ident, Tid}, "", - RegionBarrierBB->getTerminator()) - ->setDebugLoc(DL); + if (HasBroadcastValues) { + CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "", + RegionBarrierBB->getTerminator()); + Barrier->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); + } }; auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; @@ -3352,17 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo { OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); A.changeUseAfterManifest( KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); A.changeUseAfterManifest( KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); ++NumOpenMPTargetRegionKernelsSPMD; @@ -3403,7 +3422,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // If not SPMD mode, indicate we use a custom state machine now. 
auto &Ctx = getAnchorValue().getContext(); - auto *FalseVal = ConstantInt::getBool(Ctx, 0); + auto *FalseVal = ConstantInt::getBool(Ctx, false); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); @@ -3528,10 +3547,12 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee WarpSizeFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_warp_size); - Instruction *BlockHwSize = + CallInst *BlockHwSize = CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize); BlockHwSize->setDebugLoc(DLoc); - Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize); WarpSize->setDebugLoc(DLoc); Instruction *BlockSize = BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); @@ -3571,8 +3592,10 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee BarrierFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_barrier_simple_generic); - CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) - ->setDebugLoc(DLoc); + CallInst *Barrier = + CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); + Barrier->setDebugLoc(DLoc); if (WorkFnAI->getType()->getPointerAddressSpace() != (unsigned int)AddressSpace::Generic) { @@ -3588,8 +3611,9 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee KernelParallelFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_kernel_parallel); - Instruction *IsActiveWorker = CallInst::Create( + CallInst *IsActiveWorker = CallInst::Create( KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB); + OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker); IsActiveWorker->setDebugLoc(DLoc); Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn", StateMachineBeginBB); @@ -3669,10 +3693,13 @@ struct AAKernelInfoFunction : AAKernelInfo { StateMachineIfCascadeCurrentBB) ->setDebugLoc(DLoc); - CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___kmpc_kernel_end_parallel), - {}, "", StateMachineEndParallelBB) - ->setDebugLoc(DLoc); + FunctionCallee EndParallelFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_kernel_end_parallel); + CallInst *EndParallel = + CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB); + OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel); + EndParallel->setDebugLoc(DLoc); BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB) ->setDebugLoc(DLoc); @@ -4508,6 +4535,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { bool UsedAssumedInformation = false; A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr, UsedAssumedInformation); + } else if (auto *SI = dyn_cast<StoreInst>(&I)) { + A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI)); } } } diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 2d717475ce7f..5f2223e4047e 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -169,8 +169,7 @@ struct FunctionOutliningInfo { }; struct FunctionOutliningMultiRegionInfo { - FunctionOutliningMultiRegionInfo() - : ORI() {} + FunctionOutliningMultiRegionInfo() {} // Container for 
outline regions struct OutlineRegionInfo { @@ -971,6 +970,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( }; for (User *User : Users) { + // Don't bother with BlockAddress used by CallBr for asm goto. + if (isa<BlockAddress>(User)) + continue; CallBase *CB = getSupportedCallBase(User); Function *Caller = CB->getCaller(); if (CurrentCaller != Caller) { @@ -1414,6 +1416,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { bool AnyInline = false; for (User *User : Users) { + // Don't bother with BlockAddress used by CallBr for asm goto. + if (isa<BlockAddress>(User)) + continue; + CallBase *CB = getSupportedCallBase(User); if (isLimitReached()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index eb1b8a29cfc5..0598f751febe 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -519,13 +519,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { unsigned NextTmpIdx = 0; FAddend TmpResult[3]; - // Points to the constant addend of the resulting simplified expression. - // If the resulting expr has constant-addend, this constant-addend is - // desirable to reside at the top of the resulting expression tree. Placing - // constant close to supper-expr(s) will potentially reveal some optimization - // opportunities in super-expr(s). - const FAddend *ConstAdd = nullptr; - // Simplified addends are placed <SimpVect>. AddendVect SimpVect; @@ -541,6 +534,14 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { } Value *Val = ThisAddend->getSymVal(); + + // If the resulting expr has constant-addend, this constant-addend is + // desirable to reside at the top of the resulting expression tree. Placing + // constant close to super-expr(s) will potentially reveal some + // optimization opportunities in super-expr(s). Here we do not implement + // this logic intentionally and rely on SimplifyAssociativeOrCommutative + // call later. + unsigned StartIdx = SimpVect.size(); SimpVect.push_back(ThisAddend); @@ -569,14 +570,8 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // Pop all addends being folded and push the resulting folded addend. SimpVect.resize(StartIdx); - if (Val) { - if (!R.isZero()) { - SimpVect.push_back(&R); - } - } else { - // Don't push constant addend at this time. It will be the last element - // of <SimpVect>. 
- ConstAdd = &R; + if (!R.isZero()) { + SimpVect.push_back(&R); } } } @@ -584,9 +579,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) && "out-of-bound access"); - if (ConstAdd) - SimpVect.push_back(ConstAdd); - Value *Result; if (!SimpVect.empty()) Result = createNaryFAdd(SimpVect, InstrQuota); @@ -1296,6 +1288,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // (A*B)+(A*C) -> A*(B+C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); @@ -1498,15 +1493,18 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I, return Lerp; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (!Op0->hasOneUse() || !Op1->hasOneUse()) + return nullptr; + Value *X, *Y, *Z; bool IsFMul; - if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) && - match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) || - (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) && - match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z)))))) + if ((match(Op0, m_FMul(m_Value(X), m_Value(Z))) && + match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))) || + (match(Op0, m_FMul(m_Value(Z), m_Value(X))) && + match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z))))) IsFMul = true; - else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) && - match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z))))) + else if (match(Op0, m_FDiv(m_Value(X), m_Value(Z))) && + match(Op1, m_FDiv(m_Value(Y), m_Specific(Z)))) IsFMul = false; else return nullptr; @@ -1541,6 +1539,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; @@ -1654,6 +1655,14 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { {X->getType()}, {NewStartC, X}, &I)); } + // (X * MulC) + X --> X * (MulC + 1.0) + Constant *MulC; + if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)), + m_Deferred(X)))) { + MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0)); + return BinaryOperator::CreateFMulFMF(X, MulC, &I); + } + if (Value *V = FAddCombine(Builder).simplify(&I)) return replaceInstUsesWith(I, V); } @@ -1748,6 +1757,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // If this is a 'B = x-(-A)', change to B = x+A. @@ -2310,6 +2322,9 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // Subtraction from -0.0 is the canonical form of fneg. // fsub -0.0, X ==> fneg X // fsub nsz 0.0, X ==> fneg nsz X diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index de1034c910d5..6bbb0251f2bc 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1727,25 +1727,37 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, (Opcode == Instruction::And) ? 
Instruction::Or : Instruction::And; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - Value *A, *B, *C, *X, *Y; + Value *A, *B, *C, *X, *Y, *Dummy; + + // Match following expressions: + // (~(A | B) & C) + // (~(A & B) | C) + // Captures X = ~(A | B) or ~(A & B) + const auto matchNotOrAnd = + [Opcode, FlippedOpcode](Value *Op, auto m_A, auto m_B, auto m_C, + Value *&X, bool CountUses = false) -> bool { + if (CountUses && !Op->hasOneUse()) + return false; + + if (match(Op, m_c_BinOp(FlippedOpcode, + m_CombineAnd(m_Value(X), + m_Not(m_c_BinOp(Opcode, m_A, m_B))), + m_C))) + return !CountUses || X->hasOneUse(); + + return false; + }; // (~(A | B) & C) | ... --> ... // (~(A & B) | C) & ... --> ... // TODO: One use checks are conservative. We just need to check that a total // number of multiple used values does not exceed reduction // in operations. - if (match(Op0, - m_c_BinOp(FlippedOpcode, - m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A), - m_Value(B)))), - m_Value(C)))) { + if (matchNotOrAnd(Op0, m_Value(A), m_Value(B), m_Value(C), X)) { // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A) - if (match(Op1, - m_OneUse(m_c_BinOp(FlippedOpcode, - m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(A), - m_Specific(C)))), - m_Specific(B))))) { + if (matchNotOrAnd(Op1, m_Specific(A), m_Specific(C), m_Specific(B), Dummy, + true)) { Value *Xor = Builder.CreateXor(B, C); return (Opcode == Instruction::Or) ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(A)) @@ -1754,11 +1766,8 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, // (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B // (~(A & B) | C) & (~(B & C) | A) --> ~((A ^ C) & B) - if (match(Op1, - m_OneUse(m_c_BinOp(FlippedOpcode, - m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(B), - m_Specific(C)))), - m_Specific(A))))) { + if (matchNotOrAnd(Op1, m_Specific(B), m_Specific(C), m_Specific(A), Dummy, + true)) { Value *Xor = Builder.CreateXor(A, C); return (Opcode == Instruction::Or) ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(B)) @@ -1863,6 +1872,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. 
if (SimplifyDemandedInstructionBits(I)) @@ -2072,21 +2084,37 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); - // (A | B) & ((~A) ^ B) -> (A & B) - // (A | B) & (B ^ (~A)) -> (A & B) - // (B | A) & ((~A) ^ B) -> (A & B) - // (B | A) & (B ^ (~A)) -> (A & B) + // (A | B) & (~A ^ B) -> A & B + // (A | B) & (B ^ ~A) -> A & B + // (B | A) & (~A ^ B) -> A & B + // (B | A) & (B ^ ~A) -> A & B if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); - // ((~A) ^ B) & (A | B) -> (A & B) - // ((~A) ^ B) & (B | A) -> (A & B) - // (B ^ (~A)) & (A | B) -> (A & B) - // (B ^ (~A)) & (B | A) -> (A & B) + // (~A ^ B) & (A | B) -> A & B + // (~A ^ B) & (B | A) -> A & B + // (B ^ ~A) & (A | B) -> A & B + // (B ^ ~A) & (B | A) -> A & B if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); + + // (~A | B) & (A ^ B) -> ~A & B + // (~A | B) & (B ^ A) -> ~A & B + // (B | ~A) & (A ^ B) -> ~A & B + // (B | ~A) & (B ^ A) -> ~A & B + if (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_Xor(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); + + // (A ^ B) & (~A | B) -> ~A & B + // (B ^ A) & (~A | B) -> ~A & B + // (A ^ B) & (B | ~A) -> ~A & B + // (B ^ A) & (B | ~A) -> ~A & B + if (match(Op1, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op0, m_c_Xor(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); } { @@ -2640,6 +2668,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. 
if (SimplifyDemandedInstructionBits(I)) @@ -3528,6 +3559,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *NewXor = foldXorToXor(I, Builder)) return NewXor; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 14427bd1f2f4..1fb46af46bee 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -352,9 +352,27 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { // * Dereferenceable address & few lanes -> scalarize speculative load/selects // * Adjacent vector addresses -> masked.load // * Narrow width by halfs excluding zero/undef lanes -// * Vector splat address w/known mask -> scalar load // * Vector incrementing address -> vector masked load Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); + if (!ConstMask) + return nullptr; + + // Vector splat address w/known mask -> scalar load + // Fold the gather to load the source vector first lane + // because it is reloading the same value each time + if (ConstMask->isAllOnesValue()) + if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) { + auto *VecTy = cast<VectorType>(II.getType()); + const Align Alignment = + cast<ConstantInt>(II.getArgOperand(1))->getAlignValue(); + LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr, + Alignment, "load.scalar"); + Value *Shuf = + Builder.CreateVectorSplat(VecTy->getElementCount(), L, "broadcast"); + return replaceInstUsesWith(II, cast<Instruction>(Shuf)); + } + return nullptr; } @@ -362,7 +380,6 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { // * Single constant active lane -> store // * Adjacent vector addresses -> masked.store // * Narrow store width by halfs excluding zero/undef lanes -// * Vector splat address w/known mask -> scalar store // * Vector incrementing address -> vector masked store Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); @@ -373,6 +390,34 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { if (ConstMask->isNullValue()) return eraseInstFromFunction(II); + // Vector splat address -> scalar store + if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) { + // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr + if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { + Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + StoreInst *S = + new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment); + S->copyMetadata(II); + return S; + } + // scatter(vector, splat(ptr), splat(true)) -> store extract(vector, + // lastlane), ptr + if (ConstMask->isAllOnesValue()) { + Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType()); + ElementCount VF = WideLoadTy->getElementCount(); + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *RunTimeVF = VF.isScalable() ? 
Builder.CreateVScale(EC) : EC; + Value *LastLane = Builder.CreateSub(RunTimeVF, Builder.getInt32(1)); + Value *Extract = + Builder.CreateExtractElement(II.getArgOperand(0), LastLane); + StoreInst *S = + new StoreInst(Extract, SplatPtr, /*IsVolatile=*/false, Alignment); + S->copyMetadata(II); + return S; + } + } if (isa<ScalableVectorType>(ConstMask->getType())) return nullptr; @@ -449,7 +494,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // ctlz/cttz i1 Op0 --> not Op0 if (match(Op1, m_Zero())) return BinaryOperator::CreateNot(Op0); - // If zero is undef, then the input can be assumed to be "true", so the + // If zero is poison, then the input can be assumed to be "true", so the // instruction simplifies to "false". assert(match(Op1, m_One()) && "Expected ctlz/cttz operand to be 0 or 1"); return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType())); @@ -474,7 +519,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } // Zext doesn't change the number of trailing zeros, so narrow: - // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsUndef' parameter is 'true'. + // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsPoison' parameter is 'true'. if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && match(Op1, m_One())) { auto *Cttz = IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, X, IC.Builder.getTrue()); @@ -511,7 +556,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } // If the input to cttz/ctlz is known to be non-zero, - // then change the 'ZeroIsUndef' parameter to 'true' + // then change the 'ZeroIsPoison' parameter to 'true' // because we know the zero behavior can't affect the result. if (!Known.One.isZero() || isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, @@ -1188,6 +1233,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Value *IIOperand = II->getArgOperand(0); Value *X = nullptr; + KnownBits Known = computeKnownBits(IIOperand, 0, II); + uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8); + uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8); + + // bswap(x) -> shift(x) if x has exactly one "active byte" + if (Known.getBitWidth() - LZ - TZ == 8) { + assert(LZ != TZ && "active byte cannot be in the middle"); + if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x + return BinaryOperator::CreateNUWShl( + IIOperand, ConstantInt::get(IIOperand->getType(), LZ - TZ)); + // -> lshr(x) if the "active byte" is in the high part of x + return BinaryOperator::CreateExactLShr( + IIOperand, ConstantInt::get(IIOperand->getType(), TZ - LZ)); + } + // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { unsigned C = X->getType()->getScalarSizeInBits() - @@ -2460,7 +2520,7 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call, if (!Call.isByValArgument(ix)) return false; - Type *SrcElemTy = SrcTy->getElementType(); + Type *SrcElemTy = SrcTy->getNonOpaquePointerElementType(); Type *DstElemTy = Call.getParamByValType(ix); if (!SrcElemTy->isSized() || !DstElemTy->isSized()) return false; @@ -2571,57 +2631,36 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { } void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { - unsigned NumArgs = Call.arg_size(); - ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); - ConstantInt *Op1C = - (NumArgs == 1) ? 
nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); - // Bail out if the allocation size is zero (or an invalid alignment of zero - // with aligned_alloc). - if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) - return; - - if (isMallocLikeFn(&Call, TLI) && Op0C) { - if (isOpNewLikeFn(&Call, TLI)) + // Note: We only handle cases which can't be driven from generic attributes + // here. So, for example, nonnull and noalias (which are common properties + // of some allocation functions) are expected to be handled via annotation + // of the respective allocator declaration with generic attributes. + + uint64_t Size; + ObjectSizeOpts Opts; + if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) { + // TODO: We really should just emit deref_or_null here and then + // let the generic inference code combine that with nonnull. + if (Call.hasRetAttr(Attribute::NonNull)) Call.addRetAttr(Attribute::getWithDereferenceableBytes( - Call.getContext(), Op0C->getZExtValue())); + Call.getContext(), Size)); else Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op0C->getZExtValue())); - } else if (isAlignedAllocLikeFn(&Call, TLI)) { - if (Op1C) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op1C->getZExtValue())); - // Add alignment attribute if alignment is a power of two constant. - if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment) && - isKnownNonZero(Call.getOperand(1), DL, 0, &AC, &Call, &DT)) { - uint64_t AlignmentVal = Op0C->getZExtValue(); - if (llvm::isPowerOf2_64(AlignmentVal)) { - Call.removeRetAttr(Attribute::Alignment); - Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(), - Align(AlignmentVal))); - } - } - } else if (isReallocLikeFn(&Call, TLI) && Op1C) { - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op1C->getZExtValue())); - } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { - bool Overflow; - const APInt &N = Op0C->getValue(); - APInt Size = N.umul_ov(Op1C->getValue(), Overflow); - if (!Overflow) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Size.getZExtValue())); - } else if (isStrdupLikeFn(&Call, TLI)) { - uint64_t Len = GetStringLength(Call.getOperand(0)); - if (Len) { - // strdup - if (NumArgs == 1) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Len)); - // strndup - else if (NumArgs == 2 && Op1C) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); + Call.getContext(), Size)); + } + + // Add alignment attribute if alignment is a power of two constant. 
+ Value *Alignment = getAllocAlignment(&Call, TLI); + if (!Alignment) + return; + + ConstantInt *AlignOpC = dyn_cast<ConstantInt>(Alignment); + if (AlignOpC && AlignOpC->getValue().ult(llvm::Value::MaximumAlignment)) { + uint64_t AlignmentVal = AlignOpC->getZExtValue(); + if (llvm::isPowerOf2_64(AlignmentVal)) { + Call.removeRetAttr(Attribute::Alignment); + Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(), + Align(AlignmentVal))); } } } @@ -2744,9 +2783,9 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { PointerType *NewTy = cast<PointerType>(CI->getOperand(0)->getType()); if (!NewTy->isOpaque() && Call.isByValArgument(ix)) { Call.removeParamAttr(ix, Attribute::ByVal); - Call.addParamAttr( - ix, Attribute::getWithByValType( - Call.getContext(), NewTy->getElementType())); + Call.addParamAttr(ix, Attribute::getWithByValType( + Call.getContext(), + NewTy->getNonOpaquePointerElementType())); } Changed = true; } @@ -2782,7 +2821,8 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy)); } - if (isAllocLikeFn(&Call, &TLI)) + if (isAllocationFn(&Call, &TLI) && + isAllocRemovable(&cast<CallBase>(Call), &TLI)) return visitAllocSite(Call); // Handle intrinsics which can be used in both call and invoke context. @@ -2934,7 +2974,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { } if (!CallerPAL.isEmpty() && !Caller->use_empty()) { - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) return false; // Attribute not compatible with transformed value. } @@ -2980,7 +3020,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) return false; // Cannot transform this parameter value. - if (AttrBuilder(CallerPAL.getParamAttrs(i)) + if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i)) .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. @@ -2994,12 +3034,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // sized type and the sized type has to have the same size as the old type. if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); - if (!ParamPTy || !ParamPTy->getElementType()->isSized()) + if (!ParamPTy || !ParamPTy->getPointerElementType()->isSized()) return false; Type *CurElTy = Call.getParamByValType(i); if (DL.getTypeAllocSize(CurElTy) != - DL.getTypeAllocSize(ParamPTy->getElementType())) + DL.getTypeAllocSize(ParamPTy->getPointerElementType())) return false; } } @@ -3012,17 +3052,14 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // If the callee is just a declaration, don't change the varargsness of the // call. We don't want to introduce a varargs call where one doesn't // already exist. - PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType()); - if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) + if (FT->isVarArg() != Call.getFunctionType()->isVarArg()) return false; // If both the callee and the cast type are varargs, we still have to make // sure the number of fixed parameters are the same or we have the same // ABI issues as if we introduce a varargs call. 
- if (FT->isVarArg() && - cast<FunctionType>(APTy->getElementType())->isVarArg() && - FT->getNumParams() != - cast<FunctionType>(APTy->getElementType())->getNumParams()) + if (FT->isVarArg() && Call.getFunctionType()->isVarArg() && + FT->getNumParams() != Call.getFunctionType()->getNumParams()) return false; } @@ -3045,7 +3082,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { ArgAttrs.reserve(NumActualArgs); // Get any return attributes. - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. @@ -3063,7 +3100,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Add any parameter attributes. if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) { - AttrBuilder AB(CallerPAL.getParamAttrs(i)); + AttrBuilder AB(FT->getContext(), CallerPAL.getParamAttrs(i)); AB.addByValAttr(NewArg->getType()->getPointerElementType()); ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); } else diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 8df4a4529f47..f11ba8772f3c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -85,13 +85,16 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); + // Opaque pointers don't have an element type we could replace with. + if (PTy->isOpaque()) + return nullptr; IRBuilderBase::InsertPointGuard Guard(Builder); Builder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); - Type *CastElTy = PTy->getElementType(); + Type *CastElTy = PTy->getNonOpaquePointerElementType(); if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; // This optimisation does not work for cases where the cast type @@ -2649,8 +2652,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder, if (SrcPTy->isOpaque() || DstPTy->isOpaque()) return nullptr; - Type *DstElTy = DstPTy->getElementType(); - Type *SrcElTy = SrcPTy->getElementType(); + Type *DstElTy = DstPTy->getNonOpaquePointerElementType(); + Type *SrcElTy = SrcPTy->getNonOpaquePointerElementType(); // When the type pointed to is not sized the cast cannot be // turned into a gep. @@ -2669,8 +2672,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder, // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0)); - GetElementPtrInst *GEP = - GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + SrcPTy->getNonOpaquePointerElementType(), Src, Idxs); // If the source pointer is dereferenceable, then assume it points to an // allocated object and apply "inbounds" to the GEP. 
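
// --- Illustrative sketch, not part of the patch ---
// Why convertBitCastToGEP() may replace a bitcast with an all-zero-index GEP:
// if the destination pointee is reachable by descending through leading
// elements of the source pointee, stepping into the first member never moves
// the address. A plain C++ analogue:
#include <cassert>

struct Inner { int First; int Rest[3]; };
struct Outer { Inner In; double Tail; };

int main() {
  Outer O{};
  // bitcast %Outer* to i32*  ==  gep %Outer, 0, 0, 0
  assert(static_cast<void *>(&O) == static_cast<void *>(&O.In.First));
  return 0;
}
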
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index ed53b88aed61..fd58a44504b3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -503,7 +503,7 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC, /// Returns true if we can rewrite Start as a GEP with pointer Base /// and some integer offset. The nodes that need to be re-written /// for this transformation will be added to Explored. -static bool canRewriteGEPAsOffset(Value *Start, Value *Base, +static bool canRewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector<Value *> &Explored) { SmallVector<Value *, 16> WorkList(1, Start); @@ -551,7 +551,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base, // the original pointer type. We could handle more cases in the // future. if (GEP->getNumIndices() != 1 || !GEP->isInBounds() || - GEP->getType() != Start->getType()) + GEP->getSourceElementType() != ElemTy) return false; if (!Explored.contains(GEP->getOperand(0))) @@ -627,7 +627,7 @@ static void setInsertionPoint(IRBuilder<> &Builder, Value *V, /// Returns a re-written value of Start as an indexed GEP using Base as a /// pointer. -static Value *rewriteGEPAsOffset(Value *Start, Value *Base, +static Value *rewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector<Value *> &Explored) { // Perform all the substitutions. This is a bit tricky because we can @@ -714,6 +714,8 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, } } + PointerType *PtrTy = + ElemTy->getPointerTo(Start->getType()->getPointerAddressSpace()); for (Value *Val : Explored) { if (Val == Base) continue; @@ -722,22 +724,14 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, // a GEP or a GEP + ptrtoint. setInsertionPoint(Builder, Val, false); - // If required, create an inttoptr instruction for Base. - Value *NewBase = Base; - if (!Base->getType()->isPointerTy()) - NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(), - Start->getName() + "to.ptr"); - - Value *GEP = Builder.CreateInBoundsGEP( - Start->getType()->getPointerElementType(), NewBase, - makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); - - if (!Val->getType()->isPointerTy()) { - Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(), - Val->getName() + ".conv"); - GEP = Cast; - } - Val->replaceAllUsesWith(GEP); + // Cast base to the expected type. + Value *NewVal = Builder.CreateBitOrPointerCast( + Base, PtrTy, Start->getName() + "to.ptr"); + NewVal = Builder.CreateInBoundsGEP( + ElemTy, NewVal, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); + NewVal = Builder.CreateBitOrPointerCast( + NewVal, Val->getType(), Val->getName() + ".conv"); + Val->replaceAllUsesWith(NewVal); } return NewInsts[Start]; @@ -747,7 +741,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, /// the input Value as a constant indexed GEP. Returns a pair containing /// the GEPs Pointer and Index. 
static std::pair<Value *, Value *> -getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { +getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) { Type *IndexType = IntegerType::get(V->getContext(), DL.getIndexTypeSizeInBits(V->getType())); @@ -759,7 +753,7 @@ getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { if (!GEP->isInBounds()) break; if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 && - GEP->getType() == V->getType()) { + GEP->getSourceElementType() == ElemTy) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1)); Index = ConstantExpr::getAdd( @@ -798,17 +792,14 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, if (!GEPLHS->hasAllConstantIndices()) return nullptr; - // Make sure the pointers have the same type. - if (GEPLHS->getType() != RHS->getType()) - return nullptr; - + Type *ElemTy = GEPLHS->getSourceElementType(); Value *PtrBase, *Index; - std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL); + std::tie(PtrBase, Index) = getAsConstantIndexedAddress(ElemTy, GEPLHS, DL); // The set of nodes that will take part in this transformation. SetVector<Value *> Nodes; - if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes)) + if (!canRewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes)) return nullptr; // We know we can re-write this as @@ -817,7 +808,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, // can't have overflow on either side. We can therefore re-write // this as: // OFFSET1 cmp OFFSET2 - Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes); + Value *NewRHS = rewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes); // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written // GEP having PtrBase as the pointer base, and has returned in NewRHS the @@ -894,9 +885,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If the base pointers are different, but the indices are the same, just // compare the base pointer. if (PtrBase != GEPRHS->getOperand(0)) { - bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands(); - IndicesTheSame &= GEPLHS->getOperand(0)->getType() == - GEPRHS->getOperand(0)->getType(); + bool IndicesTheSame = + GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && + GEPLHS->getType() == GEPRHS->getType() && + GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType(); if (IndicesTheSame) for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { @@ -1271,8 +1263,8 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // This is only really a signed overflow check if the inputs have been // sign-extended; check for that condition. For example, if CI2 is 2^31 and // the operands of the add are 64 bits wide, we need at least 33 sign bits. 
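
// --- Illustrative sketch, not part of the patch ---
// ComputeMaxSignificantBits (the renamed query used in the check below) is
// the smallest signed width that still round-trips the value. A plain C++
// restatement for a compile-time-known value; the real query also reasons
// about partially known bits:
#include <cassert>
#include <cstdint>

static unsigned significantBits(int64_t V) {
  for (unsigned W = 1; W < 64; ++W) {
    int64_t Lo = -(int64_t(1) << (W - 1));
    int64_t Hi = (int64_t(1) << (W - 1)) - 1;
    if (V >= Lo && V <= Hi)
      return W;
  }
  return 64;
}

int main() {
  assert(significantBits(0) == 1 && significantBits(-1) == 1);
  assert(significantBits(1) == 2); // positive values need a sign bit
  assert(significantBits(INT32_MAX) == 32 && significantBits(INT32_MIN) == 32);
  // Both add operands must fit in NewWidth significant bits (at most 32 when
  // CI2 is 2^31) for the pattern to really be a signed-overflow test.
  return 0;
}
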
- if (IC.ComputeMinSignedBits(A, 0, &I) > NewWidth || - IC.ComputeMinSignedBits(B, 0, &I) > NewWidth) + if (IC.ComputeMaxSignificantBits(A, 0, &I) > NewWidth || + IC.ComputeMaxSignificantBits(B, 0, &I) > NewWidth) return nullptr; // In order to replace the original add with a narrower @@ -2221,7 +2213,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0 Value *X = Shr->getOperand(0); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && C.isZero()) + if (Cmp.isEquality() && Shr->isExact() && C.isZero()) return new ICmpInst(Pred, X, Cmp.getOperand(1)); const APInt *ShiftVal; @@ -2247,9 +2239,10 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, // those conditions rather than checking them. This is difficult because of // undef/poison (PR34838). if (IsAShr) { - if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) { - // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC) - // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC) + if (IsExact || Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) { + // When ShAmtC can be shifted losslessly: + // icmp PRED (ashr exact X, ShAmtC), C --> icmp PRED X, (C << ShAmtC) + // icmp slt/ult (ashr X, ShAmtC), C --> icmp slt/ult X, (C << ShAmtC) APInt ShiftedC = C.shl(ShAmtVal); if (ShiftedC.ashr(ShAmtVal) == C) return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); @@ -2261,6 +2254,12 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, (ShiftedC + 1).ashr(ShAmtVal) == (C + 1)) return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); } + if (Pred == CmpInst::ICMP_UGT) { + // icmp ugt (ashr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1 + APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1; + if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1)) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } // If the compare constant has significant bits above the lowest sign-bit, // then convert an unsigned cmp to a test of the sign-bit: @@ -3957,6 +3956,33 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); + { + // Similar to above: an unsigned overflow comparison may use offset + mask: + // ((Op1 + C) & C) u< Op1 --> Op1 != 0 + // ((Op1 + C) & C) u>= Op1 --> Op1 == 0 + // Op0 u> ((Op0 + C) & C) --> Op0 != 0 + // Op0 u<= ((Op0 + C) & C) --> Op0 == 0 + BinaryOperator *BO; + const APInt *C; + if ((Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE) && + match(Op0, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op1), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op1, Zero); + } + + if ((Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE) && + match(Op1, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op0), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_UGT ? 
ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op0, Zero); + } + } + bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; if (BO0 && isa<OverflowingBinaryOperator>(BO0)) NoOp0WrapProblem = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 39b55b028110..7743b4c41555 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -148,6 +148,8 @@ public: Instruction *SliceUpIllegalIntegerPHI(PHINode &PN); Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); + Instruction *visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src); + Instruction *visitGEPOfBitcast(BitCastInst *BCI, GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); @@ -195,8 +197,6 @@ private: bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; - Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset, - SmallVectorImpl<Value *> &NewIndices); /// Classify whether a cast is worth optimizing. /// @@ -607,6 +607,16 @@ public: /// only possible if all operands to the PHI are constants). Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN); + /// For a binary operator with 2 phi operands, try to hoist the binary + /// operation before the phi. This can result in fewer instructions in + /// patterns where at least one set of phi operands simplifies. + /// Example: + /// BB3: binop (phi [X, BB1], [C1, BB2]), (phi [Y, BB1], [C2, BB2]) + /// --> + /// BB1: BO = binop X, Y + /// BB3: phi [BO, BB1], [(binop C1, C2), BB2] + Instruction *foldBinopWithPhiOperands(BinaryOperator &BO); + /// Given an instruction with a select as one operand and a constant as the /// other operand, try to fold the binary operator into the select arguments. 
/// This also works for Cast instructions, which obviously do not have a diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0dbfdba353c4..756792918dba 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -301,16 +301,17 @@ void PointerReplacer::replace(Instruction *I) { assert(V && "Operand not replaced"); SmallVector<Value *, 8> Indices; Indices.append(GEP->idx_begin(), GEP->idx_end()); - auto *NewI = GetElementPtrInst::Create( - V->getType()->getPointerElementType(), V, Indices); + auto *NewI = + GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices); IC.InsertNewInstWith(NewI, *GEP); NewI->takeName(GEP); WorkMap[GEP] = NewI; } else if (auto *BC = dyn_cast<BitCastInst>(I)) { auto *V = getReplacement(BC->getOperand(0)); assert(V && "Operand not replaced"); - auto *NewT = PointerType::get(BC->getType()->getPointerElementType(), - V->getType()->getPointerAddressSpace()); + auto *NewT = PointerType::getWithSamePointeeType( + cast<PointerType>(BC->getType()), + V->getType()->getPointerAddressSpace()); auto *NewI = new BitCastInst(V, NewT); IC.InsertNewInstWith(NewI, *BC); NewI->takeName(BC); @@ -345,8 +346,7 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) { #ifndef NDEBUG auto *PT = cast<PointerType>(I.getType()); auto *NT = cast<PointerType>(V->getType()); - assert(PT != NT && PT->getElementType() == NT->getElementType() && - "Invalid usage"); + assert(PT != NT && PT->hasSameElementTypeAs(NT) && "Invalid usage"); #endif WorkMap[&I] = V; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index aca7ec8d7325..1aa10b550fc4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -155,6 +155,9 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); @@ -348,13 +351,21 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { return CastInst::Create(Instruction::SExt, And, I.getType()); } - // (bool X) * Y --> X ? Y : 0 - // Y * (bool X) --> X ? Y : 0 + // (zext bool X) * Y --> X ? Y : 0 + // Y * (zext bool X) --> X ? Y : 0 if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0)); if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0)); + // (sext bool X) * C --> X ? 
-C : 0 + Constant *ImmC; + if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1) && + match(Op1, m_ImmConstant(ImmC))) { + Constant *NegC = ConstantExpr::getNeg(ImmC); + return SelectInst::Create(X, NegC, ConstantInt::getNullValue(I.getType())); + } + // (lshr X, 31) * Y --> (ashr X, 31) & Y // Y * (lshr X, 31) --> (ashr X, 31) & Y // TODO: We are not checking one-use because the elimination of the multiply @@ -442,6 +453,9 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; @@ -742,6 +756,9 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, /// division instructions. /// Common integer divide transforms Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); bool IsSigned = I.getOpcode() == Instruction::SDiv; Type *Ty = I.getType(); @@ -1359,6 +1376,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *R = foldFDivConstantDivisor(I)) return R; @@ -1460,6 +1480,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { /// remainder instructions. /// Common integer remainder transforms Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. @@ -1638,5 +1661,8 @@ Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index a6d6b5199105..65e60498ff95 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -943,7 +943,7 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, } /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single -/// call to cttz/ctlz with flag 'is_zero_undef' cleared. +/// call to cttz/ctlz with flag 'is_zero_poison' cleared. /// /// For example, we can fold the following code sequence: /// \code @@ -987,7 +987,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // sizeof in bits of 'Count'. unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits(); if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) { - // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from + // Explicitly clear the 'is_zero_poison' flag. It's always valid to go from // true to false on this flag, so we can replace it for all users. II->setArgOperand(1, ConstantInt::getFalse(II->getContext())); return SelectArg; @@ -995,7 +995,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional // zext/trunc) have one use (ending at the select), the cttz/ctlz result will - // not be used if the input is zero. Relax to 'undef_on_zero' for that case. 
+ // not be used if the input is zero. Relax to 'zero is poison' for that case. if (II->hasOneUse() && SelectArg->hasOneUse() && !match(II->getArgOperand(1), m_One())) II->setArgOperand(1, ConstantInt::getTrue(II->getContext())); @@ -2325,8 +2325,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) { // The two operands of the add/sub must be nsw-truncatable to the NewTy. This // is usually achieved via a sext from a smaller type. - if (ComputeMinSignedBits(AddSub->getOperand(0), 0, AddSub) > NewBitWidth || - ComputeMinSignedBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) + if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) > + NewBitWidth || + ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) return nullptr; // Finally create and return the sat intrinsic, truncated to the new type diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 06421d553915..17f0c5c4cff0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -369,6 +369,9 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I, } Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); @@ -1032,12 +1035,13 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { NewLShr->setIsExact(I.isExact()); return NewLShr; } - // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C) - Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact()); - APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask)); - } - if (C1->ugt(ShAmtC)) { + if (Op0->hasOneUse()) { + // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C) + Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact()); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); + return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask)); + } + } else if (C1->ugt(ShAmtC)) { unsigned ShlAmtC = C1->getZExtValue(); Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmtC - ShAmtC); if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) { @@ -1046,15 +1050,33 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { NewShl->setHasNoUnsignedWrap(true); return NewShl; } - // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C) - Value *NewShl = Builder.CreateShl(X, ShiftDiff); + if (Op0->hasOneUse()) { + // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C) + Value *NewShl = Builder.CreateShl(X, ShiftDiff); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); + return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask)); + } + } else { + assert(*C1 == ShAmtC); + // (X << C) >>u C --> X & (-1 >>u C) APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask)); + return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); } - assert(*C1 == ShAmtC); - // (X << C) >>u C --> X & (-1 >>u C) - APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); + } + + // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C) + // TODO: Consolidate with the more general transform that starts from shl + // (the shifts are in the opposite order). 
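
// --- Illustrative sketch, not part of the patch ---
// The rule just above can be checked exhaustively at i8:
//   ((X << C) + Y) >>u C  ==  (X + (Y >>u C)) & (-1 >>u C)   (all ops mod 2^8)
// A throwaway brute-force test:
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned C = 1; C < 8; ++C)
    for (unsigned X = 0; X < 256; ++X)
      for (unsigned Y = 0; Y < 256; ++Y) {
        uint8_t Lhs = uint8_t(uint8_t(uint8_t(X << C) + Y) >> C);
        uint8_t Rhs = uint8_t((X + (Y >> C)) & (0xFFu >> C));
        assert(Lhs == Rhs);
      }
  return 0;
}
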
+ Value *Y; + if (match(Op0, + m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))), + m_Value(Y))))) { + Value *NewLshr = Builder.CreateLShr(Y, Op1); + Value *NewAdd = Builder.CreateAdd(NewLshr, X); + unsigned Op1Val = C->getLimitedValue(BitWidth); + APInt Bits = APInt::getLowBitsSet(BitWidth, BitWidth - Op1Val); + Constant *Mask = ConstantInt::get(Ty, Bits); + return BinaryOperator::CreateAnd(NewAdd, Mask); } if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && @@ -1094,7 +1116,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { } } - Value *Y; if (ShAmtC == BitWidth - 1) { // lshr i32 or(X,-X), 31 --> zext (X != 0) if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X))))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4dc712f32536..71a5ae24eead 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -800,22 +800,21 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Round NTZ down to the next byte. If we have 11 trailing zeros, then // we need all the bits down to bit 8. Likewise, round NLZ. If we // have 14 leading zeros, round to 8. - NLZ &= ~7; - NTZ &= ~7; + NLZ = alignDown(NLZ, 8); + NTZ = alignDown(NTZ, 8); // If we need exactly one byte, we can do this transformation. - if (BitWidth-NLZ-NTZ == 8) { - unsigned ResultBit = NTZ; - unsigned InputBit = BitWidth-NTZ-8; - + if (BitWidth - NLZ - NTZ == 8) { // Replace this with either a left or right shift to get the byte into // the right place. Instruction *NewVal; - if (InputBit > ResultBit) - NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), - ConstantInt::get(I->getType(), InputBit-ResultBit)); + if (NLZ > NTZ) + NewVal = BinaryOperator::CreateLShr( + II->getArgOperand(0), + ConstantInt::get(I->getType(), NLZ - NTZ)); else - NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), - ConstantInt::get(I->getType(), ResultBit-InputBit)); + NewVal = BinaryOperator::CreateShl( + II->getArgOperand(0), + ConstantInt::get(I->getType(), NTZ - NLZ)); NewVal->takeName(I); return InsertNewInstWith(NewVal, *I); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index c6a4602e59e3..736cf9c825d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -495,8 +495,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, - NewOps); + GEP->getSourceElementType(), NewPtr, NewOps); NewGEP->setIsInBounds(GEP->isInBounds()); return NewGEP; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index eb5eadba194d..029be5257694 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1027,13 +1027,11 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, if (!ConstIsRHS) std::swap(Op0, Op1); - auto *BO = cast<BinaryOperator>(&I); - Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1, - SO->getName() + ".op"); - auto *FPInst = dyn_cast<Instruction>(RI); - if (FPInst && isa<FPMathOperator>(FPInst)) - 
FPInst->copyFastMathFlags(BO); - return RI; + Value *NewBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), Op0, + Op1, SO->getName() + ".op"); + if (auto *NewBOI = dyn_cast<Instruction>(NewBO)) + NewBOI->copyIRFlags(&I); + return NewBO; } Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, @@ -1289,6 +1287,70 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { return replaceInstUsesWith(I, NewPN); } +Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { + // TODO: This should be similar to the incoming values check in foldOpIntoPhi: + // we are guarding against replicating the binop in >1 predecessor. + // This could miss matching a phi with 2 constant incoming values. + auto *Phi0 = dyn_cast<PHINode>(BO.getOperand(0)); + auto *Phi1 = dyn_cast<PHINode>(BO.getOperand(1)); + if (!Phi0 || !Phi1 || !Phi0->hasOneUse() || !Phi1->hasOneUse() || + Phi0->getNumOperands() != 2 || Phi1->getNumOperands() != 2) + return nullptr; + + // TODO: Remove the restriction for binop being in the same block as the phis. + if (BO.getParent() != Phi0->getParent() || + BO.getParent() != Phi1->getParent()) + return nullptr; + + // Match a pair of incoming constants for one of the predecessor blocks. + BasicBlock *ConstBB, *OtherBB; + Constant *C0, *C1; + if (match(Phi0->getIncomingValue(0), m_ImmConstant(C0))) { + ConstBB = Phi0->getIncomingBlock(0); + OtherBB = Phi0->getIncomingBlock(1); + } else if (match(Phi0->getIncomingValue(1), m_ImmConstant(C0))) { + ConstBB = Phi0->getIncomingBlock(1); + OtherBB = Phi0->getIncomingBlock(0); + } else { + return nullptr; + } + if (!match(Phi1->getIncomingValueForBlock(ConstBB), m_ImmConstant(C1))) + return nullptr; + + // The block that we are hoisting to must reach here unconditionally. + // Otherwise, we could be speculatively executing an expensive or + // non-speculative op. + auto *PredBlockBranch = dyn_cast<BranchInst>(OtherBB->getTerminator()); + if (!PredBlockBranch || PredBlockBranch->isConditional() || + !DT.isReachableFromEntry(OtherBB)) + return nullptr; + + // TODO: This check could be tightened to only apply to binops (div/rem) that + // are not safe to speculatively execute. But that could allow hoisting + // potentially expensive instructions (fdiv for example). + for (auto BBIter = BO.getParent()->begin(); &*BBIter != &BO; ++BBIter) + if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter)) + return nullptr; + + // Make a new binop in the predecessor block with the non-constant incoming + // values. + Builder.SetInsertPoint(PredBlockBranch); + Value *NewBO = Builder.CreateBinOp(BO.getOpcode(), + Phi0->getIncomingValueForBlock(OtherBB), + Phi1->getIncomingValueForBlock(OtherBB)); + if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO)) + NotFoldedNewBO->copyIRFlags(&BO); + + // Fold constants for the predecessor block with constant incoming values. + Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1); + + // Replace the binop with a phi of the new values. The old phis are dead. + PHINode *NewPhi = PHINode::Create(BO.getType(), 2); + NewPhi->addIncoming(NewBO, OtherBB); + NewPhi->addIncoming(NewC, ConstBB); + return NewPhi; +} + Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { if (!isa<Constant>(I.getOperand(1))) return nullptr; @@ -1307,10 +1369,11 @@ Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. 
If so, fill them into NewIndices and return the resultant /// element type, otherwise return null. -Type * -InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset, - SmallVectorImpl<Value *> &NewIndices) { - Type *Ty = PtrTy->getElementType(); +static Type *findElementAtOffset(PointerType *PtrTy, int64_t IntOffset, + SmallVectorImpl<Value *> &NewIndices, + const DataLayout &DL) { + // Only used by visitGEPOfBitcast(), which is skipped for opaque pointers. + Type *Ty = PtrTy->getNonOpaquePointerElementType(); if (!Ty->isSized()) return nullptr; @@ -1320,7 +1383,7 @@ InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset, return nullptr; for (const APInt &Index : Indices) - NewIndices.push_back(Builder.getInt(Index)); + NewIndices.push_back(ConstantInt::get(PtrTy->getContext(), Index)); return Ty; } @@ -1884,12 +1947,254 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP, return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } +Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, + GEPOperator *Src) { + // Combine Indices - If the source pointer to this getelementptr instruction + // is a getelementptr instruction with matching element type, combine the + // indices of the two getelementptr instructions into a single instruction. + if (Src->getResultElementType() != GEP.getSourceElementType()) + return nullptr; + + if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) + return nullptr; + + if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && + Src->hasOneUse()) { + Value *GO1 = GEP.getOperand(1); + Value *SO1 = Src->getOperand(1); + + if (LI) { + // Try to reassociate loop invariant GEP chains to enable LICM. + if (Loop *L = LI->getLoopFor(GEP.getParent())) { + // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is + // invariant: this breaks the dependence between GEPs and allows LICM + // to hoist the invariant part out of the loop. + if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { + // We have to be careful here. + // We have something like: + // %src = getelementptr <ty>, <ty>* %base, <ty> %idx + // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2 + // If we just swap idx & idx2 then we could inadvertantly + // change %src from a vector to a scalar, or vice versa. + // Cases: + // 1) %base a scalar & idx a scalar & idx2 a vector + // => Swapping idx & idx2 turns %src into a vector type. + // 2) %base a scalar & idx a vector & idx2 a scalar + // => Swapping idx & idx2 turns %src in a scalar type + // 3) %base, %idx, and %idx2 are scalars + // => %src & %gep are scalars + // => swapping idx & idx2 is safe + // 4) %base a vector + // => %src is a vector + // => swapping idx & idx2 is safe. + auto *SO0 = Src->getOperand(0); + auto *SO0Ty = SO0->getType(); + if (!isa<VectorType>(GEP.getType()) || // case 3 + isa<VectorType>(SO0Ty)) { // case 4 + Src->setOperand(1, GO1); + GEP.setOperand(1, SO1); + return &GEP; + } else { + // Case 1 or 2 + // -- have to recreate %src & %gep + // put NewSrc at same location as %src + Builder.SetInsertPoint(cast<Instruction>(Src)); + Value *NewSrc = Builder.CreateGEP( + GEP.getSourceElementType(), SO0, GO1, Src->getName()); + // Propagate 'inbounds' if the new source was not constant-folded. 
+ if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc)) + NewSrcGEPI->setIsInBounds(Src->isInBounds()); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + GEP.getSourceElementType(), NewSrc, {SO1}); + NewGEP->setIsInBounds(GEP.isInBounds()); + return NewGEP; + } + } + } + } + } + + // Note that if our source is a gep chain itself then we wait for that + // chain to be resolved before we perform this transformation. This + // avoids us creating a TON of code in some cases. + if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) + return nullptr; // Wait until our source is folded to completion. + + SmallVector<Value*, 8> Indices; + + // Find out whether the last index in the source GEP is a sequential idx. + bool EndsWithSequential = false; + for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); + I != E; ++I) + EndsWithSequential = I.isSequential(); + + // Can we combine the two pointer arithmetics offsets? + if (EndsWithSequential) { + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands()-1); + Value *GO1 = GEP.getOperand(1); + + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; + + Value *Sum = + SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) + return nullptr; + + // Update the GEP in place if possible. + if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); + replaceOperand(GEP, 0, Src->getOperand(0)); + replaceOperand(GEP, 1, Sum); + return &GEP; + } + Indices.append(Src->op_begin()+1, Src->op_end()-1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin()+2, GEP.op_end()); + } else if (isa<Constant>(*GEP.idx_begin()) && + cast<Constant>(*GEP.idx_begin())->isNullValue() && + Src->getNumOperands() != 1) { + // Otherwise we can do the fold if the first index of the GEP is a zero + Indices.append(Src->op_begin()+1, Src->op_end()); + Indices.append(GEP.idx_begin()+1, GEP.idx_end()); + } + + if (!Indices.empty()) + return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) + ? GetElementPtrInst::CreateInBounds( + Src->getSourceElementType(), Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); + + return nullptr; +} + +// Note that we may have also stripped an address space cast in between. +Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI, + GetElementPtrInst &GEP) { + // With opaque pointers, there is no pointer element type we can use to + // adjust the GEP type. 
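
// --- Illustrative sketch, not part of the patch ---
// The index merge in visitGEPOfGEP() above ("Replace: gep (gep %P, long B),
// long A, ... With: T = long A+B; gep %P, T, ...") is ordinary pointer
// arithmetic reassociation:
#include <cassert>

int main() {
  int Buf[64] = {};
  long A = 5, B = 7;
  // gep (gep %Buf, B), A  ==  gep %Buf, A + B
  assert((Buf + B) + A == Buf + (A + B));
  return 0;
}
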
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy()); + if (SrcType->isOpaque()) + return nullptr; + + Type *GEPEltType = GEP.getSourceElementType(); + Type *SrcEltType = SrcType->getNonOpaquePointerElementType(); + Value *SrcOp = BCI->getOperand(0); + + // GEP directly using the source operand if this GEP is accessing an element + // of a bitcasted pointer to vector or array of the same dimensions: + // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z + // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, + const DataLayout &DL) { + auto *VecVTy = cast<FixedVectorType>(VecTy); + return ArrTy->getArrayElementType() == VecVTy->getElementType() && + ArrTy->getArrayNumElements() == VecVTy->getNumElements() && + DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); + }; + if (GEP.getNumOperands() == 3 && + ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) && + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || + (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() && + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { + + // Create a new GEP here, as using `setOperand()` followed by + // `setSourceElementType()` won't actually update the type of the + // existing GEP Value. Causing issues if this Value is accessed when + // constructing an AddrSpaceCastInst + SmallVector<Value *, 8> Indices(GEP.indices()); + Value *NGEP = GEP.isInBounds() + ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, Indices) + : Builder.CreateGEP(SrcEltType, SrcOp, Indices); + NGEP->takeName(&GEP); + + // Preserve GEP address space to satisfy users + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEP.getType()); + + return replaceInstUsesWith(GEP, NGEP); + } + + // See if we can simplify: + // X = bitcast A* to B* + // Y = gep X, <...constant indices...> + // into a gep of the original struct. This is important for SROA and alias + // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. + unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEP.getType()); + APInt Offset(OffsetBits, 0); + + // If the bitcast argument is an allocation, The bitcast is for convertion + // to actual type of allocation. Removing such bitcasts, results in having + // GEPs with i8* base and pure byte offsets. That means GEP is not aware of + // struct or array hierarchy. + // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have + // a better chance to succeed. + if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) && + !isAllocationFn(SrcOp, &TLI)) { + // If this GEP instruction doesn't move the pointer, just replace the GEP + // with a bitcast of the real input to the dest type. + if (!Offset) { + // If the bitcast is of an allocation, and the allocation will be + // converted to match the type of the cast, don't touch this. + if (isa<AllocaInst>(SrcOp)) { + // See if the bitcast simplifies, if so, don't nuke this GEP yet. 
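
// --- Illustrative sketch, not part of the patch ---
// The simplification described above (a GEP with constant indices on a
// bitcast source rewritten as a GEP of the original struct) applies whenever
// the accumulated byte offset lands exactly on a field, which is what
// findElementAtOffset() checks. A plain C++ analogue:
#include <cassert>
#include <cstddef>

struct Pair { int A; int B; };

int main() {
  Pair P{};
  // gep (bitcast %Pair* to i8*), offsetof(B)  ==  gep %Pair, 0, 1
  assert(reinterpret_cast<char *>(&P) + offsetof(Pair, B) ==
         reinterpret_cast<char *>(&P.B));
  return 0;
}
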
+ if (Instruction *I = visitBitCast(*BCI)) { + if (I != BCI) { + I->takeName(BCI); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); + replaceInstUsesWith(*BCI, I); + } + return &GEP; + } + } + + if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(SrcOp, GEP.getType()); + return new BitCastInst(SrcOp, GEP.getType()); + } + + // Otherwise, if the offset is non-zero, we need to find out if there is a + // field at Offset in 'A's type. If so, we can pull the cast through the + // GEP. + SmallVector<Value*, 8> NewIndices; + if (findElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices, DL)) { + Value *NGEP = + GEP.isInBounds() + ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) + : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); + + if (NGEP->getType() == GEP.getType()) + return replaceInstUsesWith(GEP, NGEP); + NGEP->takeName(&GEP); + + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEP.getType()); + return new BitCastInst(NGEP, GEP.getType()); + } + } + + return nullptr; +} + Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { - SmallVector<Value *, 8> Ops(GEP.operands()); + Value *PtrOp = GEP.getOperand(0); + SmallVector<Value *, 8> Indices(GEP.indices()); Type *GEPType = GEP.getType(); Type *GEPEltType = GEP.getSourceElementType(); bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType); - if (Value *V = SimplifyGEPInst(GEPEltType, Ops, GEP.isInBounds(), + if (Value *V = SimplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(), SQ.getWithInstruction(&GEP))) return replaceInstUsesWith(GEP, V); @@ -1912,8 +2217,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // undef elements to decrease demanded bits } - Value *PtrOp = GEP.getOperand(0); - // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; @@ -2063,132 +2366,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { PtrOp = NewGEP; } - // Combine Indices - If the source pointer to this getelementptr instruction - // is a getelementptr instruction, combine the indices of the two - // getelementptr instructions into a single instruction. - if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) { - if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) - return nullptr; - - if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && - Src->hasOneUse()) { - Value *GO1 = GEP.getOperand(1); - Value *SO1 = Src->getOperand(1); - - if (LI) { - // Try to reassociate loop invariant GEP chains to enable LICM. - if (Loop *L = LI->getLoopFor(GEP.getParent())) { - // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is - // invariant: this breaks the dependence between GEPs and allows LICM - // to hoist the invariant part out of the loop. - if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { - // We have to be careful here. - // We have something like: - // %src = getelementptr <ty>, <ty>* %base, <ty> %idx - // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2 - // If we just swap idx & idx2 then we could inadvertantly - // change %src from a vector to a scalar, or vice versa. - // Cases: - // 1) %base a scalar & idx a scalar & idx2 a vector - // => Swapping idx & idx2 turns %src into a vector type. 
- // 2) %base a scalar & idx a vector & idx2 a scalar - // => Swapping idx & idx2 turns %src in a scalar type - // 3) %base, %idx, and %idx2 are scalars - // => %src & %gep are scalars - // => swapping idx & idx2 is safe - // 4) %base a vector - // => %src is a vector - // => swapping idx & idx2 is safe. - auto *SO0 = Src->getOperand(0); - auto *SO0Ty = SO0->getType(); - if (!isa<VectorType>(GEPType) || // case 3 - isa<VectorType>(SO0Ty)) { // case 4 - Src->setOperand(1, GO1); - GEP.setOperand(1, SO1); - return &GEP; - } else { - // Case 1 or 2 - // -- have to recreate %src & %gep - // put NewSrc at same location as %src - Builder.SetInsertPoint(cast<Instruction>(PtrOp)); - Value *NewSrc = - Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()); - // Propagate 'inbounds' if the new source was not constant-folded. - if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc)) - NewSrcGEPI->setIsInBounds(Src->isInBounds()); - GetElementPtrInst *NewGEP = - GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1}); - NewGEP->setIsInBounds(GEP.isInBounds()); - return NewGEP; - } - } - } - } - } - - // Note that if our source is a gep chain itself then we wait for that - // chain to be resolved before we perform this transformation. This - // avoids us creating a TON of code in some cases. - if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) - if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return nullptr; // Wait until our source is folded to completion. - - SmallVector<Value*, 8> Indices; - - // Find out whether the last index in the source GEP is a sequential idx. - bool EndsWithSequential = false; - for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); - I != E; ++I) - EndsWithSequential = I.isSequential(); - - // Can we combine the two pointer arithmetics offsets? - if (EndsWithSequential) { - // Replace: gep (gep %P, long B), long A, ... - // With: T = long A+B; gep %P, T, ... - Value *SO1 = Src->getOperand(Src->getNumOperands()-1); - Value *GO1 = GEP.getOperand(1); - - // If they aren't the same type, then the input hasn't been processed - // by the loop above yet (which canonicalizes sequential index types to - // intptr_t). Just avoid transforming this until the input has been - // normalized. - if (SO1->getType() != GO1->getType()) - return nullptr; - - Value *Sum = - SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); - // Only do the combine when we are sure the cost after the - // merge is never more than that before the merge. - if (Sum == nullptr) - return nullptr; - - // Update the GEP in place if possible. - if (Src->getNumOperands() == 2) { - GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); - replaceOperand(GEP, 0, Src->getOperand(0)); - replaceOperand(GEP, 1, Sum); - return &GEP; - } - Indices.append(Src->op_begin()+1, Src->op_end()-1); - Indices.push_back(Sum); - Indices.append(GEP.op_begin()+2, GEP.op_end()); - } else if (isa<Constant>(*GEP.idx_begin()) && - cast<Constant>(*GEP.idx_begin())->isNullValue() && - Src->getNumOperands() != 1) { - // Otherwise we can do the fold if the first index of the GEP is a zero - Indices.append(Src->op_begin()+1, Src->op_end()); - Indices.append(GEP.idx_begin()+1, GEP.idx_end()); - } - - if (!Indices.empty()) - return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) - ? 
GetElementPtrInst::CreateInBounds( - Src->getSourceElementType(), Src->getOperand(0), Indices, - GEP.getName()) - : GetElementPtrInst::Create(Src->getSourceElementType(), - Src->getOperand(0), Indices, - GEP.getName()); - } + if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) + if (Instruction *I = visitGEPOfGEP(GEP, Src)) + return I; // Skip if GEP source element type is scalable. The type alloc size is unknown // at compile-time. @@ -2234,9 +2414,13 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType()); - if (StrippedPtr != PtrOp) { + // TODO: The basic approach of these folds is not compatible with opaque + // pointers, because we can't use bitcasts as a hint for a desirable GEP + // type. Instead, we should perform canonicalization directly on the GEP + // type. For now, skip these. + if (StrippedPtr != PtrOp && !StrippedPtrTy->isOpaque()) { bool HasZeroPointerIndex = false; - Type *StrippedPtrEltTy = StrippedPtrTy->getElementType(); + Type *StrippedPtrEltTy = StrippedPtrTy->getNonOpaquePointerElementType(); if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1))) HasZeroPointerIndex = C->isZero(); @@ -2420,103 +2604,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { ASCStrippedPtrOp = BC; } - if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) { - Value *SrcOp = BCI->getOperand(0); - PointerType *SrcType = cast<PointerType>(BCI->getSrcTy()); - Type *SrcEltType = SrcType->getElementType(); - - // GEP directly using the source operand if this GEP is accessing an element - // of a bitcasted pointer to vector or array of the same dimensions: - // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z - // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z - auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, - const DataLayout &DL) { - auto *VecVTy = cast<FixedVectorType>(VecTy); - return ArrTy->getArrayElementType() == VecVTy->getElementType() && - ArrTy->getArrayNumElements() == VecVTy->getNumElements() && - DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); - }; - if (GEP.getNumOperands() == 3 && - ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) && - areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || - (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() && - areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { - - // Create a new GEP here, as using `setOperand()` followed by - // `setSourceElementType()` won't actually update the type of the - // existing GEP Value. Causing issues if this Value is accessed when - // constructing an AddrSpaceCastInst - Value *NGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}) - : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}); - NGEP->takeName(&GEP); - - // Preserve GEP address space to satisfy users - if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(NGEP, GEPType); - - return replaceInstUsesWith(GEP, NGEP); - } - - // See if we can simplify: - // X = bitcast A* to B* - // Y = gep X, <...constant indices...> - // into a gep of the original struct. This is important for SROA and alias - // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. 
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType); - APInt Offset(OffsetBits, 0); - - // If the bitcast argument is an allocation, The bitcast is for convertion - // to actual type of allocation. Removing such bitcasts, results in having - // GEPs with i8* base and pure byte offsets. That means GEP is not aware of - // struct or array hierarchy. - // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have - // a better chance to succeed. - if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) && - !isAllocationFn(SrcOp, &TLI)) { - // If this GEP instruction doesn't move the pointer, just replace the GEP - // with a bitcast of the real input to the dest type. - if (!Offset) { - // If the bitcast is of an allocation, and the allocation will be - // converted to match the type of the cast, don't touch this. - if (isa<AllocaInst>(SrcOp)) { - // See if the bitcast simplifies, if so, don't nuke this GEP yet. - if (Instruction *I = visitBitCast(*BCI)) { - if (I != BCI) { - I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI->getIterator(), I); - replaceInstUsesWith(*BCI, I); - } - return &GEP; - } - } - - if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(SrcOp, GEPType); - return new BitCastInst(SrcOp, GEPType); - } - - // Otherwise, if the offset is non-zero, we need to find out if there is a - // field at Offset in 'A's type. If so, we can pull the cast through the - // GEP. - SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) { - Value *NGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) - : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); - - if (NGEP->getType() == GEPType) - return replaceInstUsesWith(GEP, NGEP); - NGEP->takeName(&GEP); - - if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(NGEP, GEPType); - return new BitCastInst(NGEP, GEPType); - } - } - } + if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) + if (Instruction *I = visitGEPOfBitcast(BCI, GEP)) + return I; if (!GEP.isInBounds()) { unsigned IdxWidth = @@ -2533,8 +2623,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize()); if (BasePtrOffset.ule(AllocSize)) { return GetElementPtrInst::CreateInBounds( - GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1), - GEP.getName()); + GEP.getSourceElementType(), PtrOp, Indices, GEP.getName()); } } } @@ -2553,10 +2642,6 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI, if (auto *LI = dyn_cast<LoadInst>(V)) return isa<GlobalVariable>(LI->getPointerOperand()); // Two distinct allocations will never be equal. - // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking - // through bitcasts of V can cause - // the result statement below to be true, even when AI and V (ex: - // i8* ->i32* ->i8* of AI) are the same allocations. 
return isAllocLikeFn(V, &TLI) && V != AI; } @@ -2659,7 +2744,7 @@ static bool isAllocSiteRemovable(Instruction *AI, continue; } - if (isReallocLikeFn(I, &TLI, true)) { + if (isReallocLikeFn(I, &TLI)) { Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2682,6 +2767,8 @@ static bool isAllocSiteRemovable(Instruction *AI, } Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { + assert(isa<AllocaInst>(MI) || isAllocRemovable(&cast<CallBase>(MI), &TLI)); + // If we have a malloc call which is only used in any amount of comparisons to // null and free calls, delete the calls and replace the comparisons with true // or false as appropriate. @@ -2900,7 +2987,7 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) { // If we had free(realloc(...)) with no intervening uses, then eliminate the // realloc() entirely. if (CallInst *CI = dyn_cast<CallInst>(Op)) { - if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI, true)) { + if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI)) { return eraseInstFromFunction( *replaceInstUsesWith(*CI, CI->getOperand(0))); } @@ -3709,16 +3796,61 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { return nullptr; } +/// Check for case where the call writes to an otherwise dead alloca. This +/// shows up for unused out-params in idiomatic C/C++ code. Note that this +/// helper *only* analyzes the write; doesn't check any other legality aspect. +static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) { + auto *CB = dyn_cast<CallBase>(I); + if (!CB) + // TODO: handle e.g. store to alloca here - only worth doing if we extend + // to allow reload along used path as described below. Otherwise, this + // is simply a store to a dead allocation which will be removed. + return false; + Optional<MemoryLocation> Dest = MemoryLocation::getForDest(CB, TLI); + if (!Dest) + return false; + auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(Dest->Ptr)); + if (!AI) + // TODO: allow malloc? + return false; + // TODO: allow memory access dominated by move point? Note that since AI + // could have a reference to itself captured by the call, we would need to + // account for cycles in doing so. + SmallVector<const User *> AllocaUsers; + SmallPtrSet<const User *, 4> Visited; + auto pushUsers = [&](const Instruction &I) { + for (const User *U : I.users()) { + if (Visited.insert(U).second) + AllocaUsers.push_back(U); + } + }; + pushUsers(*AI); + while (!AllocaUsers.empty()) { + auto *UserI = cast<Instruction>(AllocaUsers.pop_back_val()); + if (isa<BitCastInst>(UserI) || isa<GetElementPtrInst>(UserI) || + isa<AddrSpaceCastInst>(UserI)) { + pushUsers(*UserI); + continue; + } + if (UserI == CB) + continue; + // TODO: support lifetime.start/end here + return false; + } + return true; +} + /// Try to move the specified instruction from its current block into the /// beginning of DestBlock, which can only happen if it's safe to move the /// instruction past all of the instructions between it and the end of its /// block. -static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { +static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock, + TargetLibraryInfo &TLI) { assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!"); BasicBlock *SrcBlock = I->getParent(); // Cannot move control-flow-involving, volatile loads, vaarg, etc. 
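As a source-level illustration (names hypothetical, not from the patch), this is the idiomatic unused out-param shape that SoleWriteToDeadLocal() above is meant to recognize, so that the call can still be sunk into the single block that uses its return value:

// The call's only memory effect is the write into Detail, and Detail's alloca
// is never read, so the write is not visible anywhere except the path the
// call would be sunk to.
int getCode(int *OutDetail);        // hypothetical out-param style API

int caller(bool WantCode) {
  int Detail;                       // otherwise-dead local, used only as an out-param
  int Code = getCode(&Detail);      // sole write lands in the dead alloca
  if (WantCode)
    return Code;                    // sole user of Code; candidate sink target
  return 0;
}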
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || + if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() || I->isTerminator()) return false; @@ -3738,6 +3870,14 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { if (CI->isConvergent()) return false; } + + // Unless we can prove that the memory write isn't visibile except on the + // path we're sinking to, we must bail. + if (I->mayWriteToMemory()) { + if (!SoleWriteToDeadLocal(I, TLI)) + return false; + } + // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { @@ -3746,7 +3886,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { // successor block. if (DestBlock->getUniquePredecessor() != I->getParent()) return false; - for (BasicBlock::iterator Scan = I->getIterator(), + for (BasicBlock::iterator Scan = std::next(I->getIterator()), E = I->getParent()->end(); Scan != E; ++Scan) if (Scan->mayWriteToMemory()) @@ -3906,12 +4046,11 @@ bool InstCombinerImpl::run() { // predecessor, so that we don't have to split the critical edge. // Another option where we can sink is a block that ends with a // terminator that does not pass control to other block (such as - // return or unreachable). In this case: + // return or unreachable or resume). In this case: // - I dominates the User (by SSA form); // - the User will be executed at most once. // So sinking I down to User is always profitable or neutral. - if (UserParent->getUniquePredecessor() == BB || - (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))) { + if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) { assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); return UserParent; } @@ -3922,7 +4061,7 @@ bool InstCombinerImpl::run() { if (OptBB) { auto *UserParent = *OptBB; // Okay, the CFG is simple enough, try to sink this instruction. - if (TryToSinkInstruction(I, UserParent)) { + if (TryToSinkInstruction(I, UserParent, TLI)) { LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); MadeIRChange = true; // We'll add uses of the sunk instruction below, but since diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index bd2dc8d639fc..6e72255e51ae 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1547,10 +1547,9 @@ void AddressSanitizer::getInterestingMemoryOperands( Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true, XCHG->getCompareOperand()->getType(), None); } else if (auto CI = dyn_cast<CallInst>(I)) { - auto *F = CI->getCalledFunction(); - if (F && (F->getName().startswith("llvm.masked.load.") || - F->getName().startswith("llvm.masked.store."))) { - bool IsWrite = F->getName().startswith("llvm.masked.store."); + if (CI->getIntrinsicID() == Intrinsic::masked_load || + CI->getIntrinsicID() == Intrinsic::masked_store) { + bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_store; // Masked store has an initial operand for the value. unsigned OpOffset = IsWrite ? 1 : 0; if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads) @@ -1559,7 +1558,7 @@ void AddressSanitizer::getInterestingMemoryOperands( auto BasePtr = CI->getOperand(OpOffset); if (ignoreAccess(LI, BasePtr)) return; - auto Ty = cast<PointerType>(BasePtr->getType())->getElementType(); + Type *Ty = IsWrite ? 
CI->getArgOperand(0)->getType() : CI->getType(); MaybeAlign Alignment = Align(1); // Otherwise no alignment guarantees. We probably got Undef. if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) @@ -1653,11 +1652,10 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, const DataLayout &DL, Type *IntptrTy, Value *Mask, Instruction *I, Value *Addr, MaybeAlign Alignment, - unsigned Granularity, uint32_t TypeSize, + unsigned Granularity, Type *OpType, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) { - auto *VTy = cast<FixedVectorType>( - cast<PointerType>(Addr->getType())->getElementType()); + auto *VTy = cast<FixedVectorType>(OpType); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); unsigned Num = VTy->getNumElements(); auto Zero = ConstantInt::get(IntptrTy, 0); @@ -1735,7 +1733,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, unsigned Granularity = 1 << Mapping.Scale; if (O.MaybeMask) { instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(), - Addr, O.Alignment, Granularity, O.TypeSize, + Addr, O.Alignment, Granularity, O.OpType, O.IsWrite, nullptr, UseCalls, Exp); } else { doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment, diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 9f26b37bbc79..ff3aa14a2a83 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -208,6 +208,14 @@ static cl::opt<bool> ClEventCallbacks( cl::desc("Insert calls to __dfsan_*_callback functions on data events."), cl::Hidden, cl::init(false)); +// Experimental feature that inserts callbacks for conditionals, including: +// conditional branch, switch, select. +// This must be true for dfsan_set_conditional_callback() to have effect. +static cl::opt<bool> ClConditionalCallbacks( + "dfsan-conditional-callbacks", + cl::desc("Insert calls to callback functions on conditionals."), cl::Hidden, + cl::init(false)); + // Controls whether the pass tracks the control flow of select instructions. static cl::opt<bool> ClTrackSelectControlFlow( "dfsan-track-select-control-flow", @@ -428,6 +436,8 @@ class DataFlowSanitizer { FunctionType *DFSanSetLabelFnTy; FunctionType *DFSanNonzeroLabelFnTy; FunctionType *DFSanVarargWrapperFnTy; + FunctionType *DFSanConditionalCallbackFnTy; + FunctionType *DFSanConditionalCallbackOriginFnTy; FunctionType *DFSanCmpCallbackFnTy; FunctionType *DFSanLoadStoreCallbackFnTy; FunctionType *DFSanMemTransferCallbackFnTy; @@ -444,6 +454,8 @@ class DataFlowSanitizer { FunctionCallee DFSanLoadCallbackFn; FunctionCallee DFSanStoreCallbackFn; FunctionCallee DFSanMemTransferCallbackFn; + FunctionCallee DFSanConditionalCallbackFn; + FunctionCallee DFSanConditionalCallbackOriginFn; FunctionCallee DFSanCmpCallbackFn; FunctionCallee DFSanChainOriginFn; FunctionCallee DFSanChainOriginIfTaintedFn; @@ -454,7 +466,7 @@ class DataFlowSanitizer { MDNode *OriginStoreWeights; DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; - AttrBuilder ReadOnlyNoneAttrs; + AttributeMask ReadOnlyNoneAttrs; /// Memory map parameters used in calculation mapping application addresses /// to shadow addresses and origin addresses. 
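A hedged sketch of the pointee-type-free matching that the AddressSanitizer change just above (and the similar MemProfiler and SanitizerCoverage changes further down) switches to: identify the masked intrinsics by ID and take the accessed type from the stored operand or the call result, instead of from the pointer's element type. The helper name is hypothetical.

#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Returns the accessed vector type for llvm.masked.load/store calls, or
// nullptr otherwise, without ever querying a pointer element type.
static Type *getMaskedAccessType(CallInst *CI) {
  Intrinsic::ID ID = CI->getIntrinsicID();
  if (ID != Intrinsic::masked_load && ID != Intrinsic::masked_store)
    return nullptr;
  bool IsWrite = ID == Intrinsic::masked_store;
  return IsWrite ? CI->getArgOperand(0)->getType()   // stored value's type
                 : CI->getType();                    // loaded result's type
}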
@@ -642,6 +654,10 @@ struct DFSanFunction { Align getShadowAlign(Align InstAlignment); + // If ClConditionalCallbacks is enabled, insert a callback after a given + // branch instruction using the given conditional expression. + void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition); + private: /// Collapses the shadow with aggregate type into a single primitive shadow /// value. @@ -748,6 +764,8 @@ public: void visitSelectInst(SelectInst &I); void visitMemSetInst(MemSetInst &I); void visitMemTransferInst(MemTransferInst &I); + void visitBranchInst(BranchInst &BR); + void visitSwitchInst(SwitchInst &SW); private: void visitCASOrRMW(Align InstAlignment, Instruction &I); @@ -971,6 +989,22 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow, return PrimitiveShadow; } +void DFSanFunction::addConditionalCallbacksIfEnabled(Instruction &I, + Value *Condition) { + if (!ClConditionalCallbacks) { + return; + } + IRBuilder<> IRB(&I); + Value *CondShadow = getShadow(Condition); + if (DFS.shouldTrackOrigins()) { + Value *CondOrigin = getOrigin(Condition); + IRB.CreateCall(DFS.DFSanConditionalCallbackOriginFn, + {CondShadow, CondOrigin}); + } else { + IRB.CreateCall(DFS.DFSanConditionalCallbackFn, {CondShadow}); + } +} + Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) { if (!OrigTy->isSized()) return PrimitiveShadowTy; @@ -1032,6 +1066,13 @@ bool DataFlowSanitizer::initializeModule(Module &M) { FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false); DFSanVarargWrapperFnTy = FunctionType::get( Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + DFSanConditionalCallbackFnTy = + FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy, + /*isVarArg=*/false); + Type *DFSanConditionalCallbackOriginArgs[2] = {PrimitiveShadowTy, OriginTy}; + DFSanConditionalCallbackOriginFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), DFSanConditionalCallbackOriginArgs, + /*isVarArg=*/false); DFSanCmpCallbackFnTy = FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy, /*isVarArg=*/false); @@ -1160,7 +1201,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, // F is called by a wrapped custom function with primitive shadows. So // its arguments and return value need conversion. 
DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true, - /*ForceZeroLabels=*/false); + /*IsForceZeroLabels=*/false); Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) { @@ -1271,6 +1312,10 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanRuntimeFunctions.insert( DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( + DFSanConditionalCallbackFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( + DFSanConditionalCallbackOriginFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( DFSanCmpCallbackFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanChainOriginFn.getCallee()->stripPointerCasts()); @@ -1292,6 +1337,12 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) { "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy); DFSanCmpCallbackFn = Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy); + + DFSanConditionalCallbackFn = Mod->getOrInsertFunction( + "__dfsan_conditional_callback", DFSanConditionalCallbackFnTy); + DFSanConditionalCallbackOriginFn = + Mod->getOrInsertFunction("__dfsan_conditional_callback_origin", + DFSanConditionalCallbackOriginFnTy); } void DataFlowSanitizer::injectMetadataGlobals(Module &M) { @@ -2593,6 +2644,8 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { Value *FalseOrigin = ShouldTrackOrigins ? DFSF.getOrigin(I.getFalseValue()) : nullptr; + DFSF.addConditionalCallbacksIfEnabled(I, I.getCondition()); + if (isa<VectorType>(I.getCondition()->getType())) { ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow, FalseShadow, &I); @@ -2683,6 +2736,17 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { } } +void DFSanVisitor::visitBranchInst(BranchInst &BR) { + if (!BR.isConditional()) + return; + + DFSF.addConditionalCallbacksIfEnabled(BR, BR.getCondition()); +} + +void DFSanVisitor::visitSwitchInst(SwitchInst &SW) { + DFSF.addConditionalCallbacksIfEnabled(SW, SW.getCondition()); +} + static bool isAMustTailRetVal(Value *RetVal) { // Tail call may have a bitcast between return. 
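To make the new hooks concrete, here is a hedged stand-in for the runtime side; the parameter lists are read off the FunctionTypes built above (the condition's shadow label, plus an origin in the origin-tracking variant), and in a real build these symbols come from the DFSan runtime, which forwards to a handler installed via the dfsan_set_conditional_callback() entry point mentioned in the option's comment.

#include <cstdio>
#include <sanitizer/dfsan_interface.h>   // dfsan_label / dfsan_origin typedefs

extern "C" void __dfsan_conditional_callback(dfsan_label label) {
  if (label)   // non-zero shadow: the branch/switch/select condition is tainted
    std::fprintf(stderr, "tainted conditional: label=%u\n", (unsigned)label);
}

extern "C" void __dfsan_conditional_callback_origin(dfsan_label label,
                                                    dfsan_origin origin) {
  if (label)
    std::fprintf(stderr, "tainted conditional: label=%u origin=%u\n",
                 (unsigned)label, (unsigned)origin);
}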
if (auto *I = dyn_cast<BitCastInst>(RetVal)) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 8d3bc1383e96..fb10a99d1338 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1403,16 +1403,16 @@ bool HWAddressSanitizer::instrumentStack( size_t Size = getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + auto TagEnd = [&](Instruction *Node) { + IRB.SetInsertPoint(Node); + Value *UARTag = getUARTag(IRB, StackTag); + tagAlloca(IRB, AI, UARTag, AlignedSize); + }; bool StandardLifetime = UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT()); if (DetectUseAfterScope && StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; IRB.SetInsertPoint(Start->getNextNode()); - auto TagEnd = [&](Instruction *Node) { - IRB.SetInsertPoint(Node); - Value *UARTag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, UARTag, AlignedSize); - }; tagAlloca(IRB, AI, Tag, Size); if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd, RetVec, TagEnd)) { @@ -1421,11 +1421,8 @@ bool HWAddressSanitizer::instrumentStack( } } else { tagAlloca(IRB, AI, Tag, Size); - for (auto *RI : RetVec) { - IRB.SetInsertPoint(RI); - Value *UARTag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, UARTag, AlignedSize); - } + for (auto *RI : RetVec) + TagEnd(RI); if (!StandardLifetime) { for (auto &II : Info.LifetimeStart) II->eraseFromParent(); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index de34348606ef..ab179b03dd29 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -248,8 +248,7 @@ public: PGOCounterPromoter( DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands, Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI) - : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop), - LI(LI), BFI(BFI) { + : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI) { // Skip collection of ExitBlocks and InsertPts for loops that will not be // able to have counters promoted. 
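For context on the hoisted TagEnd lambda above, a minimal source-level sketch of the use-after-return bug that re-tagging every alloca on each exit path is designed to catch (deliberately buggy example, not from the patch):

// HWASan gives &local a per-frame tag; TagEnd() re-tags the slot with a UAR
// tag on every return path, so the later dereference sees a tag mismatch.
int *escape() {
  int local = 42;
  return &local;        // dangling pointer; slot is re-tagged at this exit
}

int use_after_return() {
  return *escape();     // tag mismatch -> HWASan use-after-return report
}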
@@ -446,24 +445,19 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options, return new InstrProfilingLegacyPass(Options, IsCS); } -static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) { - InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr); - if (Inc) - return Inc; - return dyn_cast<InstrProfIncrementInst>(Instr); -} - bool InstrProfiling::lowerIntrinsics(Function *F) { bool MadeChange = false; PromotionCandidates.clear(); for (BasicBlock &BB : *F) { for (Instruction &Instr : llvm::make_early_inc_range(BB)) { - InstrProfIncrementInst *Inc = castToIncrementInst(&Instr); - if (Inc) { - lowerIncrement(Inc); + if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) { + lowerIncrement(IPIS); + MadeChange = true; + } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) { + lowerIncrement(IPI); MadeChange = true; - } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(&Instr)) { - lowerValueProfileInst(Ind); + } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) { + lowerValueProfileInst(IPVP); MadeChange = true; } } @@ -540,19 +534,14 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) { /// Check if the module contains uses of any profiling intrinsics. static bool containsProfilingIntrinsics(Module &M) { - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_increment))) - if (!F->use_empty()) - return true; - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step))) - if (!F->use_empty()) - return true; - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile))) - if (!F->use_empty()) - return true; - return false; + auto containsIntrinsic = [&](int ID) { + if (auto *F = M.getFunction(Intrinsic::getName(ID))) + return !F->use_empty(); + return false; + }; + return containsIntrinsic(llvm::Intrinsic::instrprof_increment) || + containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) || + containsIntrinsic(llvm::Intrinsic::instrprof_value_profile); } bool InstrProfiling::run( @@ -771,7 +760,7 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { } /// Get the name of a profiling variable for a particular function. -static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix, +static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix, bool &Renamed) { StringRef NamePrefix = getInstrProfNameVarPrefix(); StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); @@ -860,7 +849,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { } GlobalVariable * -InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { +InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { GlobalVariable *NamePtr = Inc->getName(); auto &PD = ProfileDataMap[NamePtr]; if (PD.RegionCounters) @@ -997,8 +986,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } - if (DebugInfoCorrelate) + if (DebugInfoCorrelate) { + // Mark the counter variable as used so that it isn't optimized out. + CompilerUsedVars.push_back(PD.RegionCounters); return PD.RegionCounters; + } // Create data variable. 
auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 727672fa0605..8fedefccf0e1 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -156,6 +156,7 @@ struct InterestingMemoryAccess { Value *Addr = nullptr; bool IsWrite; unsigned Alignment; + Type *AccessTy; uint64_t TypeSize; Value *MaybeMask = nullptr; }; @@ -181,7 +182,7 @@ public: Value *Addr, uint32_t TypeSize, bool IsWrite); void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, Instruction *I, Value *Addr, - unsigned Alignment, uint32_t TypeSize, + unsigned Alignment, Type *AccessTy, bool IsWrite); void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); @@ -334,36 +335,32 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { InterestingMemoryAccess Access; - const DataLayout &DL = I->getModule()->getDataLayout(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (!ClInstrumentReads) return None; Access.IsWrite = false; - Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + Access.AccessTy = LI->getType(); Access.Alignment = LI->getAlignment(); Access.Addr = LI->getPointerOperand(); } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (!ClInstrumentWrites) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + Access.AccessTy = SI->getValueOperand()->getType(); Access.Alignment = SI->getAlignment(); Access.Addr = SI->getPointerOperand(); } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + Access.AccessTy = RMW->getValOperand()->getType(); Access.Alignment = 0; Access.Addr = RMW->getPointerOperand(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + Access.AccessTy = XCHG->getCompareOperand()->getType(); Access.Alignment = 0; Access.Addr = XCHG->getPointerOperand(); } else if (auto *CI = dyn_cast<CallInst>(I)) { @@ -376,16 +373,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { return None; // Masked store has an initial operand for the value. 
OpOffset = 1; + Access.AccessTy = CI->getArgOperand(0)->getType(); Access.IsWrite = true; } else { if (!ClInstrumentReads) return None; + Access.AccessTy = CI->getType(); Access.IsWrite = false; } auto *BasePtr = CI->getOperand(0 + OpOffset); - auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType(); - Access.TypeSize = DL.getTypeStoreSizeInBits(Ty); if (auto *AlignmentConstant = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) Access.Alignment = (unsigned)AlignmentConstant->getZExtValue(); @@ -412,15 +409,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { if (Access.Addr->isSwiftError()) return None; + const DataLayout &DL = I->getModule()->getDataLayout(); + Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy); return Access; } void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, Instruction *I, Value *Addr, unsigned Alignment, - uint32_t TypeSize, bool IsWrite) { - auto *VTy = cast<FixedVectorType>( - cast<PointerType>(Addr->getType())->getElementType()); + Type *AccessTy, bool IsWrite) { + auto *VTy = cast<FixedVectorType>(AccessTy); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); unsigned Num = VTy->getNumElements(); auto *Zero = ConstantInt::get(IntptrTy, 0); @@ -469,7 +467,7 @@ void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL, if (Access.MaybeMask) { instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr, - Access.Alignment, Access.TypeSize, + Access.Alignment, Access.AccessTy, Access.IsWrite); } else { // Since the access counts will be accumulated across the entire allocation, diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 446e601cd4d7..cfe993dedbc2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -492,7 +492,7 @@ class MemorySanitizer { public: MemorySanitizer(Module &M, MemorySanitizerOptions Options) : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), - Recover(Options.Recover) { + Recover(Options.Recover), EagerChecks(Options.EagerChecks) { initializeModule(M); } @@ -522,6 +522,7 @@ private: /// Track origins (allocation points) of uninitialized values. int TrackOrigins; bool Recover; + bool EagerChecks; LLVMContext *C; Type *IntptrTy; @@ -665,10 +666,12 @@ template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) { } // end anonymous namespace -MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K) +MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K, + bool EagerChecks) : Kernel(getOptOrDefault(ClEnableKmsan, K)), TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)), - Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {} + Recover(getOptOrDefault(ClKeepGoing, Kernel || R)), + EagerChecks(getOptOrDefault(ClEagerChecks, EagerChecks)) {} PreservedAnalyses MemorySanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { @@ -695,6 +698,8 @@ void MemorySanitizerPass::printPipeline( OS << "recover;"; if (Options.Kernel) OS << "kernel;"; + if (Options.EagerChecks) + OS << "eager-checks;"; OS << "track-origins=" << Options.TrackOrigins; OS << ">"; } @@ -1667,9 +1672,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// This function either returns the value set earlier with setShadow, /// or extracts if from ParamTLS (for function arguments). 
Value *getShadow(Value *V) { - if (!PropagateShadow) return getCleanShadow(V); if (Instruction *I = dyn_cast<Instruction>(V)) { - if (I->getMetadata("nosanitize")) + if (!PropagateShadow || I->getMetadata("nosanitize")) return getCleanShadow(V); // For instructions the shadow is already stored in the map. Value *Shadow = ShadowMap[V]; @@ -1681,7 +1685,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return Shadow; } if (UndefValue *U = dyn_cast<UndefValue>(V)) { - Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); + Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V) + : getCleanShadow(V); LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); (void)U; return AllOnes; @@ -1701,22 +1706,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { continue; } - bool FArgByVal = FArg.hasByValAttr(); - bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef); - bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef; - unsigned Size = - FArg.hasByValAttr() - ? DL.getTypeAllocSize(FArg.getParamByValType()) - : DL.getTypeAllocSize(FArg.getType()); + unsigned Size = FArg.hasByValAttr() + ? DL.getTypeAllocSize(FArg.getParamByValType()) + : DL.getTypeAllocSize(FArg.getType()); if (A == &FArg) { bool Overflow = ArgOffset + Size > kParamTLSSize; - if (FArgEagerCheck) { - *ShadowPtr = getCleanShadow(V); - setOrigin(A, getCleanOrigin()); - break; - } else if (FArgByVal) { - Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. @@ -1727,40 +1723,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /*isStore*/ true) .first; // TODO(glider): need to copy origins. - if (Overflow) { + if (!PropagateShadow || Overflow) { // ParamTLS overflow. EntryIRB.CreateMemSet( CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); } else { + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, CopyAlign, Size); LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); (void)Cpy; } + } + + if (!PropagateShadow || Overflow || FArg.hasByValAttr() || + (MS.EagerChecks && FArg.hasAttribute(Attribute::NoUndef))) { *ShadowPtr = getCleanShadow(V); + setOrigin(A, getCleanOrigin()); } else { // Shadow over TLS Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); - if (Overflow) { - // ParamTLS overflow. 
- *ShadowPtr = getCleanShadow(V); - } else { - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, - kShadowTLSAlignment); + *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, + kShadowTLSAlignment); + if (MS.TrackOrigins) { + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); + setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr)); } } LLVM_DEBUG(dbgs() << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); - if (MS.TrackOrigins && !Overflow) { - Value *OriginPtr = - getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); - setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr)); - } else { - setOrigin(A, getCleanOrigin()); - } - break; } @@ -3664,7 +3658,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // will become a non-readonly function after it is instrumented by us. To // prevent this code from being optimized out, mark that function // non-readonly in advance. - AttrBuilder B; + AttributeMask B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::WriteOnly) @@ -3679,7 +3673,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI); } IRBuilder<> IRB(&CB); - bool MayCheckCall = ClEagerChecks; + bool MayCheckCall = MS.EagerChecks; if (Function *Func = CB.getCalledFunction()) { // __sanitizer_unaligned_{load,store} functions may be called by users // and always expects shadows in the TLS. So don't check them. @@ -3697,15 +3691,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { continue; } unsigned Size = 0; - Value *Store = nullptr; - // Compute the Shadow for arg even if it is ByVal, because - // in that case getShadow() will copy the actual arg shadow to - // __msan_param_tls. - Value *ArgShadow = getShadow(A); - Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); - LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A - << " Shadow: " << *ArgShadow << "\n"); - bool ArgIsInitialized = false; const DataLayout &DL = F.getParent()->getDataLayout(); bool ByVal = CB.paramHasAttr(i, Attribute::ByVal); @@ -3716,6 +3701,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(A, &CB); Size = DL.getTypeAllocSize(A->getType()); } else { + bool ArgIsInitialized = false; + Value *Store = nullptr; + // Compute the Shadow for arg even if it is ByVal, because + // in that case getShadow() will copy the actual arg shadow to + // __msan_param_tls. + Value *ArgShadow = getShadow(A); + Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); + LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A + << " Shadow: " << *ArgShadow << "\n"); if (ByVal) { // ByVal requires some special handling as it's too big for a single // load @@ -3732,10 +3726,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ false) .first; - - Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr, - Alignment, Size); - // TODO(glider): need to copy origins. 
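As a source-level illustration of the eager-check path being restructured here (now driven by the EagerChecks member of MemorySanitizerOptions rather than only the cl::opt): when an argument carries noundef and eager checks are on, the callee installs a clean shadow for it and the caller is checked at the call site instead. The attribute placement below assumes a frontend that emits noundef for such parameters.

// Deliberately buggy example: with eager checks enabled, MSan reports the use
// of the uninitialized value at the call in caller(), not somewhere inside
// callee(), because x's shadow is never propagated into the callee.
__attribute__((noinline)) int callee(int x) {   // x assumed to carry noundef
  return x + 1;
}

int caller() {
  int u;              // uninitialized
  return callee(u);   // eager shadow check fires here
}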
+ if (!PropagateShadow) { + Store = IRB.CreateMemSet(ArgShadowBase, + Constant::getNullValue(IRB.getInt8Ty()), + Size, Alignment); + } else { + Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr, + Alignment, Size); + } } else { // Any other parameters mean we need bit-grained tracking of uninit // data @@ -3832,10 +3830,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); bool HasNoUndef = F.hasRetAttribute(Attribute::NoUndef); - bool StoreShadow = !(ClEagerChecks && HasNoUndef); + bool StoreShadow = !(MS.EagerChecks && HasNoUndef); // FIXME: Consider using SpecialCaseList to specify a list of functions that // must always return fully initialized values. For now, we hardcode "main". - bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main"); + bool EagerCheck = (MS.EagerChecks && HasNoUndef) || (F.getName() == "main"); Value *Shadow = getShadow(RetVal); bool StoreOrigin = true; @@ -5359,7 +5357,7 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { MemorySanitizerVisitor Visitor(F, *this, TLI); // Clear out readonly/readnone attributes. - AttrBuilder B; + AttributeMask B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::WriteOnly) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b6ba1fc2132c..c46415e5b1f4 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -877,7 +877,10 @@ populateEHOperandBundle(VPCandidateInfo &Cand, DenseMap<BasicBlock *, ColorVector> &BlockColors, SmallVectorImpl<OperandBundleDef> &OpBundles) { auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst); - if (OrigCall && !isa<IntrinsicInst>(OrigCall)) { + if (!OrigCall) + return; + + if (!isa<IntrinsicInst>(OrigCall)) { // The instrumentation call should belong to the same funclet as a // non-intrinsic call, so just copy the operand bundle, if any exists. Optional<OperandBundleUse> ParentFunclet = diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index da8ee1f15bf8..d3b60c7add34 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -917,8 +917,7 @@ void ModuleSanitizerCoverage::InjectTraceForGep( void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( Function &, ArrayRef<LoadInst *> Loads, ArrayRef<StoreInst *> Stores) { - auto CallbackIdx = [&](const Value *Ptr) -> int { - auto ElementTy = cast<PointerType>(Ptr->getType())->getElementType(); + auto CallbackIdx = [&](Type *ElementTy) -> int { uint64_t TypeSize = DL->getTypeStoreSizeInBits(ElementTy); return TypeSize == 8 ? 0 : TypeSize == 16 ? 
1 @@ -932,7 +931,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( for (auto LI : Loads) { IRBuilder<> IRB(LI); auto Ptr = LI->getPointerOperand(); - int Idx = CallbackIdx(Ptr); + int Idx = CallbackIdx(LI->getType()); if (Idx < 0) continue; IRB.CreateCall(SanCovLoadFunction[Idx], @@ -941,7 +940,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( for (auto SI : Stores) { IRBuilder<> IRB(SI); auto Ptr = SI->getPointerOperand(); - int Idx = CallbackIdx(Ptr); + int Idx = CallbackIdx(SI->getValueOperand()->getType()); if (Idx < 0) continue; IRB.CreateCall(SanCovStoreFunction[Idx], diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 764dc5f92707..c11691c613ac 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -42,7 +42,7 @@ enum class ARCRuntimeEntryPointKind { Autorelease, StoreStrong, RetainRV, - ClaimRV, + UnsafeClaimRV, RetainAutorelease, RetainAutoreleaseRV, }; @@ -62,7 +62,7 @@ public: Autorelease = nullptr; StoreStrong = nullptr; RetainRV = nullptr; - ClaimRV = nullptr; + UnsafeClaimRV = nullptr; RetainAutorelease = nullptr; RetainAutoreleaseRV = nullptr; } @@ -87,9 +87,9 @@ public: case ARCRuntimeEntryPointKind::RetainRV: return getIntrinsicEntryPoint(RetainRV, Intrinsic::objc_retainAutoreleasedReturnValue); - case ARCRuntimeEntryPointKind::ClaimRV: + case ARCRuntimeEntryPointKind::UnsafeClaimRV: return getIntrinsicEntryPoint( - ClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); + UnsafeClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); case ARCRuntimeEntryPointKind::RetainAutorelease: return getIntrinsicEntryPoint(RetainAutorelease, Intrinsic::objc_retainAutorelease); @@ -127,7 +127,7 @@ private: Function *RetainRV = nullptr; /// Declaration for objc_unsafeClaimAutoreleasedReturnValue(). - Function *ClaimRV = nullptr; + Function *UnsafeClaimRV = nullptr; /// Declaration for objc_retainAutorelease(). Function *RetainAutorelease = nullptr; diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4921209f041b..de0f5803b4c7 100644 --- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -194,9 +194,6 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, return CanInterruptRV(Class); } } - - case RetainRVDep: - return CanInterruptRV(GetBasicARCInstKind(Inst)); } llvm_unreachable("Invalid dependence flavor"); diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h index cf4c05ebe91c..dd6a1c3f9795 100644 --- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h +++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -46,8 +46,7 @@ enum DependenceKind { AutoreleasePoolBoundary, CanChangeRetainCount, RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. - RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. - RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. + RetainAutoreleaseRVDep ///< Blocks objc_retainAutoreleaseReturnValue. }; /// Find dependent instructions. 
If there is exactly one dependent instruction, diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index c2ed94e8e1f6..9e2832827686 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -433,7 +433,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( // If we succeed in our optimization, fall through. LLVM_FALLTHROUGH; case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: { + case ARCInstKind::UnsafeClaimRV: { bool IsInstContainedInBundle = BundledInsts->contains(Inst); // Return now if the target doesn't need a special inline-asm marker. Return diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 0fa4904456cd..b6dc97f1e43f 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -515,7 +515,7 @@ class ObjCARCOpt { Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, Instruction *Inst, ARCInstKind Class, const Value *Arg); - /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the + /// Try to optimize an AutoreleaseRV with a RetainRV or UnsafeClaimRV. If the /// optimization occurs, returns true to indicate that the caller should /// assume the instructions are dead. bool OptimizeInlinedAutoreleaseRVCall( @@ -705,14 +705,14 @@ bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall( return true; } - // ClaimRV is a frontend peephole for RetainRV + Release. Since the - // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release. - assert(Class == ARCInstKind::ClaimRV); + // UnsafeClaimRV is a frontend peephole for RetainRV + Release. Since the + // AutoreleaseRV and RetainRV cancel out, replace UnsafeClaimRV with Release. + assert(Class == ARCInstKind::UnsafeClaimRV); Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0); CallInst *Release = CallInst::Create( EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst); - assert(IsAlwaysTail(ARCInstKind::ClaimRV) && - "Expected ClaimRV to be safe to tail call"); + assert(IsAlwaysTail(ARCInstKind::UnsafeClaimRV) && + "Expected UnsafeClaimRV to be safe to tail call"); Release->setTailCall(); Inst->replaceAllUsesWith(CallArg); EraseInstruction(Inst); @@ -810,7 +810,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { BlockColors = colorEHFunclets(F); // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired - // with RetainRV and ClaimRV. + // with RetainRV and UnsafeClaimRV. Instruction *DelayedAutoreleaseRV = nullptr; const Value *DelayedAutoreleaseRVArg = nullptr; auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) { @@ -837,7 +837,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { return false; // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and - // ClaimRV, it's probably safe to skip over even opaque function calls + // UnsafeClaimRV, it's probably safe to skip over even opaque function calls // here since OptimizeInlinedAutoreleaseRVCall will confirm that they // have the same RCIdentityRoot. However, what really matters is // skipping instructions or intrinsics that the inliner could leave behind; @@ -881,7 +881,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { setDelayedAutoreleaseRV(Inst); continue; case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: if (DelayedAutoreleaseRV) { // We have a potential RV pair. Check if they cancel out. 
if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class, @@ -979,9 +979,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), CI); + new StoreInst(ConstantInt::getTrue(CI->getContext()), + UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI); Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( dbgs() << "A null pointer-to-weak-pointer is undefined behavior." @@ -999,9 +998,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( if (IsNullOrUndef(CI->getArgOperand(0)) || IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), CI); + new StoreInst(ConstantInt::getTrue(CI->getContext()), + UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI); Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( @@ -1165,7 +1163,7 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, PA); break; - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::RetainRV: case ARCInstKind::AutoreleaseRV: // Don't move these; the RV optimization depends on the autoreleaseRV diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 6d0a67c91cfa..1624cf26094a 100644 --- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -32,7 +32,6 @@ namespace llvm { class AAResults; -class DataLayout; class PHINode; class SelectInst; class Value; diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index b693acceb3f6..1cda206a7e14 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -579,6 +579,7 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { // Don't compute the post ordering unless we needed it. bool HavePostOrder = false; bool Changed = false; + SmallVector<DominatorTree::UpdateType, 10> DeletedEdges; for (auto *BB : BlocksWithDeadTerminators) { auto &Info = BlockInfo[BB]; @@ -617,7 +618,6 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { makeUnconditional(BB, PreferredSucc->BB); // Inform the dominators about the deleted CFG edges. - SmallVector<DominatorTree::UpdateType, 4> DeletedEdges; for (auto *Succ : RemovedSuccessors) { // It might have happened that the same successor appeared multiple times // and the CFG edge wasn't really removed. 
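A condensed sketch (not verbatim from the patch) of the batching the ADCE change around this point moves to: accumulate the deleted CFG edges across all dead regions and hand them to a single DomTreeUpdater call after the loop, instead of constructing an updater and applying updates once per region.

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Apply one accumulated batch of CFG edge deletions to both trees at once.
static void flushDeletedEdges(DominatorTree &DT, PostDominatorTree &PDT,
                              ArrayRef<DominatorTree::UpdateType> DeletedEdges) {
  if (!DeletedEdges.empty())
    DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
        .applyUpdates(DeletedEdges);
}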
@@ -629,13 +629,14 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { } } - DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager) - .applyUpdates(DeletedEdges); - NumBranchesRemoved += 1; Changed = true; } + if (!DeletedEdges.empty()) + DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager) + .applyUpdates(DeletedEdges); + return Changed; } diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 37a7053d778e..25e8c3ef3b48 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -414,6 +414,14 @@ void ConstantHoistingPass::collectConstantCandidates( IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast<GEPOperator>(ConstExpr); + + // TODO: If we have a mix of inbounds and non-inbounds GEPs, then basing a + // non-inbounds GEP on an inbounds GEP is potentially incorrect. Restrict to + // inbounds GEP for now -- alternatively, we could drop inbounds from the + // constant expression, + if (!GEPO->isInBounds()) + return; + if (!GEPO->accumulateConstantOffset(*DL, Offset)) return; @@ -470,7 +478,7 @@ void ConstantHoistingPass::collectConstantCandidates( // Visit constant expressions that have constant integers. if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { // Handle constant gep expressions. - if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing()) + if (ConstHoistGEP && isa<GEPOperator>(ConstExpr)) collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr); // Only visit constant cast expressions. @@ -810,7 +818,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, // Visit constant expression. if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { - if (ConstExpr->isGEPWithNoNotionalOverIndexing()) { + if (isa<GEPOperator>(ConstExpr)) { // Operand is a ConstantGEP, replace it. updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat); return; diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 7f2d5d7d9987..13963657d183 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -43,6 +43,51 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated", static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max(); +namespace { +struct ConstraintTy { + SmallVector<int64_t, 8> Coefficients; + + ConstraintTy(SmallVector<int64_t, 8> Coefficients) + : Coefficients(Coefficients) {} + + unsigned size() const { return Coefficients.size(); } +}; + +/// Struct to manage a list of constraints. +struct ConstraintListTy { + SmallVector<ConstraintTy, 4> Constraints; + + ConstraintListTy() {} + + ConstraintListTy(const SmallVector<ConstraintTy, 4> &Constraints) + : Constraints(Constraints) {} + + void mergeIn(const ConstraintListTy &Other) { + append_range(Constraints, Other.Constraints); + } + + unsigned size() const { return Constraints.size(); } + + unsigned empty() const { return Constraints.empty(); } + + /// Returns true if any constraint has a non-zero coefficient for any of the + /// newly added indices. Zero coefficients for new indices are removed. If it + /// returns true, no new variable need to be added to the system. 
+ bool needsNewIndices(const DenseMap<Value *, unsigned> &NewIndices) { + assert(size() == 1); + for (unsigned I = 0; I < NewIndices.size(); ++I) { + int64_t Last = get(0).Coefficients.pop_back_val(); + if (Last != 0) + return true; + } + return false; + } + + ConstraintTy &get(unsigned I) { return Constraints[I]; } +}; + +} // namespace + // Decomposes \p V into a vector of pairs of the form { c, X } where c * X. The // sum of the pairs equals \p V. The first pair is the constant-factor and X // must be nullptr. If the expression cannot be decomposed, returns an empty @@ -108,24 +153,15 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI)))) return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}}; if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) - return {{0, nullptr}, {1, Op0}, {1, Op1}}; + return {{0, nullptr}, {1, Op0}, {-1, Op1}}; return {{0, nullptr}, {1, V}}; } -struct ConstraintTy { - SmallVector<int64_t, 8> Coefficients; - - ConstraintTy(SmallVector<int64_t, 8> Coefficients) - : Coefficients(Coefficients) {} - - unsigned size() const { return Coefficients.size(); } -}; - /// Turn a condition \p CmpI into a vector of constraints, using indices from \p /// Value2Index. Additional indices for newly discovered values are added to \p /// NewIndices. -static SmallVector<ConstraintTy, 4> +static ConstraintListTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, const DenseMap<Value *, unsigned> &Value2Index, DenseMap<Value *, unsigned> &NewIndices) { @@ -151,11 +187,15 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, Value2Index, NewIndices); if (Pred == CmpInst::ICMP_EQ) { + if (match(Op1, m_Zero())) + return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, + NewIndices); + auto A = getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices); auto B = getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices); - append_range(A, B); + A.mergeIn(B); return A; } @@ -200,10 +240,10 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, R[GetOrAddIndex(KV.second)] -= KV.first; R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0); - return {R}; + return {{R}}; } -static SmallVector<ConstraintTy, 4> +static ConstraintListTy getConstraint(CmpInst *Cmp, const DenseMap<Value *, unsigned> &Value2Index, DenseMap<Value *, unsigned> &NewIndices) { return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), @@ -397,21 +437,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { if (R.size() != 1) continue; - // Check if all coefficients of new indices are 0 after building the - // constraint. Skip if any of the new indices has a non-null - // coefficient. 
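The decompose() change above ({1, Op1} becoming {-1, Op1} for the NUW subtraction) is a sign fix; a tiny standalone check of the corrected linear form (illustration only, independent of LLVM):

#include <cassert>

int main() {
  long A = 10, B = 3;
  long X = A - B;                     // models "X = sub nuw A, B" with A >= B
  // New decomposition {{0, nullptr}, {1, A}, {-1, B}}: X == 0 + 1*A + (-1)*B.
  assert(X == 0 + 1 * A + (-1) * B);
  // The old {+1, +1} pair encoded A + B (13 here), so any constraint derived
  // from X would have been built on the wrong expression.
  assert(X != 0 + 1 * A + 1 * B);
  return 0;
}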
- bool HasNewIndex = false; - for (unsigned I = 0; I < NewIndices.size(); ++I) { - int64_t Last = R[0].Coefficients.pop_back_val(); - if (Last != 0) { - HasNewIndex = true; - break; - } - } - if (HasNewIndex || R[0].size() == 1) + if (R.needsNewIndices(NewIndices)) continue; - if (CS.isConditionImplied(R[0].Coefficients)) { + if (CS.isConditionImplied(R.get(0).Coefficients)) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -432,7 +461,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { Changed = true; } if (CS.isConditionImplied( - ConstraintSystem::negate(R[0].Coefficients))) { + ConstraintSystem::negate(R.get(0).Coefficients))) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -479,7 +508,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); bool Added = false; - for (auto &C : R) { + for (auto &C : R.Constraints) { auto Coeffs = C.Coefficients; LLVM_DEBUG({ dbgs() << " constraint: "; diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index eadbb4293539..ae636e7b61f7 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -699,17 +699,14 @@ bool isNoopIntrinsic(Instruction *I) { } // Check if we can ignore \p D for DSE. -bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller, - const TargetLibraryInfo &TLI) { +bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { Instruction *DI = D->getMemoryInst(); // Calls that only access inaccessible memory cannot read or write any memory // locations we consider for elimination. if (auto *CB = dyn_cast<CallBase>(DI)) - if (CB->onlyAccessesInaccessibleMemory()) { - if (isAllocLikeFn(DI, &TLI)) - return false; + if (CB->onlyAccessesInaccessibleMemory()) return true; - } + // We can eliminate stores to locations not visible to the caller across // throwing instructions. if (DI->mayThrow() && !DefVisibleToCaller) @@ -759,10 +756,8 @@ struct DSEState { SmallVector<MemoryDef *, 64> MemDefs; // Any that should be skipped as they are already deleted SmallPtrSet<MemoryAccess *, 4> SkipStores; - // Keep track of all of the objects that are invisible to the caller before - // the function returns. - // SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet; - DenseMap<const Value *, bool> InvisibleToCallerBeforeRet; + // Keep track whether a given object is captured before return or not. + DenseMap<const Value *, bool> CapturedBeforeReturn; // Keep track of all of the objects that are invisible to the caller after // the function returns. DenseMap<const Value *, bool> InvisibleToCallerAfterRet; @@ -805,12 +800,8 @@ struct DSEState { // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) - if (AI.hasPassPointeeByValueCopyAttr()) { - // For byval, the caller doesn't know the address of the allocation. - if (AI.hasByValAttr()) - InvisibleToCallerBeforeRet.insert({&AI, true}); + if (AI.hasPassPointeeByValueCopyAttr()) InvisibleToCallerAfterRet.insert({&AI, true}); - } // Collect whether there is any irreducible control flow in the function. 
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); @@ -835,6 +826,20 @@ struct DSEState { if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc)) return OW_Unknown; + const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts(); + const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts(); + const Value *DeadUndObj = getUnderlyingObject(DeadPtr); + const Value *KillingUndObj = getUnderlyingObject(KillingPtr); + + // Check whether the killing store overwrites the whole object, in which + // case the size/offset of the dead store does not matter. + if (DeadUndObj == KillingUndObj && KillingLoc.Size.isPrecise()) { + uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); + if (KillingUndObjSize != MemoryLocation::UnknownSize && + KillingUndObjSize == KillingLoc.Size.getValue()) + return OW_Complete; + } + // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) { @@ -875,14 +880,6 @@ struct DSEState { return OW_Complete; } - // Check to see if the killing store is to the entire object (either a - // global, an alloca, or a byval/inalloca argument). If so, then it clearly - // overwrites any other store to the same object. - const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts(); - const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts(); - const Value *DeadUndObj = getUnderlyingObject(DeadPtr); - const Value *KillingUndObj = getUnderlyingObject(KillingPtr); - // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (DeadUndObj != KillingUndObj) { @@ -896,12 +893,6 @@ struct DSEState { return OW_Unknown; } - // If the KillingI store is to a recognizable object, get its size. - uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); - if (KillingUndObjSize != MemoryLocation::UnknownSize) - if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize) - return OW_Complete; - // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. @@ -957,31 +948,30 @@ struct DSEState { return true; auto I = InvisibleToCallerAfterRet.insert({V, false}); if (I.second) { - if (!isInvisibleToCallerBeforeRet(V)) { + if (!isInvisibleToCallerOnUnwind(V)) { I.first->second = false; - } else { - auto *Inst = dyn_cast<Instruction>(V); - if (Inst && isAllocLikeFn(Inst, &TLI)) - I.first->second = !PointerMayBeCaptured(V, true, false); + } else if (isNoAliasCall(V)) { + I.first->second = !PointerMayBeCaptured(V, true, false); } } return I.first->second; } - bool isInvisibleToCallerBeforeRet(const Value *V) { - if (isa<AllocaInst>(V)) + bool isInvisibleToCallerOnUnwind(const Value *V) { + bool RequiresNoCaptureBeforeUnwind; + if (!isNotVisibleOnUnwind(V, RequiresNoCaptureBeforeUnwind)) + return false; + if (!RequiresNoCaptureBeforeUnwind) return true; - auto I = InvisibleToCallerBeforeRet.insert({V, false}); - if (I.second) { - auto *Inst = dyn_cast<Instruction>(V); - if (Inst && isAllocLikeFn(Inst, &TLI)) - // NOTE: This could be made more precise by PointerMayBeCapturedBefore - // with the killing MemoryDef. But we refrain from doing so for now to - // limit compile-time and this does not cause any changes to the number - // of stores removed on a large test set in practice. 
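A source-level illustration of the reasoning behind the renamed isInvisibleToCallerOnUnwind()/CapturedBeforeReturn helpers in this hunk (the function names in the example are hypothetical): a store to a never-captured local stays removable even when a may-throw call sits between it and the killing store, because no unwinder can observe that local.

int mayThrowFn();        // assumed external call that may throw

int example() {
  int Local;             // alloca, never captured before the call
  Local = 1;             // dead store: invisible if mayThrowFn() unwinds
  int V = mayThrowFn();
  Local = V;             // killing store
  return Local;
}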
- I.first->second = !PointerMayBeCaptured(V, false, true); - } - return I.first->second; + + auto I = CapturedBeforeReturn.insert({V, true}); + if (I.second) + // NOTE: This could be made more precise by PointerMayBeCapturedBefore + // with the killing MemoryDef. But we refrain from doing so for now to + // limit compile-time and this does not cause any changes to the number + // of stores removed on a large test set in practice. + I.first->second = PointerMayBeCaptured(V, false, true); + return !I.first->second; } Optional<MemoryLocation> getLocForWrite(Instruction *I) const { @@ -1269,8 +1259,7 @@ struct DSEState { MemoryDef *CurrentDef = cast<MemoryDef>(Current); Instruction *CurrentI = CurrentDef->getMemoryInst(); - if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), - TLI)) { + if (canSkipDef(CurrentDef, !isInvisibleToCallerOnUnwind(KillingUndObj))) { CanOptimize = false; continue; } @@ -1442,7 +1431,7 @@ struct DSEState { continue; } - if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) { + if (UseInst->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) { LLVM_DEBUG(dbgs() << " ... found throwing instruction\n"); return None; } @@ -1623,7 +1612,7 @@ struct DSEState { // First see if we can ignore it by using the fact that KillingI is an // alloca/alloca like object that is not visible to the caller during // execution of the function. - if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj)) + if (KillingUndObj && isInvisibleToCallerOnUnwind(KillingUndObj)) return false; if (KillingI->getParent() == DeadI->getParent()) @@ -1639,7 +1628,7 @@ struct DSEState { bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) { // If DeadI may throw it acts as a barrier, unless we are to an // alloca/alloca like object that does not escape. - if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) + if (DeadI->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) return true; // If DeadI is an atomic load/store stronger than monotonic, do not try to @@ -1696,6 +1685,84 @@ struct DSEState { return MadeChange; } + /// If we have a zero initializing memset following a call to malloc, + /// try folding it into a call to calloc. + bool tryFoldIntoCalloc(MemoryDef *Def, const Value *DefUO) { + Instruction *DefI = Def->getMemoryInst(); + MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI); + if (!MemSet) + // TODO: Could handle zero store to small allocation as well. + return false; + Constant *StoredConstant = dyn_cast<Constant>(MemSet->getValue()); + if (!StoredConstant || !StoredConstant->isNullValue()) + return false; + + if (!isRemovable(DefI)) + // The memset might be volatile.. 
+ return false; + + if (F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.getName() == "calloc") + return false; + auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUO)); + if (!Malloc) + return false; + auto *InnerCallee = Malloc->getCalledFunction(); + if (!InnerCallee) + return false; + LibFunc Func; + if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || + Func != LibFunc_malloc) + return false; + + auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) { + // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end + // of malloc block + auto *MallocBB = Malloc->getParent(), + *MemsetBB = Memset->getParent(); + if (MallocBB == MemsetBB) + return true; + auto *Ptr = Memset->getArgOperand(0); + auto *TI = MallocBB->getTerminator(); + ICmpInst::Predicate Pred; + BasicBlock *TrueBB, *FalseBB; + if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, + FalseBB))) + return false; + if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) + return false; + return true; + }; + + if (Malloc->getOperand(0) != MemSet->getLength()) + return false; + if (!shouldCreateCalloc(Malloc, MemSet) || + !DT.dominates(Malloc, MemSet) || + !memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) + return false; + IRBuilder<> IRB(Malloc); + const auto &DL = Malloc->getModule()->getDataLayout(); + auto *Calloc = + emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1), + Malloc->getArgOperand(0), IRB, TLI); + if (!Calloc) + return false; + MemorySSAUpdater Updater(&MSSA); + auto *LastDef = + cast<MemoryDef>(Updater.getMemorySSA()->getMemoryAccess(Malloc)); + auto *NewAccess = + Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), LastDef, + LastDef); + auto *NewAccessMD = cast<MemoryDef>(NewAccess); + Updater.insertDef(NewAccessMD, /*RenameUses=*/true); + Updater.removeMemoryAccess(Malloc); + Malloc->replaceAllUsesWith(Calloc); + Malloc->eraseFromParent(); + return true; + } + /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { @@ -1713,81 +1780,15 @@ struct DSEState { if (!isRemovable(DefI)) return false; - if (StoredConstant && StoredConstant->isNullValue()) { - auto *DefUOInst = dyn_cast<Instruction>(DefUO); - if (DefUOInst) { - if (isCallocLikeFn(DefUOInst, &TLI)) { - auto *UnderlyingDef = - cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst)); - // If UnderlyingDef is the clobbering access of Def, no instructions - // between them can modify the memory location. 
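The tryFoldIntoCalloc helper factored out above recognizes a malloc whose whole allocation is immediately zero-filled, possibly behind a null check in the malloc block's terminator, and rewrites the pair into a single calloc so the memset becomes a removable dead store. A rough source-level picture of the fold, assuming nothing writes the buffer between the two calls (hypothetical example):

#include <stdlib.h>
#include <string.h>

char *zeroed_buffer_before(size_t n) {
  char *p = malloc(n);
  if (p != NULL)
    memset(p, 0, n);      // zero-fills the entire allocation
  return p;
}

char *zeroed_buffer_after(size_t n) {
  return calloc(1, n);    // what DSE forms; the memset is then deleted as dead
}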
- auto *ClobberDef = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def); - return UnderlyingDef == ClobberDef; - } - - if (MemSet) { - if (F.hasFnAttribute(Attribute::SanitizeMemory) || - F.hasFnAttribute(Attribute::SanitizeAddress) || - F.hasFnAttribute(Attribute::SanitizeHWAddress) || - F.getName() == "calloc") - return false; - auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst)); - if (!Malloc) - return false; - auto *InnerCallee = Malloc->getCalledFunction(); - if (!InnerCallee) - return false; - LibFunc Func; - if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || - Func != LibFunc_malloc) - return false; - - auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) { - // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end - // of malloc block - auto *MallocBB = Malloc->getParent(), - *MemsetBB = Memset->getParent(); - if (MallocBB == MemsetBB) - return true; - auto *Ptr = Memset->getArgOperand(0); - auto *TI = MallocBB->getTerminator(); - ICmpInst::Predicate Pred; - BasicBlock *TrueBB, *FalseBB; - if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, - FalseBB))) - return false; - if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) - return false; - return true; - }; - - if (Malloc->getOperand(0) == MemSet->getLength()) { - if (shouldCreateCalloc(Malloc, MemSet) && - DT.dominates(Malloc, MemSet) && - memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) { - IRBuilder<> IRB(Malloc); - const auto &DL = Malloc->getModule()->getDataLayout(); - if (auto *Calloc = - emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1), - Malloc->getArgOperand(0), IRB, TLI)) { - MemorySSAUpdater Updater(&MSSA); - auto *LastDef = cast<MemoryDef>( - Updater.getMemorySSA()->getMemoryAccess(Malloc)); - auto *NewAccess = Updater.createMemoryAccessAfter( - cast<Instruction>(Calloc), LastDef, LastDef); - auto *NewAccessMD = cast<MemoryDef>(NewAccess); - Updater.insertDef(NewAccessMD, /*RenameUses=*/true); - Updater.removeMemoryAccess(Malloc); - Malloc->replaceAllUsesWith(Calloc); - Malloc->eraseFromParent(); - return true; - } - return false; - } - } - } - } + if (StoredConstant && isAllocationFn(DefUO, &TLI)) { + auto *CB = cast<CallBase>(DefUO); + auto *InitC = getInitialValueOfAllocation(CB, &TLI, + StoredConstant->getType()); + // If the clobbering access is LiveOnEntry, no instructions between them + // can modify the memory location. + if (InitC && InitC == StoredConstant) + return MSSA.isLiveOnEntryDef( + MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def)); } if (!Store) @@ -2074,6 +2075,15 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MadeChange = true; continue; } + + // Can we form a calloc from a memset/malloc pair? + if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) { + LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" + << " DEAD: " << *KillingI << '\n'); + State.deleteDeadInstruction(KillingI); + MadeChange = true; + continue; + } } if (EnablePartialOverwriteTracking) diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index a24997dd3fd4..59b934c16c8a 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -827,10 +827,13 @@ private: const ParseMemoryInst &Later); Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { + // TODO: We could insert relevant casts on type mismatch here. 
if (auto *LI = dyn_cast<LoadInst>(Inst)) - return LI; - if (auto *SI = dyn_cast<StoreInst>(Inst)) - return SI->getValueOperand(); + return LI->getType() == ExpectedType ? LI : nullptr; + else if (auto *SI = dyn_cast<StoreInst>(Inst)) { + Value *V = SI->getValueOperand(); + return V->getType() == ExpectedType ? V : nullptr; + } assert(isa<IntrinsicInst>(Inst) && "Instruction not supported"); auto *II = cast<IntrinsicInst>(Inst); if (isHandledNonTargetIntrinsic(II->getIntrinsicID())) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 00506fb86006..398c93e8758c 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1104,20 +1104,19 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, } assert(DepInfo.isDef() && "follows from above"); - // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || - isAlignedAllocLikeFn(DepInst, TLI) || - // Loading immediately after lifetime begin -> undef. - isLifetimeStart(DepInst)) { + // Loading the alloca -> undef. + // Loading immediately after lifetime begin -> undef. + if (isa<AllocaInst>(DepInst) || isLifetimeStart(DepInst)) { Res = AvailableValue::get(UndefValue::get(Load->getType())); return true; } - // Loading from calloc (which zero initializes memory) -> zero - if (isCallocLikeFn(DepInst, TLI)) { - Res = AvailableValue::get(Constant::getNullValue(Load->getType())); - return true; - } + if (isAllocationFn(DepInst, TLI)) + if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), + TLI, Load->getType())) { + Res = AvailableValue::get(InitVal); + return true; + } if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of @@ -1769,7 +1768,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { // Insert a new store to null instruction before the load to indicate that // this code is not reachable. FIXME: We could insert unreachable // instruction directly because we can modify the CFG. - auto *NewS = new StoreInst(UndefValue::get(Int8Ty), + auto *NewS = new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(Int8Ty->getPointerTo()), IntrinsicI); if (MSSAU) { @@ -2991,12 +2990,12 @@ void GVNPass::addDeadBlock(BasicBlock *BB) { } } - // Now undef the incoming values from the dead predecessors. + // Now poison the incoming values from the dead predecessors. 
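The AnalyzeLoadAvailability change in the GVN hunk above generalizes the old calloc-only special case: for any recognized allocation function, getInitialValueOfAllocation supplies the value a fresh load would observe, and the load is replaced with it. A small C sketch of the effect (hypothetical example):

#include <stdlib.h>

int first_element(void) {
  int *p = calloc(4, sizeof(int));
  if (!p)
    return -1;
  return p[0];   // foldable to 0: calloc zero-initializes the allocation
}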
for (BasicBlock *P : predecessors(B)) { if (!DeadBlocks.count(P)) continue; for (PHINode &Phi : B->phis()) { - Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); + Phi.setIncomingValueForBlock(P, PoisonValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); } diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7001d330fce0..ceb03eb17f6d 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -138,8 +138,6 @@ AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), namespace { -struct RewritePhi; - class IndVarSimplify { LoopInfo *LI; ScalarEvolution *SE; @@ -982,6 +980,7 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, assert(isLoopCounter(IndVar, L, SE)); const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); const SCEV *IVInit = AR->getStart(); + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter // finds a valid pointer IV. Sign extend ExitCount in order to materialize a @@ -1004,13 +1003,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, assert(SE->isLoopInvariant(IVOffset, L) && "Computed iteration count is not loop invariant!"); - // We could handle pointer IVs other than i8*, but we need to compensate for - // gep index scaling. - assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), - cast<PointerType>(IndVar->getType()) - ->getElementType())->isOne() && - "unit stride pointer IV must be i8*"); - const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset); BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI); @@ -1026,7 +1018,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, // IVInit integer and ExitCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // For unit stride, IVCount = Start + ExitCount with 2's complement // overflow. diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 883d4afff3bd..8f5933b7bd71 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -250,12 +250,6 @@ public: char InferAddressSpaces::ID = 0; -namespace llvm { - -void initializeInferAddressSpacesPass(PassRegistry &); - -} // end namespace llvm - INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index fe9a7211967c..a3efad104ca6 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -728,8 +728,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Handle some boolean conditions. 
if (I->getType()->getPrimitiveSizeInBits() == 1) { using namespace PatternMatch; - - assert(Preference == WantInteger && "One-bit non-integer type?"); + if (Preference != WantInteger) + return false; // X | true -> true // X & false -> false Value *Op0, *Op1; @@ -789,8 +789,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Try to simplify some other binary operator values. } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - assert(Preference != WantBlockAddress - && "A binary operator creating a block address?"); + if (Preference != WantInteger) + return false; if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, @@ -811,7 +811,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { - assert(Preference == WantInteger && "Compares only produce integers"); + if (Preference != WantInteger) + return false; Type *CmpType = Cmp->getType(); Value *CmpLHS = Cmp->getOperand(0); Value *CmpRHS = Cmp->getOperand(1); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bc792ca3d8da..7fb1a25bdf13 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1355,7 +1355,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo::TCC_Free) return false; // For a GEP, we cannot simply use getUserCost because currently it - // optimistically assume that a GEP will fold into addressing mode + // optimistically assumes that a GEP will fold into addressing mode // regardless of its users. const BasicBlock *BB = GEP->getParent(); for (const User *U : GEP->users()) { @@ -1923,26 +1923,15 @@ bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, L->getHeader()->getTerminator(), DT); } -/// Return true iff we can prove that a caller of this function can not inspect -/// the contents of the provided object in a well defined program. -bool isKnownNonEscaping(Value *Object, const Loop *L, - const TargetLibraryInfo *TLI, DominatorTree *DT) { - if (isa<AllocaInst>(Object)) - // Since the alloca goes out of scope, we know the caller can't retain a - // reference to it and be well defined. Thus, we don't need to check for - // capture. - return true; +/// Return true if we can prove that a caller cannot inspect the object if an +/// unwind occurs inside the loop. +bool isNotVisibleOnUnwindInLoop(const Value *Object, const Loop *L, + DominatorTree *DT) { + bool RequiresNoCaptureBeforeUnwind; + if (!isNotVisibleOnUnwind(Object, RequiresNoCaptureBeforeUnwind)) + return false; - // For all other objects we need to know that the caller can't possibly - // have gotten a reference to the object. There are two components of - // that: - // 1) Object can't be escaped by this function. This is what - // PointerMayBeCaptured checks. - // 2) Object can't have been captured at definition site. For this, we - // need to know the return value is noalias. At the moment, we use a - // weaker condition and handle only AllocLikeFunctions (which are - // known to be noalias). 
TODO - return isAllocLikeFn(Object, TLI) && + return !RequiresNoCaptureBeforeUnwind || isNotCapturedBeforeOrInLoop(Object, L, DT); } @@ -2030,7 +2019,7 @@ bool llvm::promoteLoopAccessesToScalars( // this by proving that the caller can't have a reference to the object // after return and thus can't possibly load from the object. Value *Object = getUnderlyingObject(SomePtr); - if (!isKnownNonEscaping(Object, CurLoop, TLI, DT)) + if (!isNotVisibleOnUnwindInLoop(Object, CurLoop, DT)) return false; // Subtlety: Alloca's aren't visible to callers, but *are* potentially // visible to other threads if captured and used during their lifetimes. @@ -2163,7 +2152,7 @@ bool llvm::promoteLoopAccessesToScalars( else { Value *Object = getUnderlyingObject(SomePtr); SafeToInsertStore = - (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && + (isNoAliasCall(Object) || isa<AllocaInst>(Object)) && isNotCapturedBeforeOrInLoop(Object, CurLoop, DT); } } diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 5814e2f043d5..361d6c0d9381 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -407,25 +407,19 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (!L->getLoopLatch()) return LoopDeletionResult::Unmodified; - auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); - if (BTC->isZero()) { - // SCEV knows this backedge isn't taken! - breakLoopBackedge(L, DT, SE, LI, MSSA); - ++NumBackedgesBroken; - return LoopDeletionResult::Deleted; - } - - // If SCEV leaves open the possibility of a zero trip count, see if - // symbolically evaluating the first iteration lets us prove the backedge - // unreachable. - if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC)) - if (canProveExitOnFirstIteration(L, DT, LI)) { - breakLoopBackedge(L, DT, SE, LI, MSSA); - ++NumBackedgesBroken; - return LoopDeletionResult::Deleted; + auto *BTCMax = SE.getConstantMaxBackedgeTakenCount(L); + if (!BTCMax->isZero()) { + auto *BTC = SE.getBackedgeTakenCount(L); + if (!BTC->isZero()) { + if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC)) + return LoopDeletionResult::Unmodified; + if (!canProveExitOnFirstIteration(L, DT, LI)) + return LoopDeletionResult::Unmodified; } - - return LoopDeletionResult::Unmodified; + } + ++NumBackedgesBroken; + breakLoopBackedge(L, DT, SE, LI, MSSA); + return LoopDeletionResult::Deleted; } /// Remove a loop if it is dead. diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 965d1575518e..c46db4e63bfe 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -10,10 +10,13 @@ // // The intention is to optimise loop nests like this, which together access an // array linearly: +// // for (int i = 0; i < N; ++i) // for (int j = 0; j < M; ++j) // f(A[i*M+j]); +// // into one loop: +// // for (int i = 0; i < (N*M); ++i) // f(A[i]); // @@ -22,7 +25,27 @@ // expression like i*M+j. If they had any other uses, we would have to insert a // div/mod to reconstruct the original values, so this wouldn't be profitable. // -// We also need to prove that N*M will not overflow. +// We also need to prove that N*M will not overflow. The preferred solution is +// to widen the IV, which avoids overflow checks, so that is tried first. If +// the IV cannot be widened, then we try to determine that this new tripcount +// expression won't overflow. +// +// Q: Does LoopFlatten use SCEV? 
+// Short answer: Yes and no. +// +// Long answer: +// For this transformation to be valid, we require all uses of the induction +// variables to be linear expressions of the form i*M+j. The different Loop +// APIs are used to get some loop components like the induction variable, +// compare statement, etc. In addition, we do some pattern matching to find the +// linear expressions and other loop components like the loop increment. The +// latter are examples of expressions that do use the induction variable, but +// are safe to ignore when we check all uses to be of the form i*M+j. We keep +// track of all of this in bookkeeping struct FlattenInfo. +// We assume the loops to be canonical, i.e. starting at 0 and increment with +// 1. This makes RHS of the compare the loop tripcount (with the right +// predicate). We use SCEV to then sanity check that this tripcount matches +// with the tripcount as computed by SCEV. // //===----------------------------------------------------------------------===// @@ -31,6 +54,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -70,37 +94,54 @@ static cl::opt<bool> "trip counts will never overflow")); static cl::opt<bool> - WidenIV("loop-flatten-widen-iv", cl::Hidden, - cl::init(true), + WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true), cl::desc("Widen the loop induction variables, if possible, so " "overflow checks won't reject flattening")); +// We require all uses of both induction variables to match this pattern: +// +// (OuterPHI * InnerTripCount) + InnerPHI +// +// I.e., it needs to be a linear expression of the induction variables and the +// inner loop trip count. We keep track of all different expressions on which +// checks will be performed in this bookkeeping struct. +// struct FlattenInfo { - Loop *OuterLoop = nullptr; + Loop *OuterLoop = nullptr; // The loop pair to be flattened. Loop *InnerLoop = nullptr; - // These PHINodes correspond to loop induction variables, which are expected - // to start at zero and increment by one on each loop. - PHINode *InnerInductionPHI = nullptr; - PHINode *OuterInductionPHI = nullptr; - Value *InnerTripCount = nullptr; - Value *OuterTripCount = nullptr; - BinaryOperator *InnerIncrement = nullptr; - BinaryOperator *OuterIncrement = nullptr; - BranchInst *InnerBranch = nullptr; - BranchInst *OuterBranch = nullptr; - SmallPtrSet<Value *, 4> LinearIVUses; + + PHINode *InnerInductionPHI = nullptr; // These PHINodes correspond to loop + PHINode *OuterInductionPHI = nullptr; // induction variables, which are + // expected to start at zero and + // increment by one on each loop. + + Value *InnerTripCount = nullptr; // The product of these two tripcounts + Value *OuterTripCount = nullptr; // will be the new flattened loop + // tripcount. Also used to recognise a + // linear expression that will be replaced. + + SmallPtrSet<Value *, 4> LinearIVUses; // Contains the linear expressions + // of the form i*M+j that will be + // replaced. + + BinaryOperator *InnerIncrement = nullptr; // Uses of induction variables in + BinaryOperator *OuterIncrement = nullptr; // loop control statements that + BranchInst *InnerBranch = nullptr; // are safe to ignore. + + BranchInst *OuterBranch = nullptr; // The instruction that needs to be + // updated with new tripcount. 
+ SmallPtrSet<PHINode *, 4> InnerPHIsToTransform; - // Whether this holds the flatten info before or after widening. - bool Widened = false; + bool Widened = false; // Whether this holds the flatten info before or after + // widening. - // Holds the old/narrow induction phis, i.e. the Phis before IV widening has - // been applied. This bookkeeping is used so we can skip some checks on these - // phi nodes. - PHINode *NarrowInnerInductionPHI = nullptr; - PHINode *NarrowOuterInductionPHI = nullptr; + PHINode *NarrowInnerInductionPHI = nullptr; // Holds the old/narrow induction + PHINode *NarrowOuterInductionPHI = nullptr; // phis, i.e. the Phis before IV + // has been apllied. Used to skip + // checks on phi nodes. - FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {}; + FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){}; bool isNarrowInductionPhi(PHINode *Phi) { // This can't be the narrow phi if we haven't widened the IV first. @@ -108,6 +149,118 @@ struct FlattenInfo { return false; return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi; } + bool isInnerLoopIncrement(User *U) { + return InnerIncrement == U; + } + bool isOuterLoopIncrement(User *U) { + return OuterIncrement == U; + } + bool isInnerLoopTest(User *U) { + return InnerBranch->getCondition() == U; + } + + bool checkOuterInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) { + for (User *U : OuterInductionPHI->users()) { + if (isOuterLoopIncrement(U)) + continue; + + auto IsValidOuterPHIUses = [&] (User *U) -> bool { + LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump()); + if (!ValidOuterPHIUses.count(U)) { + LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Use is optimisable\n"); + return true; + }; + + if (auto *V = dyn_cast<TruncInst>(U)) { + for (auto *K : V->users()) { + if (!IsValidOuterPHIUses(K)) + return false; + } + continue; + } + + if (!IsValidOuterPHIUses(U)) + return false; + } + return true; + } + + bool matchLinearIVUser(User *U, Value *InnerTripCount, + SmallPtrSet<Value *, 4> &ValidOuterPHIUses) { + LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump()); + Value *MatchedMul = nullptr; + Value *MatchedItCount = nullptr; + + bool IsAdd = match(U, m_c_Add(m_Specific(InnerInductionPHI), + m_Value(MatchedMul))) && + match(MatchedMul, m_c_Mul(m_Specific(OuterInductionPHI), + m_Value(MatchedItCount))); + + // Matches the same pattern as above, except it also looks for truncs + // on the phi, which can be the result of widening the induction variables. + bool IsAddTrunc = + match(U, m_c_Add(m_Trunc(m_Specific(InnerInductionPHI)), + m_Value(MatchedMul))) && + match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(OuterInductionPHI)), + m_Value(MatchedItCount))); + + if (!MatchedItCount) + return false; + + // Look through extends if the IV has been widened. + if (Widened && + (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) { + assert(MatchedItCount->getType() == InnerInductionPHI->getType() && + "Unexpected type mismatch in types after widening"); + MatchedItCount = isa<SExtInst>(MatchedItCount) + ? 
dyn_cast<SExtInst>(MatchedItCount)->getOperand(0) + : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0); + } + + if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) { + LLVM_DEBUG(dbgs() << "Use is optimisable\n"); + ValidOuterPHIUses.insert(MatchedMul); + LinearIVUses.insert(U); + return true; + } + + LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); + return false; + } + + bool checkInnerInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) { + Value *SExtInnerTripCount = InnerTripCount; + if (Widened && + (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount))) + SExtInnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0); + + for (User *U : InnerInductionPHI->users()) { + if (isInnerLoopIncrement(U)) + continue; + + // After widening the IVs, a trunc instruction might have been introduced, + // so look through truncs. + if (isa<TruncInst>(U)) { + if (!U->hasOneUse()) + return false; + U = *U->user_begin(); + } + + // If the use is in the compare (which is also the condition of the inner + // branch) then the compare has been altered by another transformation e.g + // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is + // a constant. Ignore this use as the compare gets removed later anyway. + if (isInnerLoopTest(U)) + continue; + + if (!matchLinearIVUser(U, SExtInnerTripCount, ValidOuterPHIUses)) + return false; + } + return true; + } }; static bool @@ -121,6 +274,77 @@ setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment, return true; } +// Given the RHS of the loop latch compare instruction, verify with SCEV +// that this is indeed the loop tripcount. +// TODO: This used to be a straightforward check but has grown to be quite +// complicated now. It is therefore worth revisiting what the additional +// benefits are of this (compared to relying on canonical loops and pattern +// matching). +static bool verifyTripCount(Value *RHS, Loop *L, + SmallPtrSetImpl<Instruction *> &IterationInstructions, + PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment, + BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) { + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n"); + return false; + } + + // The Extend=false flag is used for getTripCountFromExitCount as we want + // to verify and match it with the pattern matched tripcount. Please note + // that overflow checks are performed in checkOverflow, but are first tried + // to avoid by widening the IV. + const SCEV *SCEVTripCount = + SE->getTripCountFromExitCount(BackedgeTakenCount, /*Extend=*/false); + + const SCEV *SCEVRHS = SE->getSCEV(RHS); + if (SCEVRHS == SCEVTripCount) + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS); + if (ConstantRHS) { + const SCEV *BackedgeTCExt = nullptr; + if (IsWidened) { + const SCEV *SCEVTripCountExt; + // Find the extended backedge taken count and extended trip count using + // SCEV. One of these should now match the RHS of the compare. 
+ BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType()); + SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false); + if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + } + // If the RHS of the compare is equal to the backedge taken count we need + // to add one to get the trip count. + if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) { + ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1); + Value *NewRHS = ConstantInt::get( + ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue()); + return setLoopComponents(NewRHS, TripCount, Increment, + IterationInstructions); + } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + } + // If the RHS isn't a constant then check that the reason it doesn't match + // the SCEV trip count is because the RHS is a ZExt or SExt instruction + // (and take the trip count to be the RHS). + if (!IsWidened) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + auto *TripCountInst = dyn_cast<Instruction>(RHS); + if (!TripCountInst) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || + SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { + LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); + return false; + } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); +} + // Finds the induction variable, increment and trip count for a simple loop that // we can flatten. static bool findLoopComponents( @@ -197,63 +421,9 @@ static bool findLoopComponents( // another transformation has changed the compare (e.g. icmp ult %inc, // tripcount -> icmp ult %j, tripcount-1), or both. Value *RHS = Compare->getOperand(1); - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { - LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n"); - return false; - } - // The use of the Extend=false flag on getTripCountFromExitCount was added - // during a refactoring to preserve existing behavior. However, there's - // nothing obvious in the surrounding code when handles the overflow case. - // FIXME: audit code to establish whether there's a latent bug here. - const SCEV *SCEVTripCount = - SE->getTripCountFromExitCount(BackedgeTakenCount, false); - const SCEV *SCEVRHS = SE->getSCEV(RHS); - if (SCEVRHS == SCEVTripCount) - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); - ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS); - if (ConstantRHS) { - const SCEV *BackedgeTCExt = nullptr; - if (IsWidened) { - const SCEV *SCEVTripCountExt; - // Find the extended backedge taken count and extended trip count using - // SCEV. One of these should now match the RHS of the compare. - BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType()); - SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false); - if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - } - // If the RHS of the compare is equal to the backedge taken count we need - // to add one to get the trip count. 
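The verifyTripCount helper introduced above has to distinguish the trip count from the backedge-taken count: when the pattern-matched RHS of the latch compare equals the backedge-taken count, one is added to obtain the trip count that the flattened loop will use. A small illustration of the quantities involved (hypothetical example):

void walk(int *A, unsigned N, unsigned M) {
  // Trip counts: outer = N, inner = M; backedge-taken counts are N-1 and M-1.
  // If another transform rewrote the inner latch compare to test against M-1,
  // the matched RHS is the backedge-taken count and LoopFlatten adds one back.
  for (unsigned i = 0; i < N; ++i)
    for (unsigned j = 0; j < M; ++j)
      A[i * M + j] = 0;   // the linear i*M+j use the pass looks for
}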
- if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) { - ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1); - Value *NewRHS = ConstantInt::get( - ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue()); - return setLoopComponents(NewRHS, TripCount, Increment, - IterationInstructions); - } - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); - } - // If the RHS isn't a constant then check that the reason it doesn't match - // the SCEV trip count is because the RHS is a ZExt or SExt instruction - // (and take the trip count to be the RHS). - if (!IsWidened) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - auto *TripCountInst = dyn_cast<Instruction>(RHS); - if (!TripCountInst) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || - SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { - LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); - return false; - } - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + + return verifyTripCount(RHS, L, IterationInstructions, InductionPHI, TripCount, + Increment, BackBranch, SE, IsWidened); } static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) { @@ -399,108 +569,26 @@ checkOuterLoopInsts(FlattenInfo &FI, return true; } -static bool checkIVUsers(FlattenInfo &FI) { - // We require all uses of both induction variables to match this pattern: - // - // (OuterPHI * InnerTripCount) + InnerPHI - // - // Any uses of the induction variables not matching that pattern would - // require a div/mod to reconstruct in the flattened loop, so the - // transformation wouldn't be profitable. - - Value *InnerTripCount = FI.InnerTripCount; - if (FI.Widened && - (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount))) - InnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0); + +// We require all uses of both induction variables to match this pattern: +// +// (OuterPHI * InnerTripCount) + InnerPHI +// +// Any uses of the induction variables not matching that pattern would +// require a div/mod to reconstruct in the flattened loop, so the +// transformation wouldn't be profitable. +static bool checkIVUsers(FlattenInfo &FI) { // Check that all uses of the inner loop's induction variable match the // expected pattern, recording the uses of the outer IV. SmallPtrSet<Value *, 4> ValidOuterPHIUses; - for (User *U : FI.InnerInductionPHI->users()) { - if (U == FI.InnerIncrement) - continue; - - // After widening the IVs, a trunc instruction might have been introduced, - // so look through truncs. - if (isa<TruncInst>(U)) { - if (!U->hasOneUse()) - return false; - U = *U->user_begin(); - } - - // If the use is in the compare (which is also the condition of the inner - // branch) then the compare has been altered by another transformation e.g - // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is - // a constant. Ignore this use as the compare gets removed later anyway. 
- if (U == FI.InnerBranch->getCondition()) - continue; - - LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump()); - - Value *MatchedMul = nullptr; - Value *MatchedItCount = nullptr; - bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI), - m_Value(MatchedMul))) && - match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI), - m_Value(MatchedItCount))); - - // Matches the same pattern as above, except it also looks for truncs - // on the phi, which can be the result of widening the induction variables. - bool IsAddTrunc = - match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)), - m_Value(MatchedMul))) && - match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)), - m_Value(MatchedItCount))); - - if (!MatchedItCount) - return false; - // Look through extends if the IV has been widened. - if (FI.Widened && - (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) { - assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() && - "Unexpected type mismatch in types after widening"); - MatchedItCount = isa<SExtInst>(MatchedItCount) - ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0) - : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0); - } - - if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) { - LLVM_DEBUG(dbgs() << "Use is optimisable\n"); - ValidOuterPHIUses.insert(MatchedMul); - FI.LinearIVUses.insert(U); - } else { - LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); - return false; - } - } + if (!FI.checkInnerInductionPhiUsers(ValidOuterPHIUses)) + return false; // Check that there are no uses of the outer IV other than the ones found // as part of the pattern above. - for (User *U : FI.OuterInductionPHI->users()) { - if (U == FI.OuterIncrement) - continue; - - auto IsValidOuterPHIUses = [&] (User *U) -> bool { - LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump()); - if (!ValidOuterPHIUses.count(U)) { - LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Use is optimisable\n"); - return true; - }; - - if (auto *V = dyn_cast<TruncInst>(U)) { - for (auto *K : V->users()) { - if (!IsValidOuterPHIUses(K)) - return false; - } - continue; - } - - if (!IsValidOuterPHIUses(U)) - return false; - } + if (!FI.checkOuterInductionPhiUsers(ValidOuterPHIUses)) + return false; LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n"; dbgs() << "Found " << FI.LinearIVUses.size() @@ -535,7 +623,7 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT, for (Value *U : V->users()) { if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) { for (Value *GEPUser : U->users()) { - Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser); + auto *GEPUserInst = cast<Instruction>(GEPUser); if (!isa<LoadInst>(GEPUserInst) && !(isa<StoreInst>(GEPUserInst) && GEP == GEPUserInst->getOperand(1))) @@ -611,7 +699,8 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI, LPMUpdater *U) { + const TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { Function *F = FI.OuterLoop->getHeader()->getParent(); LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n"); { @@ -647,7 +736,11 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock(); 
InnerExitingBlock->getTerminator()->eraseFromParent(); BranchInst::Create(InnerExitBlock, InnerExitingBlock); + + // Update the DomTree and MemorySSA. DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader()); + if (MSSAU) + MSSAU->removeEdge(InnerExitingBlock, FI.InnerLoop->getHeader()); // Replace all uses of the polynomial calculated from the two induction // variables with the one new one. @@ -658,8 +751,8 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(), "flatten.trunciv"); - LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); - dbgs() << "with: "; OuterValue->dump()); + LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); dbgs() << "with: "; + OuterValue->dump()); V->replaceAllUsesWith(OuterValue); } @@ -698,7 +791,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // (OuterTripCount * InnerTripCount) as the new trip count is safe. if (InnerType != OuterType || InnerType->getScalarSizeInBits() >= MaxLegalSize || - MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) { + MaxLegalType->getScalarSizeInBits() < + InnerType->getScalarSizeInBits() * 2) { LLVM_DEBUG(dbgs() << "Can't widen the IV\n"); return false; } @@ -708,10 +802,10 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, unsigned ElimExt = 0; unsigned Widened = 0; - auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool { - PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, - ElimExt, Widened, true /* HasGuards */, - true /* UsePostIncrementRanges */); + auto CreateWideIV = [&](WideIVInfo WideIV, bool &Deleted) -> bool { + PHINode *WidePhi = + createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, ElimExt, Widened, + true /* HasGuards */, true /* UsePostIncrementRanges */); if (!WidePhi) return false; LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump()); @@ -721,14 +815,14 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, }; bool Deleted; - if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted)) + if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false}, Deleted)) return false; // Add the narrow phi to list, so that it will be adjusted later when the // the transformation is performed. if (!Deleted) FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI); - if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted)) + if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false}, Deleted)) return false; assert(Widened && "Widened IV expected"); @@ -744,7 +838,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI, LPMUpdater *U) { + const TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { LLVM_DEBUG( dbgs() << "Loop flattening running on outer loop " << FI.OuterLoop->getHeader()->getName() << " and inner loop " @@ -773,7 +868,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // If we have widened and can perform the transformation, do that here. if (CanFlatten) - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); // Otherwise, if we haven't widened the IV, check if the new iteration // variable might overflow. 
In this case, we need to version the loop, and @@ -791,18 +886,19 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, } LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); } bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, - AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) { + AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { bool Changed = false; for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) continue; FlattenInfo FI(OuterLoop, InnerLoop); - Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); } return Changed; } @@ -813,16 +909,30 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, bool Changed = false; + Optional<MemorySSAUpdater> MSSAU; + if (AR.MSSA) { + MSSAU = MemorySSAUpdater(AR.MSSA); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + } + // The loop flattening pass requires loops to be // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. - Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U); + Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, + MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); if (!Changed) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + if (AR.MSSA && VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + + auto PA = getLoopPassPreservedAnalyses(); + if (AR.MSSA) + PA.preserve<MemorySSAAnalysis>(); + return PA; } namespace { @@ -842,6 +952,7 @@ public: AU.addPreserved<TargetTransformInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<AssumptionCacheTracker>(); + AU.addPreserved<MemorySSAWrapperPass>(); } }; } // namespace @@ -854,7 +965,9 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops", false, false) -FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); } +FunctionPass *llvm::createLoopFlattenPass() { + return new LoopFlattenLegacyPass(); +} bool LoopFlattenLegacyPass::runOnFunction(Function &F) { ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); @@ -864,10 +977,17 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) { auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>(); auto *TTI = &TTIP.getTTI(F); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *MSSA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + + Optional<MemorySSAUpdater> MSSAU; + if (MSSA) + MSSAU = MemorySSAUpdater(&MSSA->getMSSA()); + bool Changed = false; for (Loop *L : *LI) { auto LN = LoopNest::getLoopNest(*L, *SE); - Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr); + Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr, + MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr); } return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 5d00fa56e888..35ba4e2b4032 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1117,7 +1117,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); @@ -1328,7 +1328,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander); bool Changed = false; const SCEV *StrStart = StoreEv->getStart(); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9f605b4ac4ad..c2b065c4eb31 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -292,33 +292,6 @@ static LoopVector populateWorklist(Loop &L) { return LoopList; } -static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { - PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); - if (InnerIndexVar) - return InnerIndexVar; - if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) - return nullptr; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *PhiVar = cast<PHINode>(I); - Type *PhiTy = PhiVar->getType(); - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) - return nullptr; - const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); - if (!AddRec || !AddRec->isAffine()) - continue; - const SCEV *Step = AddRec->getStepRecurrence(*SE); - if (!isa<SCEVConstant>(Step)) - continue; - // Found the induction variable. - // FIXME: Handle loops with more than one induction variable. Note that, - // currently, legality makes sure we have only one induction variable. - return PhiVar; - } - return nullptr; -} - namespace { /// LoopInterchangeLegality checks if it is legal to interchange the loop. @@ -332,9 +305,13 @@ public: bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); + /// Discover induction PHIs in the header of \p L. Induction + /// PHIs are added to \p Inductions. + bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); + /// Check if the loop structure is understood. We do not handle triangular /// loops for now. - bool isLoopStructureUnderstood(PHINode *InnerInductionVar); + bool isLoopStructureUnderstood(); bool currentLimitations(); @@ -342,6 +319,10 @@ public: return OuterInnerReductions; } + const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const { + return InnerLoopInductions; + } + private: bool tightlyNested(Loop *Outer, Loop *Inner); bool containsUnsafeInstructions(BasicBlock *BB); @@ -365,6 +346,9 @@ private: /// Set of reduction PHIs taking part of a reduction across the inner and /// outer loop. 
SmallPtrSet<PHINode *, 4> OuterInnerReductions; + + /// Set of inner loop induction PHIs + SmallVector<PHINode *, 8> InnerLoopInductions; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -635,25 +619,26 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { return true; } -bool LoopInterchangeLegality::isLoopStructureUnderstood( - PHINode *InnerInduction) { - unsigned Num = InnerInduction->getNumOperands(); +bool LoopInterchangeLegality::isLoopStructureUnderstood() { BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); - for (unsigned i = 0; i < Num; ++i) { - Value *Val = InnerInduction->getOperand(i); - if (isa<Constant>(Val)) - continue; - Instruction *I = dyn_cast<Instruction>(Val); - if (!I) - return false; - // TODO: Handle triangular loops. - // e.g. for(int i=0;i<N;i++) - // for(int j=i;j<N;j++) - unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); - if (InnerInduction->getIncomingBlock(IncomBlockIndx) == - InnerLoopPreheader && - !OuterLoop->isLoopInvariant(I)) { - return false; + for (PHINode *InnerInduction : InnerLoopInductions) { + unsigned Num = InnerInduction->getNumOperands(); + for (unsigned i = 0; i < Num; ++i) { + Value *Val = InnerInduction->getOperand(i); + if (isa<Constant>(Val)) + continue; + Instruction *I = dyn_cast<Instruction>(Val); + if (!I) + return false; + // TODO: Handle triangular loops. + // e.g. for(int i=0;i<N;i++) + // for(int j=i;j<N;j++) + unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); + if (InnerInduction->getIncomingBlock(IncomBlockIndx) == + InnerLoopPreheader && + !OuterLoop->isLoopInvariant(I)) { + return false; + } } } @@ -682,27 +667,34 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood( // Return true if V is InnerInduction, or a cast from // InnerInduction, or a binary operator that involves // InnerInduction and a constant. - std::function<bool(Value *)> IsPathToIndVar; - IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool { - if (V == InnerInduction) + std::function<bool(Value *)> IsPathToInnerIndVar; + IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool { + if (llvm::is_contained(InnerLoopInductions, V)) return true; if (isa<Constant>(V)) return true; - Instruction *I = dyn_cast<Instruction>(V); + const Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; if (isa<CastInst>(I)) - return IsPathToIndVar(I->getOperand(0)); + return IsPathToInnerIndVar(I->getOperand(0)); if (isa<BinaryOperator>(I)) - return IsPathToIndVar(I->getOperand(0)) && - IsPathToIndVar(I->getOperand(1)); + return IsPathToInnerIndVar(I->getOperand(0)) && + IsPathToInnerIndVar(I->getOperand(1)); return false; }; - if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) { + // In case of multiple inner loop indvars, it is okay if LHS and RHS + // are both inner indvar related variables. + if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1)) + return true; + + // Otherwise we check if the cmp instruction compares an inner indvar + // related variable (Left) with a outer loop invariant (Right). + if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) { Left = Op0; Right = Op1; - } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) { + } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) { Left = Op1; Right = Op0; } @@ -793,7 +785,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( // This function indicates the current limitations in the transform as a result // of which we do not proceed. 
bool LoopInterchangeLegality::currentLimitations() { - BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); // transform currently expects the loop latches to also be the exiting @@ -815,7 +806,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - PHINode *InnerInductionVar; SmallVector<PHINode *, 8> Inductions; if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { LLVM_DEBUG( @@ -831,20 +821,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) { - LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with 1 induction variable can be " - "interchanged currently."; - }); - return true; - } - Inductions.clear(); if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { LLVM_DEBUG( @@ -860,24 +836,8 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) { - LLVM_DEBUG( - dbgs() << "We currently only support loops with 1 induction variable." - << "Failed to interchange due to current limitation\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with 1 induction variable can be " - "interchanged currently."; - }); - return true; - } - InnerInductionVar = Inductions.pop_back_val(); - // TODO: Triangular loops are not handled for now. - if (!isLoopStructureUnderstood(InnerInductionVar)) { + if (!isLoopStructureUnderstood()) { LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", @@ -888,79 +848,17 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Current limitation: Since we split the inner loop latch at the point - // were induction variable is incremented (induction.next); We cannot have - // more than 1 user of induction.next since it would result in broken code - // after split. - // e.g. - // for(i=0;i<N;i++) { - // for(j = 0;j<M;j++) { - // A[j+1][i+2] = A[j][i]+k; - // } - // } - Instruction *InnerIndexVarInc = nullptr; - if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) - InnerIndexVarInc = - dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); - else - InnerIndexVarInc = - dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); - - if (!InnerIndexVarInc) { - LLVM_DEBUG( - dbgs() << "Did not find an instruction to increment the induction " - << "variable.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "The inner loop does not increment the induction variable."; - }); - return true; - } - - // Since we split the inner loop latch on this induction variable. Make sure - // we do not have any instruction between the induction variable and branch - // instruction. 
- - bool FoundInduction = false; - for (const Instruction &I : - llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) { - if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || - isa<ZExtInst>(I)) - continue; - - // We found an instruction. If this is not induction variable then it is not - // safe to split this loop latch. - if (!I.isIdenticalTo(InnerIndexVarInc)) { - LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction " - << "variable increment and branch.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed( - DEBUG_TYPE, "UnsupportedInsBetweenInduction", - InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Found unsupported instruction between induction variable " - "increment and branch."; - }); - return true; - } + return false; +} - FoundInduction = true; - break; - } - // The loop latch ended and we didn't find the induction variable return as - // current limitation. - if (!FoundInduction) { - LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Did not find the induction variable."; - }); - return true; +bool LoopInterchangeLegality::findInductions( + Loop *L, SmallVectorImpl<PHINode *> &Inductions) { + for (PHINode &PHI : L->getHeader()->phis()) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) + Inductions.push_back(&PHI); } - return false; + return !Inductions.empty(); } // We currently only support LCSSA PHI nodes in the inner loop exit, if their @@ -1076,7 +974,7 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, for (Instruction &I : BB->instructionsWithoutDebug()) if (CallInst *CI = dyn_cast<CallInst>(&I)) { // readnone functions do not prevent interchanging. - if (CI->doesNotReadMemory()) + if (CI->onlyWritesMemory()) continue; LLVM_DEBUG( dbgs() << "Loops with call instructions cannot be interchanged " @@ -1091,6 +989,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } + if (!findInductions(InnerLoop, InnerLoopInductions)) { + LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n"); + return false; + } + if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n"); ORE->emit([&]() { @@ -1347,25 +1250,25 @@ void LoopInterchangeTransform::restructureLoops( bool LoopInterchangeTransform::transform() { bool Transformed = false; - Instruction *InnerIndexVar; if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); - PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); - if (!InductionPHI) { + auto &InductionPHIs = LIL.getInnerLoopInductions(); + if (InductionPHIs.empty()) { LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); return false; } - if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) - InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); - else - InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); - - // Ensure that InductionPHI is the first Phi node.
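The hunks above drop the old single-induction-variable limitation: findInductions() collects every induction PHI of the inner loop header, and isLoopStructureUnderstood() now iterates over all of them. As a rough, illustrative C sketch (function and array names are made up here, not taken from the patch or its tests), a nest like the following, whose inner loop carries two induction variables, is no longer rejected out of hand:

/* Caller is assumed to guarantee n <= 128. */
void interchange_candidate(int n, int a[][128], int b[][128]) {
  for (int i = 0; i < n; ++i)
    for (int j = 0, k = 0; j < n; ++j, ++k)   /* two inner induction PHIs */
      a[j][i] = b[k][i];
}
/* Triangular nests such as "for (int j = i; j < n; ++j)" are still rejected: */
/* the inner start value depends on the outer induction variable.             */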
- if (&InductionPHI->getParent()->front() != InductionPHI) - InductionPHI->moveBefore(&InductionPHI->getParent()->front()); + SmallVector<Instruction *, 8> InnerIndexVarList; + for (PHINode *CurInductionPHI : InductionPHIs) { + if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVarList.push_back( + dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1))); + else + InnerIndexVarList.push_back( + dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0))); + } // Create a new latch block for the inner loop. We split at the // current latch's terminator and then move the condition and all @@ -1377,7 +1280,7 @@ bool LoopInterchangeTransform::transform() { SmallSetVector<Instruction *, 4> WorkList; unsigned i = 0; - auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { + auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() { for (; i < WorkList.size(); i++) { // Duplicate instruction and move it the new latch. Update uses that // have been moved. @@ -1389,7 +1292,8 @@ bool LoopInterchangeTransform::transform() { for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) { Instruction *UserI = cast<Instruction>(U.getUser()); if (!InnerLoop->contains(UserI->getParent()) || - UserI->getParent() == NewLatch || UserI == InductionPHI) + UserI->getParent() == NewLatch || + llvm::is_contained(InductionPHIs, UserI)) U.set(NewI); } // Add operands of moved instruction to the worklist, except if they are @@ -1398,7 +1302,7 @@ bool LoopInterchangeTransform::transform() { Instruction *OpI = dyn_cast<Instruction>(Op); if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || - OpI == InductionPHI) + llvm::is_contained(InductionPHIs, OpI)) continue; WorkList.insert(OpI); } @@ -1412,7 +1316,8 @@ bool LoopInterchangeTransform::transform() { if (CondI) WorkList.insert(CondI); MoveInstructions(); - WorkList.insert(cast<Instruction>(InnerIndexVar)); + for (Instruction *InnerIndexVar : InnerIndexVarList) + WorkList.insert(cast<Instruction>(InnerIndexVar)); MoveInstructions(); // Splits the inner loops phi nodes out into a separate basic block. @@ -1685,7 +1590,6 @@ bool LoopInterchangeTransform::adjustLoopBranches() { updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, InnerLoopLatchSuccessor, DTUpdates); - if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); else @@ -1712,19 +1616,22 @@ bool LoopInterchangeTransform::adjustLoopBranches() { SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; for (PHINode &PHI : InnerLoopHeader->phis()) if (OuterInnerReductions.contains(&PHI)) - InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); + InnerLoopPHIs.push_back(&PHI); + for (PHINode &PHI : OuterLoopHeader->phis()) if (OuterInnerReductions.contains(&PHI)) - OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); + OuterLoopPHIs.push_back(&PHI); // Now move the remaining reduction PHIs from outer to inner loop header and // vice versa. The PHI nodes must be part of a reduction across the inner and // outer loop and all the remains to do is and updating the incoming blocks. 
for (PHINode *PHI : OuterLoopPHIs) { + LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } for (PHINode *PHI : InnerLoopPHIs) { + LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 798af48c2337..654f0d2a03a8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3486,6 +3486,31 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // Don't bother if the instruction is in a BB which ends in an EHPad. if (UseBB->getTerminator()->isEHPad()) continue; + + // Ignore cases in which the currently-examined value could come from + // a basic block terminated with an EHPad. This checks all incoming + // blocks of the phi node since it is possible that the same incoming + // value comes from multiple basic blocks, only some of which may end + // in an EHPad. If any of them do, a subsequent rewrite attempt by this + // pass would try to insert instructions into an EHPad, hitting an + // assertion. + if (isa<PHINode>(UserInst)) { + const auto *PhiNode = cast<PHINode>(UserInst); + bool HasIncompatibleEHPTerminatedBlock = false; + llvm::Value *ExpectedValue = U; + for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) { + if (PhiNode->getIncomingValue(I) == ExpectedValue) { + if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) { + HasIncompatibleEHPTerminatedBlock = true; + break; + } + } + } + if (HasIncompatibleEHPTerminatedBlock) { + continue; + } + } + // Don't bother rewriting PHIs in catchswitch blocks. if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator())) continue; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 893928fb0560..022d9c7abc8c 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1142,7 +1142,7 @@ static LoopUnrollResult tryToUnrollLoop( // automatic unrolling from interfering with the user requested // transformation. 
Loop *ParentL = L->getParentLoop(); - if (ParentL != NULL && + if (ParentL != nullptr && hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && hasUnrollTransformation(L) != TM_ForcedByUser) { LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 1c186e9a0488..a7eb60b5e032 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -64,7 +64,7 @@ getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) { // __builtin_expect_with_probability assert(CI->getNumOperands() >= 3 && "expect with probability must have 3 arguments"); - ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2)); + auto *Confidence = cast<ConstantFP>(CI->getArgOperand(2)); double TrueProb = Confidence->getValueAPF().convertToDouble(); assert((TrueProb >= 0.0 && TrueProb <= 1.0) && "probability value must be in the range [0.0, 1.0]"); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 4e4097e13271..8f1d0181ee5b 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -220,9 +220,7 @@ class LowerMatrixIntrinsics { bool IsColumnMajor = true; public: - MatrixTy() - : Vectors(), - IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} + MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} MatrixTy(ArrayRef<Value *> Vectors) : Vectors(Vectors.begin(), Vectors.end()), IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} @@ -1393,7 +1391,8 @@ public: // reloads necessary. unsigned Op0Regs = (R + VF - 1) / VF * M; unsigned Op1Regs = (M + VF - 1) / VF * C; - return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true); + return Op0Regs + Op1Regs > + TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true)); } MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) { @@ -1832,7 +1831,7 @@ public: const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared, const SmallSetVector<Value *, 32> &ExprsInSubprogram, Value *Leaf) - : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared), + : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {} void indent(unsigned N) { @@ -1895,7 +1894,7 @@ public: write(Name); return; } - IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + auto *II = cast<IntrinsicInst>(CI); write(Intrinsic::getBaseName(II->getIntrinsicID()) .drop_front(StringRef("llvm.matrix.").size())); write("."); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 67335a45fb58..6698db26626b 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" @@ -171,7 +172,7 @@ public: bool empty() const { return Ranges.empty(); } void addInst(int64_t OffsetFromFirst, Instruction *Inst) { - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + if (auto *SI = dyn_cast<StoreInst>(Inst)) addStore(OffsetFromFirst, SI); else addMemSet(OffsetFromFirst, 
cast<MemSetInst>(Inst)); @@ -312,15 +313,21 @@ INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization", static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start, Instruction *End) { assert(Start->getParent() == End->getParent() && "Must be in same block"); - if (!Start->getFunction()->doesNotThrow() && - !isa<AllocaInst>(getUnderlyingObject(V))) { - for (const Instruction &I : - make_range(Start->getIterator(), End->getIterator())) { - if (I.mayThrow()) - return true; - } - } - return false; + // Function can't unwind, so it also can't be visible through unwinding. + if (Start->getFunction()->doesNotThrow()) + return false; + + // Object is not visible on unwind. + // TODO: Support RequiresNoCaptureBeforeUnwind case. + bool RequiresNoCaptureBeforeUnwind; + if (isNotVisibleOnUnwind(getUnderlyingObject(V), + RequiresNoCaptureBeforeUnwind) && + !RequiresNoCaptureBeforeUnwind) + return false; + + // Check whether there are any unwinding instructions in the range. + return any_of(make_range(Start->getIterator(), End->getIterator()), + [](const Instruction &I) { return I.mayThrow(); }); } void MemCpyOptPass::eraseInstruction(Instruction *I) { @@ -364,7 +371,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, const DataLayout &DL = StartInst->getModule()->getDataLayout(); // We can't track scalable types - if (StoreInst *SI = dyn_cast<StoreInst>(StartInst)) + if (auto *SI = dyn_cast<StoreInst>(StartInst)) if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable()) return nullptr; @@ -410,7 +417,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, continue; } - if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + if (auto *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. if (!NextStore->isSimple()) break; @@ -440,7 +447,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, Ranges.addStore(*Offset, NextStore); } else { - MemSetInst *MSI = cast<MemSetInst>(BI); + auto *MSI = cast<MemSetInst>(BI); if (MSI->isVolatile() || ByteVal != MSI->getValue() || !isa<ConstantInt>(MSI->getLength())) @@ -661,7 +668,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { return false; // Load to store forwarding can be interpreted as memcpy. - if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { + if (auto *LI = dyn_cast<LoadInst>(StoredVal)) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { @@ -871,7 +878,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; // Require that src be an alloca. This simplifies the reasoning considerably. - AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc); + auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc); if (!srcAlloca) return false; @@ -890,8 +897,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // trap. Otherwise the transform is invalid since it might cause a trap // to occur earlier than it otherwise would. if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize), - DL, C, DT)) + DL, C, DT)) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n"); return false; + } // Make sure that nothing can observe cpyDest being written early. There are // a number of cases to consider: @@ -907,8 +916,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // guaranteed to be executed if C is. As it is a non-atomic access, it // renders accesses from other threads undefined. 
// TODO: This is currently not checked. - if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) + if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding"); return false; + } // Check that dest points to memory that is at least as aligned as src. Align srcAlign = srcAlloca->getAlign(); @@ -930,14 +941,14 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, append_range(srcUseList, U->users()); continue; } - if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { + if (const auto *G = dyn_cast<GetElementPtrInst>(U)) { if (!G->hasAllZeroIndices()) return false; append_range(srcUseList, U->users()); continue; } - if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U)) + if (const auto *IT = dyn_cast<IntrinsicInst>(U)) if (IT->isLifetimeStartOrEnd()) continue; @@ -945,12 +956,57 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; } - // Check that src isn't captured by the called function since the - // transformation can cause aliasing issues in that case. - for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI) - if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI)) + // Check whether src is captured by the called function, in which case there + // may be further indirect uses of src. + bool SrcIsCaptured = any_of(C->args(), [&](Use &U) { + return U->stripPointerCasts() == cpySrc && + !C->doesNotCapture(C->getArgOperandNo(&U)); + }); + + // If src is captured, then check whether there are any potential uses of + // src through the captured pointer before the lifetime of src ends, either + // due to a lifetime.end or a return from the function. + if (SrcIsCaptured) { + // Check that dest is not captured before/at the call. We have already + // checked that src is not captured before it. If either had been captured, + // then the call might be comparing the argument against the captured dest + // or src pointer. + Value *DestObj = getUnderlyingObject(cpyDest); + if (!isIdentifiedFunctionLocal(DestObj) || + PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true, + /* StoreCaptures */ true, C, DT, + /* IncludeI */ true)) return false; + MemoryLocation SrcLoc = + MemoryLocation(srcAlloca, LocationSize::precise(srcSize)); + for (Instruction &I : + make_range(++C->getIterator(), C->getParent()->end())) { + // Lifetime of srcAlloca ends at lifetime.end. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_end && + II->getArgOperand(1)->stripPointerCasts() == srcAlloca && + cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize)) + break; + } + + // Lifetime of srcAlloca ends at return. + if (isa<ReturnInst>(&I)) + break; + + // Ignore the direct read of src in the load. + if (&I == cpyLoad) + continue; + + // Check whether this instruction may mod/ref src through the captured + // pointer (we have already any direct mod/refs in the loop above). + // Also bail if we hit a terminator, as we don't want to scan into other + // blocks. + if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator()) + return false; + } + } + // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. 
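The rewritten capture handling above relaxes performCallSlotOptzn: rather than giving up whenever the source is passed through a capturing argument, it scans forward from the call and bails only if the captured pointer could still be used before the source's lifetime ends, or if the destination itself may have been captured. A minimal C sketch of the call-slot pattern this targets, with made-up names:

struct S { int v[16]; };
void init(struct S *out);     /* assumed to only write through 'out'        */

void call_slot(struct S *dest) {
  struct S tmp;               /* becomes an alloca                          */
  init(&tmp);                 /* the call fills the temporary               */
  *dest = tmp;                /* aggregate copy, lowered to a memcpy        */
  /* If &tmp is not problematically captured and a partially written *dest  */
  /* cannot be observed, e.g. through unwinding, the temporary and the copy */
  /* can be dropped and the call rewritten to write directly: init(dest).   */
}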
if (!DT->dominates(cpyDest, C)) { @@ -1018,6 +1074,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, LLVMContext::MD_invariant_group, LLVMContext::MD_access_group}; combineMetadata(C, cpyLoad, KnownIDs, true); + if (cpyLoad != cpyStore) + combineMetadata(C, cpyStore, KnownIDs, true); ++NumCallSlot; return true; @@ -1043,8 +1101,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. if (MDep->getLength() != M->getLength()) { - ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); - ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); + auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); + auto *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; } @@ -1163,7 +1221,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, const unsigned DestAlign = std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment()); if (DestAlign > 1) - if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) + if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); IRBuilder<> Builder(MemCpy); @@ -1211,12 +1269,11 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, if (MSSA->isLiveOnEntryDef(Def)) return isa<AllocaInst>(getUnderlyingObject(V)); - if (IntrinsicInst *II = - dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) { + if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0)); + auto *LTSize = cast<ConstantInt>(II->getArgOperand(0)); - if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) { + if (auto *CSize = dyn_cast<ConstantInt>(Size)) { if (AA->isMustAlias(V, II->getArgOperand(1)) && LTSize->getZExtValue() >= CSize->getZExtValue()) return true; @@ -1226,12 +1283,14 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, // does) and we're querying a pointer based on that alloca, then we know // the memory is definitely undef, regardless of how exactly we alias. // The size also doesn't matter, as an out-of-bounds access would be UB. - AllocaInst *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V)); - if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) { - const DataLayout &DL = Alloca->getModule()->getDataLayout(); - if (Optional<TypeSize> AllocaSize = Alloca->getAllocationSizeInBits(DL)) - if (*AllocaSize == LTSize->getValue() * 8) - return true; + if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) { + if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) { + const DataLayout &DL = Alloca->getModule()->getDataLayout(); + if (Optional<TypeSize> AllocaSize = + Alloca->getAllocationSizeInBits(DL)) + if (*AllocaSize == LTSize->getValue() * 8) + return true; + } } } } @@ -1266,12 +1325,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, // Don't worry about sizes larger than i64. // A known memset size is required. - ConstantInt *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize); + auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize); if (!CMemSetSize) return false; // A known memcpy size is also required. 
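For the memcpy-from-memset rewrite being checked above, both lengths must be compile-time constants so they can be compared. A small illustrative C sketch (buffer names invented for the example):

#include <string.h>

void memset_then_copy(char *dst, char *src) {
  memset(src, 0, 64);    /* every byte the copy reads is known to be zero   */
  memcpy(dst, src, 16);  /* constant size, within the memset'd region ...   */
  /* ... so the memcpy may be rewritten as memset(dst, 0, 16).              */
}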
- ConstantInt *CCopySize = dyn_cast<ConstantInt>(CopySize); + auto *CCopySize = dyn_cast<ConstantInt>(CopySize); if (!CCopySize) return false; if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { @@ -1323,7 +1382,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } // If copying from a constant, try to turn the memcpy into a memset. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) + if (auto *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer(), M->getModule()->getDataLayout())) { @@ -1370,7 +1429,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { // d) memcpy from a just-memset'd source can be turned into memset. if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) { if (Instruction *MI = MD->getMemoryInst()) { - if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) { + if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) { if (auto *C = dyn_cast<CallInst>(MI)) { // The memcpy must post-dom the call. Limit to the same block for // now. Additionally, we need to ensure that there are no accesses @@ -1469,7 +1528,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { return false; // The length of the memcpy must be larger or equal to the size of the byval. - ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); + auto *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (!C1 || !TypeSize::isKnownGE( TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize)) return false; @@ -1540,13 +1599,13 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) { bool RepeatInstruction = false; - if (StoreInst *SI = dyn_cast<StoreInst>(I)) + if (auto *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); - else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) + else if (auto *M = dyn_cast<MemSetInst>(I)) RepeatInstruction = processMemSet(M, BI); - else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) + else if (auto *M = dyn_cast<MemCpyInst>(I)) RepeatInstruction = processMemCpy(M, BI); - else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) + else if (auto *M = dyn_cast<MemMoveInst>(I)) RepeatInstruction = processMemMove(M); else if (auto *CB = dyn_cast<CallBase>(I)) { for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 10a8742940b1..2476e6c408b1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1198,9 +1198,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) { - Value *V = SimplifyGEPInst(GEPI->getSourceElementType(), - ArrayRef<Value *>(E->op_begin(), E->op_end()), - GEPI->isInBounds(), SQ); + Value *V = + SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), + makeArrayRef(std::next(E->op_begin()), E->op_end()), + GEPI->isInBounds(), SQ); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -1322,11 +1323,11 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst, Value *NewGVN::lookupOperandLeader(Value *V) const { CongruenceClass *CC = ValueToClass.lookup(V); if (CC) { - // Everything in TOP is represented by undef, as it can be any value. 
+ // Everything in TOP is represented by poison, as it can be any value. // We do have to make sure we get the type right though, so we can't set the - // RepLeader to undef. + // RepLeader to poison. if (CC == TOPClass) - return UndefValue::get(V->getType()); + return PoisonValue::get(V->getType()); return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader(); } @@ -1493,8 +1494,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. Note that this is only true in the case // that the result of the allocation is pointer equal to the load ptr. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || - isAlignedAllocLikeFn(DepInst, TLI)) { + if (isa<AllocaInst>(DepInst)) { return createConstantExpression(UndefValue::get(LoadType)); } // If this load occurs either right after a lifetime begin, @@ -1502,12 +1502,10 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) return createConstantExpression(UndefValue::get(LoadType)); - } - // If this load follows a calloc (which zero initializes memory), - // then the loaded value is zero - else if (isCallocLikeFn(DepInst, TLI)) { - return createConstantExpression(Constant::getNullValue(LoadType)); - } + } else if (isAllocationFn(DepInst, TLI)) + if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), + TLI, LoadType)) + return createConstantExpression(InitVal); return nullptr; } @@ -1521,9 +1519,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { return nullptr; Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand()); - // Load of undef is undef. + // Load of undef is UB. if (isa<UndefValue>(LoadAddressLeader)) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); MemoryAccess *OriginalAccess = getMemoryAccess(I); MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(OriginalAccess); @@ -1531,9 +1529,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { if (!MSSA->isLiveOnEntryDef(DefiningAccess)) { if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) { Instruction *DefiningInst = MD->getMemoryInst(); - // If the defining instruction is not reachable, replace with undef. + // If the defining instruction is not reachable, replace with poison. if (!ReachableBlocks.count(DefiningInst->getParent())) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); // This will handle stores and memory insts. We only do if it the // defining access has a different type, or it is a pointer produced by // certain memory operations that cause the memory to have a fixed value @@ -1722,8 +1720,12 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, // We match the semantics of SimplifyPhiNode from InstructionSimplify here. // See if all arguments are the same. // We track if any were undef because they need special handling. 
- bool HasUndef = false; + bool HasUndef = false, HasPoison = false; auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) { + if (isa<PoisonValue>(Arg)) { + HasPoison = true; + return false; + } if (isa<UndefValue>(Arg)) { HasUndef = true; return false; @@ -1732,8 +1734,14 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, }); // If we are left with no operands, it's dead. if (Filtered.empty()) { - // If it has undef at this point, it means there are no-non-undef arguments, - // and thus, the value of the phi node must be undef. + // If it has undef or poison at this point, it means there are no-non-undef + // arguments, and thus, the value of the phi node must be undef. + if (HasPoison && !HasUndef) { + LLVM_DEBUG( + dbgs() << "PHI Node " << *I + << " has no non-poison arguments, valuing it as poison\n"); + return createConstantExpression(PoisonValue::get(I->getType())); + } if (HasUndef) { LLVM_DEBUG( dbgs() << "PHI Node " << *I @@ -1758,7 +1766,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, // expression to say if one is equivalent to the other. // We also special case undef, so that if we have an undef, we can't use the // common value unless it dominates the phi block. - if (HasUndef) { + if (HasPoison || HasUndef) { // If we have undef and at least one other value, this is really a // multivalued phi, and we need to know if it's cycle free in order to // evaluate whether we can ignore the undef. The other parts of this are @@ -2579,6 +2587,15 @@ bool NewGVN::OpIsSafeForPHIOfOpsHelper( } auto *OrigI = cast<Instruction>(V); + // When we hit an instruction that reads memory (load, call, etc), we must + // consider any store that may happen in the loop. For now, we assume the + // worst: there is a store in the loop that alias with this read. + // The case where the load is outside the loop is already covered by the + // dominator check above. + // TODO: relax this condition + if (OrigI->mayReadFromMemory()) + return false; + for (auto *Op : OrigI->operand_values()) { if (!isa<Instruction>(Op)) continue; @@ -2780,7 +2797,7 @@ NewGVN::makePossiblePHIOfOps(Instruction *I, LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block " << getBlockName(PredBB) << " because the block is unreachable\n"); - FoundVal = UndefValue::get(I->getType()); + FoundVal = PoisonValue::get(I->getType()); RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I)); } @@ -3459,7 +3476,7 @@ bool NewGVN::runGVN() { // Delete all instructions marked for deletion. for (Instruction *ToErase : InstructionsToErase) { if (!ToErase->use_empty()) - ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType())); + ToErase->replaceAllUsesWith(PoisonValue::get(ToErase->getType())); assert(ToErase->getParent() && "BB containing ToErase deleted unexpectedly!"); @@ -3677,7 +3694,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) { Instruction &Inst = *I++; if (!Inst.use_empty()) - Inst.replaceAllUsesWith(UndefValue::get(Inst.getType())); + Inst.replaceAllUsesWith(PoisonValue::get(Inst.getType())); if (isa<LandingPadInst>(Inst)) continue; salvageKnowledge(&Inst, AC); @@ -3687,7 +3704,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { } // Now insert something that simplifycfg will turn into an unreachable. 
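The new early return in OpIsSafeForPHIOfOpsHelper above keeps memory-reading instructions out of phi-of-ops: the pass now assumes the worst case, namely that some store in the loop aliases the read, so duplicating the read into predecessors is not known to preserve its value. A hand-wavy C sketch of the hazard (names are invented):

int phi_of_ops_hazard(int *a, int *b, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    if (i & 1)
      s += *a;   /* the read NewGVN would like to evaluate elsewhere ...    */
    *b = i;      /* ... may be clobbered by this store if a and b alias     */
  }
  return s;
}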
Type *Int8Ty = Type::getInt8Ty(BB->getContext()); - new StoreInst(UndefValue::get(Int8Ty), + new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(Int8Ty->getPointerTo()), BB->getTerminator()); } @@ -3827,8 +3844,8 @@ bool NewGVN::eliminateInstructions(Function &F) { LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " << getBlockName(PHI->getIncomingBlock(Operand)) - << " with undef due to it being unreachable\n"); - Operand.set(UndefValue::get(PHI->getType())); + << " with poison due to it being unreachable\n"); + Operand.set(PoisonValue::get(PHI->getType())); } }; // Replace unreachable phi arguments. @@ -4128,21 +4145,25 @@ bool NewGVN::eliminateInstructions(Function &F) { unsigned int NewGVN::getRank(const Value *V) const { // Prefer constants to undef to anything else // Undef is a constant, have to check it first. + // Prefer poison to undef as it's less defined. // Prefer smaller constants to constantexprs + // Note that the order here matters because of class inheritance if (isa<ConstantExpr>(V)) - return 2; - if (isa<UndefValue>(V)) + return 3; + if (isa<PoisonValue>(V)) return 1; + if (isa<UndefValue>(V)) + return 2; if (isa<Constant>(V)) return 0; - else if (auto *A = dyn_cast<Argument>(V)) - return 3 + A->getArgNo(); + if (auto *A = dyn_cast<Argument>(V)) + return 4 + A->getArgNo(); - // Need to shift the instruction DFS by number of arguments + 3 to account for + // Need to shift the instruction DFS by number of arguments + 5 to account for // the constant and argument ranking above. unsigned Result = InstrToDFSNum(V); if (Result > 0) - return 4 + NumFuncArgs + Result; + return 5 + NumFuncArgs + Result; // Unreachable or something else, just return a really large number. return ~0; } diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 44027ccd92ca..e0d0301c1ef6 100644 --- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -82,6 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Add attribute "readnone" so that backend can use a native sqrt instruction // for this call. + Call->removeFnAttr(Attribute::WriteOnly); Call->addFnAttr(Attribute::ReadNone); // Insert a FP compare instruction and use it as the CurrBB branch condition. diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index e12eca0ed287..3da367341d2a 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1367,13 +1367,13 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx, return AL; // Remove the readonly, readnone, and statepoint function attributes. - AttrBuilder FnAttrs = AL.getFnAttrs(); + AttrBuilder FnAttrs(Ctx, AL.getFnAttrs()); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); for (Attribute A : AL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) - FnAttrs.remove(A); + FnAttrs.removeAttribute(A); } // Just skip parameter and return attributes for now @@ -2643,10 +2643,10 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // List of all parameter and return attributes which must be stripped when // lowering from the abstract machine model. Note that we list attributes // here which aren't valid as return attributes, that is okay. 
-static AttrBuilder getParamAndReturnAttributesToRemove() { - AttrBuilder R; - R.addDereferenceableAttr(1); - R.addDereferenceableOrNullAttr(1); +static AttributeMask getParamAndReturnAttributesToRemove() { + AttributeMask R; + R.addAttribute(Attribute::Dereferenceable); + R.addAttribute(Attribute::DereferenceableOrNull); R.addAttribute(Attribute::ReadNone); R.addAttribute(Attribute::ReadOnly); R.addAttribute(Attribute::WriteOnly); @@ -2668,7 +2668,7 @@ static void stripNonValidAttributesFromPrototype(Function &F) { return; } - AttrBuilder R = getParamAndReturnAttributesToRemove(); + AttributeMask R = getParamAndReturnAttributesToRemove(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) F.removeParamAttrs(A.getArgNo(), R); @@ -2742,7 +2742,7 @@ static void stripNonValidDataFromBody(Function &F) { stripInvalidMetadataFromInstruction(I); - AttrBuilder R = getParamAndReturnAttributesToRemove(); + AttributeMask R = getParamAndReturnAttributesToRemove(); if (auto *Call = dyn_cast<CallBase>(&I)) { for (int i = 0, e = Call->arg_size(); i != e; i++) if (isa<PointerType>(Call->getArgOperand(i)->getType())) diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index ff2f8a25f379..c34da51e6dc1 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -486,7 +486,7 @@ bool llvm::runIPSCCP( // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove // them from both the function and callsites. if (ReplacedPointerArg) { - AttrBuilder AttributesToRemove; + AttributeMask AttributesToRemove; AttributesToRemove.addAttribute(Attribute::ArgMemOnly); AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); F.removeFnAttrs(AttributesToRemove); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 31c8999c3724..35497ae5ed9a 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -323,7 +323,7 @@ private: /// /// Note that these are not separated by slice. This is because we expect an /// alloca to be completely rewritten or not rewritten at all. If rewritten, - /// all these instructions can simply be removed and replaced with undef as + /// all these instructions can simply be removed and replaced with poison as /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; @@ -333,10 +333,10 @@ private: /// Operands which will become dead if we rewrite the alloca. /// /// These are operands that in their particular use can be replaced with - /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// poison when we rewrite the alloca. These show up in out-of-bounds inputs /// to PHI nodes and the like. They aren't entirely dead (there might be /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we - /// want to swap this particular input for undef to simplify the use lists of + /// want to swap this particular input for poison to simplify the use lists of /// the alloca. SmallVector<Use *, 8> DeadOperands; }; @@ -1008,6 +1008,13 @@ private: if (I.use_empty()) return markAsDead(I); + // If this is a PHI node before a catchswitch, we cannot insert any non-PHI + // instructions in this BB, which may be required during rewriting. Bail out + // on these cases. 
+ if (isa<PHINode>(I) && + I.getParent()->getFirstInsertionPt() == I.getParent()->end()) + return PI.setAborted(&I); + // TODO: We could use SimplifyInstruction here to fold PHINodes and // SelectInsts. However, doing so requires to change the current // dead-operand-tracking mechanism. For instance, suppose neither loading @@ -1023,7 +1030,7 @@ private: enqueueUsers(I); else // Otherwise the operand to the PHI/select is dead, and we can replace - // it with undef. + // it with poison. AS.DeadOperands.push_back(U); return; @@ -1043,7 +1050,7 @@ private: // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands - // themselves which should be replaced with undef. + // themselves which should be replaced with poison. // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. if (Offset.uge(AllocSize)) { @@ -1264,14 +1271,14 @@ static bool isSafePHIToSpeculate(PHINode &PN) { return true; } -static void speculatePHINodeLoads(PHINode &PN) { +static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); Type *LoadTy = SomeLoad->getType(); - IRBuilderTy PHIBuilder(&PN); - PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), - PN.getName() + ".sroa.speculated"); + IRB.SetInsertPoint(&PN); + PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. @@ -1301,9 +1308,9 @@ static void speculatePHINodeLoads(PHINode &PN) { } Instruction *TI = Pred->getTerminator(); - IRBuilderTy PredBuilder(TI); + IRB.SetInsertPoint(TI); - LoadInst *Load = PredBuilder.CreateAlignedLoad( + LoadInst *Load = IRB.CreateAlignedLoad( LoadTy, InVal, Alignment, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); ++NumLoadsSpeculated; @@ -1361,10 +1368,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { return true; } -static void speculateSelectInstLoads(SelectInst &SI) { +static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) { LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); - IRBuilderTy IRB(&SI); + IRB.SetInsertPoint(&SI); Value *TV = SI.getTrueValue(); Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. @@ -1430,8 +1437,10 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(), - BasePtr, Indices, NamePrefix + "sroa_idx"); + // buildGEP() is only called for non-opaque pointers. + return IRB.CreateInBoundsGEP( + BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices, + NamePrefix + "sroa_idx"); } /// Get a natural GEP off of the BasePtr walking through Ty toward @@ -1504,7 +1513,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) return nullptr; - Type *ElementTy = Ty->getElementType(); + Type *ElementTy = Ty->getNonOpaquePointerElementType(); if (!ElementTy->isSized()) return nullptr; // We can't GEP through an unsized element. 
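speculatePHINodeLoads and speculateSelectInstLoads, which the hunks above now drive through a shared IRBuilder, turn a load whose address is a PHI or select of dereferenceable pointers into per-source loads plus a PHI/select of the loaded values. A tiny C sketch of the select case (illustrative only):

int speculate_select_load(int c, int *p, int *q) {
  /* Before speculation the IR loads through a select of the two pointers:  */
  /*   v = *(c ? p : q)                                                     */
  /* After speculation both loads are emitted and the select picks the      */
  /* loaded value:  tp = *p; tq = *q; v = c ? tp : tq                       */
  return *(c ? p : q);
}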
@@ -1563,7 +1572,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Int8PtrOffset(Offset.getBitWidth(), 0); PointerType *TargetPtrTy = cast<PointerType>(PointerTy); - Type *TargetTy = TargetPtrTy->getElementType(); + Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType(); // As `addrspacecast` is , `Ptr` (the storage pointer) may have different // address space from the expected `PointerTy` (the pointer to be used). @@ -2558,7 +2567,7 @@ private: // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. Value *Placeholder = new LoadInst( - LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "", + LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "", false, Align(1)); V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); @@ -3223,8 +3232,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; + IRBuilderTy &IRB; + public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} + AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB) + : DL(DL), IRB(IRB) {} /// Rewrite loads and stores through a pointer and all pointers derived from /// it. @@ -3255,7 +3267,7 @@ private: template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. - IRBuilderTy IRB; + IRBuilderTy &IRB; /// The indices which to be used with insert- or extractvalue to select the /// appropriate value within the aggregate. @@ -3282,9 +3294,11 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - Align BaseAlign, const DataLayout &DL) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), - BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} + Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB) + : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), + BaseAlign(BaseAlign), DL(DL) { + IRB.SetInsertPoint(InsertionPoint); + } public: /// Generic recursive split emission routine. @@ -3345,9 +3359,10 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, + IRBuilderTy &IRB) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL, + IRB), AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the @@ -3379,8 +3394,8 @@ private: // We have an aggregate being loaded, split it apart. 
LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), - getAdjustedAlignment(&LI, 0), DL); - Value *V = UndefValue::get(LI.getType()); + getAdjustedAlignment(&LI, 0), DL, IRB); + Value *V = PoisonValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); Visited.erase(&LI); LI.replaceAllUsesWith(V); @@ -3390,9 +3405,10 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, + IRBuilderTy &IRB) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), + DL, IRB), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the @@ -3430,7 +3446,7 @@ private: // We have an aggregate being stored, split it apart. LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), - getAdjustedAlignment(&SI, 0), DL); + getAdjustedAlignment(&SI, 0), DL, IRB); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); Visited.erase(&SI); SI.eraseFromParent(); @@ -3458,7 +3474,7 @@ private: << "\n original: " << *Sel << "\n " << GEPI); - IRBuilderTy Builder(&GEPI); + IRB.SetInsertPoint(&GEPI); SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); @@ -3466,21 +3482,20 @@ private: Value *True = Sel->getTrueValue(); Value *NTrue = IsInBounds - ? Builder.CreateInBoundsGEP(Ty, True, Index, - True->getName() + ".sroa.gep") - : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); + ? IRB.CreateInBoundsGEP(Ty, True, Index, + True->getName() + ".sroa.gep") + : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); Value *False = Sel->getFalseValue(); Value *NFalse = IsInBounds - ? Builder.CreateInBoundsGEP(Ty, False, Index, - False->getName() + ".sroa.gep") - : Builder.CreateGEP(Ty, False, Index, - False->getName() + ".sroa.gep"); + ? IRB.CreateInBoundsGEP(Ty, False, Index, + False->getName() + ".sroa.gep") + : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep"); - Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse, - Sel->getName() + ".sroa.sel"); + Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, + Sel->getName() + ".sroa.sel"); Visited.erase(&GEPI); GEPI.replaceAllUsesWith(NSel); GEPI.eraseFromParent(); @@ -3517,10 +3532,9 @@ private: SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); - IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI()); - PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(), - PHI->getNumIncomingValues(), - PHI->getName() + ".sroa.phi"); + IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI()); + PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(), + PHI->getName() + ".sroa.phi"); for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) { BasicBlock *B = PHI->getIncomingBlock(I); Value *NewVal = nullptr; @@ -3530,11 +3544,12 @@ private: } else { Instruction *In = cast<Instruction>(PHI->getIncomingValue(I)); - IRBuilderTy B(In->getParent(), std::next(In->getIterator())); + IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); Type *Ty = GEPI.getSourceElementType(); - NewVal = IsInBounds - ? 
B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep") - : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep"); + NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index, + In->getName() + ".sroa.gep") + : IRB.CreateGEP(Ty, In, Index, + In->getName() + ".sroa.gep"); } NewPN->addIncoming(NewVal, B); } @@ -4557,11 +4572,11 @@ bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { return Changed; } -/// Clobber a use with undef, deleting the used value if it becomes dead. +/// Clobber a use with poison, deleting the used value if it becomes dead. void SROAPass::clobberUse(Use &U) { Value *OldV = U; - // Replace the use with an undef value. - U = UndefValue::get(OldV->getType()); + // Replace the use with an poison value. + U = PoisonValue::get(OldV->getType()); // Check for this making an instruction dead. We have to garbage collect // all the dead instructions to ensure the uses of any alloca end up being @@ -4598,7 +4613,8 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(DL); + IRBuilderTy IRB(&AI); + AggLoadStoreRewriter AggRewriter(DL, IRB); Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. @@ -4614,7 +4630,7 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) { clobberUse(DeadOp); // Now replace the uses of this instruction. - DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType())); + DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType())); // And mark it for deletion. DeadInsts.push_back(DeadUser); @@ -4633,11 +4649,11 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) { LLVM_DEBUG(dbgs() << " Speculating PHIs\n"); while (!SpeculatablePHIs.empty()) - speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); + speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val()); LLVM_DEBUG(dbgs() << " Speculating Selects\n"); while (!SpeculatableSelects.empty()) - speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); + speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val()); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 1284bae820a4..29cea42e4a00 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -959,7 +959,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, Type *LoadTy = CI->getType(); Align Alignment = DL.getValueOrABITypeAlignment(MA, LoadTy->getScalarType()); - if (TTI.isLegalMaskedGather(LoadTy, Alignment)) + if (TTI.isLegalMaskedGather(LoadTy, Alignment) && + !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment)) return false; scalarizeMaskedGather(DL, CI, DTU, ModifiedDT); return true; @@ -970,7 +971,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, Type *StoreTy = CI->getArgOperand(0)->getType(); Align Alignment = DL.getValueOrABITypeAlignment(MA, StoreTy->getScalarType()); - if (TTI.isLegalMaskedScatter(StoreTy, Alignment)) + if (TTI.isLegalMaskedScatter(StoreTy, Alignment) && + !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy), + Alignment)) return false; scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT); return true; diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 6b7419abe1d1..3606c8a4b073 100644 --- 
a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -270,7 +270,7 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Type *Ty = V->getType(); PtrTy = dyn_cast<PointerType>(Ty); if (PtrTy) - Ty = PtrTy->getElementType(); + Ty = PtrTy->getPointerElementType(); Size = cast<FixedVectorType>(Ty)->getNumElements(); if (!CachePtr) Tmp.resize(Size, nullptr); @@ -288,7 +288,8 @@ Value *Scatterer::operator[](unsigned I) { return CV[I]; IRBuilder<> Builder(BB, BBI); if (PtrTy) { - Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType(); + Type *ElTy = + cast<VectorType>(PtrTy->getPointerElementType())->getElementType(); if (!CV[0]) { Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace()); CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0"); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 3799d2dd1cf2..ee17da1875e5 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -78,6 +78,79 @@ static cl::opt<bool> UserSinkCommonInsts( STATISTIC(NumSimpl, "Number of blocks simplified"); +static bool +performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs, + std::vector<DominatorTree::UpdateType> *Updates) { + SmallVector<PHINode *, 1> NewOps; + + // We don't want to change IR just because we can. + // Only do that if there are at least two blocks we'll tail-merge. + if (BBs.size() < 2) + return false; + + if (Updates) + Updates->reserve(Updates->size() + BBs.size()); + + BasicBlock *CanonicalBB; + Instruction *CanonicalTerm; + { + auto *Term = BBs[0]->getTerminator(); + + // Create a canonical block for this function terminator type now, + // placing it *before* the first block that will branch to it. + CanonicalBB = BasicBlock::Create( + F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]); + // We'll also need a PHI node per each operand of the terminator. + NewOps.resize(Term->getNumOperands()); + for (auto I : zip(Term->operands(), NewOps)) { + std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(), + /*NumReservedValues=*/BBs.size(), + CanonicalBB->getName() + ".op"); + CanonicalBB->getInstList().push_back(std::get<1>(I)); + } + // Make it so that this canonical block actually has the right + // terminator. + CanonicalTerm = Term->clone(); + CanonicalBB->getInstList().push_back(CanonicalTerm); + // If the canonical terminator has operands, rewrite it to take PHI's. + for (auto I : zip(NewOps, CanonicalTerm->operands())) + std::get<1>(I) = std::get<0>(I); + } + + // Now, go through each block (with the current terminator type) + // we've recorded, and rewrite it to branch to the new common block. + const DILocation *CommonDebugLoc = nullptr; + for (BasicBlock *BB : BBs) { + auto *Term = BB->getTerminator(); + assert(Term->getOpcode() == CanonicalTerm->getOpcode() && + "All blocks to be tail-merged must be the same " + "(function-terminating) terminator type."); + + // Aha, found a new non-canonical function terminator. If it has operands, + // forward them to the PHI nodes in the canonical block. + for (auto I : zip(Term->operands(), NewOps)) + std::get<1>(I)->addIncoming(std::get<0>(I), BB); + + // Compute the debug location common to all the original terminators. 
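performBlockTailMerging, the helper introduced in this hunk, funnels every block that ends in the same kind of function terminator into one canonical block (named common.<opcode>, e.g. common.ret) whose PHI nodes merge the terminator operands. A C-level sketch of a function it would reshape (the source is illustrative; the block and PHI names follow the code in the hunk):

int classify(int x) {
  if (x < 0)
    return -1;   /* three blocks, each ending in its own ret ...            */
  if (x == 0)
    return 0;
  return 1;      /* ... get rewritten to branch to a single common.ret      */
}                /* block whose PHI "common.ret.op" merges -1, 0 and 1.     */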
+ if (!CommonDebugLoc) + CommonDebugLoc = Term->getDebugLoc(); + else + CommonDebugLoc = + DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc()); + + // And turn BB into a block that just unconditionally branches + // to the canonical block. + Term->eraseFromParent(); + BranchInst::Create(CanonicalBB, BB); + if (Updates) + Updates->push_back({DominatorTree::Insert, BB, CanonicalBB}); + } + + CanonicalTerm->setDebugLoc(CommonDebugLoc); + + return true; +} + static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, DomTreeUpdater *DTU) { SmallMapVector<unsigned /*TerminatorOpcode*/, SmallVector<BasicBlock *, 2>, 4> @@ -133,73 +206,8 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, std::vector<DominatorTree::UpdateType> Updates; - for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) { - SmallVector<PHINode *, 1> NewOps; - - // We don't want to change IR just because we can. - // Only do that if there are at least two blocks we'll tail-merge. - if (BBs.size() < 2) - continue; - - Changed = true; - - if (DTU) - Updates.reserve(Updates.size() + BBs.size()); - - BasicBlock *CanonicalBB; - Instruction *CanonicalTerm; - { - auto *Term = BBs[0]->getTerminator(); - - // Create a canonical block for this function terminator type now, - // placing it *before* the first block that will branch to it. - CanonicalBB = BasicBlock::Create( - F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]); - // We'll also need a PHI node per each operand of the terminator. - NewOps.resize(Term->getNumOperands()); - for (auto I : zip(Term->operands(), NewOps)) { - std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(), - /*NumReservedValues=*/BBs.size(), - CanonicalBB->getName() + ".op"); - CanonicalBB->getInstList().push_back(std::get<1>(I)); - } - // Make it so that this canonical block actually has the right - // terminator. - CanonicalTerm = Term->clone(); - CanonicalBB->getInstList().push_back(CanonicalTerm); - // If the canonical terminator has operands, rewrite it to take PHI's. - for (auto I : zip(NewOps, CanonicalTerm->operands())) - std::get<1>(I) = std::get<0>(I); - } - - // Now, go through each block (with the current terminator type) - // we've recorded, and rewrite it to branch to the new common block. - const DILocation *CommonDebugLoc = nullptr; - for (BasicBlock *BB : BBs) { - auto *Term = BB->getTerminator(); - - // Aha, found a new non-canonical function terminator. If it has operands, - // forward them to the PHI nodes in the canonical block. - for (auto I : zip(Term->operands(), NewOps)) - std::get<1>(I)->addIncoming(std::get<0>(I), BB); - - // Compute the debug location common to all the original terminators. - if (!CommonDebugLoc) - CommonDebugLoc = Term->getDebugLoc(); - else - CommonDebugLoc = - DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc()); - - // And turn BB into a block that just unconditionally branches - // to the canonical block. - Term->eraseFromParent(); - BranchInst::Create(CanonicalBB, BB); - if (DTU) - Updates.push_back({DominatorTree::Insert, BB, CanonicalBB}); - } - - CanonicalTerm->setDebugLoc(CommonDebugLoc); - } + for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) + Changed |= performBlockTailMerging(F, BBs, DTU ? 
&Updates : nullptr); if (DTU) DTU->applyUpdates(Updates); @@ -313,7 +321,7 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.SinkCommonInsts = UserSinkCommonInsts; } -SimplifyCFGPass::SimplifyCFGPass() : Options() { +SimplifyCFGPass::SimplifyCFGPass() { applyCommandLineOverridesToOptions(Options); } diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp index fdc914a72bfd..c734611836eb 100644 --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -22,19 +22,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-emit-printf" -static bool isCString(const Value *Arg) { - auto Ty = Arg->getType(); - auto PtrTy = dyn_cast<PointerType>(Ty); - if (!PtrTy) - return false; - - auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType()); - if (!IntTy) - return false; - - return IntTy->getBitWidth() == 8; -} - static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) { auto Int64Ty = Builder.getInt64Ty(); auto Ty = Arg->getType(); @@ -176,13 +163,15 @@ static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str, static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg, bool IsLast) { + Arg = Builder.CreateBitCast( + Arg, Builder.getInt8PtrTy(Arg->getType()->getPointerAddressSpace())); auto Length = getStrlenWithNull(Builder, Arg); return callAppendStringN(Builder, Desc, Arg, Length, IsLast); } static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, bool SpecIsCString, bool IsLast) { - if (SpecIsCString && isCString(Arg)) { + if (SpecIsCString && isa<PointerType>(Arg->getType())) { return appendString(Builder, Desc, Arg, IsLast); } // If the format specifies a string but the argument is not, the frontend will diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 580cfd80141e..97f11ca71726 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -34,6 +34,7 @@ STATISTIC(NumReadNone, "Number of functions inferred as readnone"); STATISTIC(NumInaccessibleMemOnly, "Number of functions inferred as inaccessiblememonly"); STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumWriteOnly, "Number of functions inferred as writeonly"); STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly"); STATISTIC(NumInaccessibleMemOrArgMemOnly, "Number of functions inferred as inaccessiblemem_or_argmemonly"); @@ -71,6 +72,19 @@ static bool setOnlyReadsMemory(Function &F) { return true; } +static bool setOnlyWritesMemory(Function &F) { + if (F.onlyWritesMemory()) // writeonly or readnone + return false; + // Turn readonly and writeonly into readnone. 
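+ // (A function that is already readonly and is now also writeonly can neither + // read nor write memory, which is exactly readnone.)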
+ if (F.hasFnAttribute(Attribute::ReadOnly)) { + F.removeFnAttr(Attribute::ReadOnly); + return setDoesNotAccessMemory(F); + } + ++NumWriteOnly; + F.setOnlyWritesMemory(); + return true; +} + static bool setOnlyAccessesArgMemory(Function &F) { if (F.onlyAccessesArgMemory()) return false; @@ -233,6 +247,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { switch (TheLibFunc) { case LibFunc_strlen: + case LibFunc_strnlen: case LibFunc_wcslen: Changed |= setOnlyReadsMemory(F); Changed |= setDoesNotThrow(F); @@ -400,6 +415,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; + case LibFunc_aligned_alloc: + case LibFunc_valloc: case LibFunc_malloc: case LibFunc_vec_malloc: Changed |= setOnlyAccessesInaccessibleMemory(F); @@ -484,6 +501,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_realloc: case LibFunc_vec_realloc: + case LibFunc_reallocf: Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setRetNoUndef(F); Changed |= setDoesNotThrow(F); @@ -492,11 +510,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setArgNoUndef(F, 1); return Changed; - case LibFunc_reallocf: - Changed |= setRetNoUndef(F); - Changed |= setWillReturn(F); - Changed |= setArgNoUndef(F, 1); - return Changed; case LibFunc_read: // May throw; "read" is a valid pthread cancellation point. Changed |= setRetAndArgsNoUndef(F); @@ -536,13 +549,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_aligned_alloc: - Changed |= setOnlyAccessesInaccessibleMemory(F); - Changed |= setRetAndArgsNoUndef(F); - Changed |= setDoesNotThrow(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; case LibFunc_bcopy: Changed |= setDoesNotThrow(F); Changed |= setOnlyAccessesArgMemory(F); @@ -569,6 +575,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_calloc: case LibFunc_vec_calloc: + Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); @@ -851,13 +858,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_valloc: - Changed |= setOnlyAccessesInaccessibleMemory(F); - Changed |= setRetAndArgsNoUndef(F); - Changed |= setDoesNotThrow(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; case LibFunc_vprintf: Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -1020,12 +1020,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_memset_pattern4: case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: - Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); - Changed |= setOnlyWritesMemory(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); - return Changed; + LLVM_FALLTHROUGH; case LibFunc_memset: Changed |= setWillReturn(F); LLVM_FALLTHROUGH; @@ -1158,7 +1156,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case 
LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: - case LibFunc_strnlen: case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanh: @@ -1171,6 +1168,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_truncl: Changed |= setDoesNotThrow(F); Changed |= setDoesNotFreeMemory(F); + Changed |= setOnlyWritesMemory(F); Changed |= setWillReturn(F); return Changed; default: diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp index b2763900e154..ac3839f2a4ab 100644 --- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp +++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp @@ -20,8 +20,7 @@ using namespace llvm; bool CallGraphUpdater::finalize() { if (!DeadFunctionsInComdats.empty()) { - filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(), - DeadFunctionsInComdats); + filterDeadComdatFunctions(DeadFunctionsInComdats); DeadFunctions.append(DeadFunctionsInComdats.begin(), DeadFunctionsInComdats.end()); } diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index ebe19f1751e5..56b6e4bc46a5 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -500,7 +500,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, CB.setArgOperand(ArgNo, Cast); // Remove any incompatible attributes for the argument. - AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo)); + AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo)); ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); // We may have a different byval/inalloca type. @@ -518,7 +518,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, // If the return type of the call site doesn't match that of the callee, cast // the returned value to the appropriate type. // Remove any incompatible return value attribute. - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs()); if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { createRetBitCast(CB, CallSiteRetTy, RetBitCast); RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 96aff563aa9b..24cd5747c5a4 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -829,39 +829,54 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, default: RetTy = Type::getInt16Ty(header->getContext()); break; } - std::vector<Type *> paramTy; + std::vector<Type *> ParamTy; + std::vector<Type *> AggParamTy; + ValueSet StructValues; // Add the types of the input values to the function's argument list for (Value *value : inputs) { LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n"); - paramTy.push_back(value->getType()); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) { + AggParamTy.push_back(value->getType()); + StructValues.insert(value); + } else + ParamTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. 
for (Value *output : outputs) { LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n"); - if (AggregateArgs) - paramTy.push_back(output->getType()); - else - paramTy.push_back(PointerType::getUnqual(output->getType())); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) { + AggParamTy.push_back(output->getType()); + StructValues.insert(output); + } else + ParamTy.push_back(PointerType::getUnqual(output->getType())); + } + + assert( + (ParamTy.size() + AggParamTy.size()) == + (inputs.size() + outputs.size()) && + "Number of scalar and aggregate params does not match inputs, outputs"); + assert((StructValues.empty() || AggregateArgs) && + "Expected StructValues only with AggregateArgs set"); + + // Concatenate scalar and aggregate params in ParamTy. + size_t NumScalarParams = ParamTy.size(); + StructType *StructTy = nullptr; + if (AggregateArgs && !AggParamTy.empty()) { + StructTy = StructType::get(M->getContext(), AggParamTy); + ParamTy.push_back(PointerType::getUnqual(StructTy)); } LLVM_DEBUG({ dbgs() << "Function type: " << *RetTy << " f("; - for (Type *i : paramTy) + for (Type *i : ParamTy) dbgs() << *i << ", "; dbgs() << ")\n"; }); - StructType *StructTy = nullptr; - if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { - StructTy = StructType::get(M->getContext(), paramTy); - paramTy.clear(); - paramTy.push_back(PointerType::getUnqual(StructTy)); - } - FunctionType *funcType = - FunctionType::get(RetTy, paramTy, - AllowVarArgs && oldFunction->isVarArg()); + FunctionType *funcType = FunctionType::get( + RetTy, ParamTy, AllowVarArgs && oldFunction->isVarArg()); std::string SuffixToUse = Suffix.empty() @@ -871,13 +886,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Function *newFunction = Function::Create( funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(), oldFunction->getName() + "." + SuffixToUse, M); - // If the old function is no-throw, so is the new one. - if (oldFunction->doesNotThrow()) - newFunction->setDoesNotThrow(); - - // Inherit the uwtable attribute if we need to. - if (oldFunction->hasUWTable()) - newFunction->setHasUWTable(); // Inherit all of the target dependent attributes and white-listed // target independent attributes. @@ -893,53 +901,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, } else switch (Attr.getKindAsEnum()) { // Those attributes cannot be propagated safely. Explicitly list them - // here so we get a warning if new attributes are added. This list also - // includes non-function attributes. - case Attribute::Alignment: + // here so we get a warning if new attributes are added.
case Attribute::AllocSize: case Attribute::ArgMemOnly: case Attribute::Builtin: - case Attribute::ByVal: case Attribute::Convergent: - case Attribute::Dereferenceable: - case Attribute::DereferenceableOrNull: - case Attribute::ElementType: - case Attribute::InAlloca: - case Attribute::InReg: case Attribute::InaccessibleMemOnly: case Attribute::InaccessibleMemOrArgMemOnly: case Attribute::JumpTable: case Attribute::Naked: - case Attribute::Nest: - case Attribute::NoAlias: case Attribute::NoBuiltin: - case Attribute::NoCapture: case Attribute::NoMerge: case Attribute::NoReturn: case Attribute::NoSync: - case Attribute::NoUndef: - case Attribute::None: - case Attribute::NonNull: - case Attribute::Preallocated: case Attribute::ReadNone: case Attribute::ReadOnly: - case Attribute::Returned: case Attribute::ReturnsTwice: - case Attribute::SExt: case Attribute::Speculatable: case Attribute::StackAlignment: - case Attribute::StructRet: - case Attribute::SwiftError: - case Attribute::SwiftSelf: - case Attribute::SwiftAsync: case Attribute::WillReturn: case Attribute::WriteOnly: - case Attribute::ZExt: - case Attribute::ImmArg: - case Attribute::ByRef: - case Attribute::EndAttrKinds: - case Attribute::EmptyKey: - case Attribute::TombstoneKey: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: @@ -980,30 +961,62 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::MustProgress: case Attribute::NoProfile: break; + // These attributes cannot be applied to functions. + case Attribute::Alignment: + case Attribute::ByVal: + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: + case Attribute::ElementType: + case Attribute::InAlloca: + case Attribute::InReg: + case Attribute::Nest: + case Attribute::NoAlias: + case Attribute::NoCapture: + case Attribute::NoUndef: + case Attribute::NonNull: + case Attribute::Preallocated: + case Attribute::Returned: + case Attribute::SExt: + case Attribute::StructRet: + case Attribute::SwiftError: + case Attribute::SwiftSelf: + case Attribute::SwiftAsync: + case Attribute::ZExt: + case Attribute::ImmArg: + case Attribute::ByRef: + // These are not really attributes. + case Attribute::None: + case Attribute::EndAttrKinds: + case Attribute::EmptyKey: + case Attribute::TombstoneKey: + llvm_unreachable("Not a function attribute"); } newFunction->addFnAttr(Attr); } newFunction->getBasicBlockList().push_back(newRootNode); - // Create an iterator to name all of the arguments we inserted. - Function::arg_iterator AI = newFunction->arg_begin(); + // Create scalar and aggregate iterators to name all of the arguments we + // inserted. + Function::arg_iterator ScalarAI = newFunction->arg_begin(); + Function::arg_iterator AggAI = std::next(ScalarAI, NumScalarParams); // Rewrite all users of the inputs in the extracted region to use the // arguments (or appropriate addressing into struct) instead. 
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) { + for (unsigned i = 0, e = inputs.size(), aggIdx = 0; i != e; ++i) { Value *RewriteVal; - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(inputs[i])) { Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext())); - Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx); Instruction *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); - RewriteVal = new LoadInst(StructTy->getElementType(i), GEP, + StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI); + RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP, "loadgep_" + inputs[i]->getName(), TI); + ++aggIdx; } else - RewriteVal = &*AI++; + RewriteVal = &*ScalarAI++; std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (User *use : Users) @@ -1013,12 +1026,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, } // Set names for input and output arguments. - if (!AggregateArgs) { - AI = newFunction->arg_begin(); - for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI) - AI->setName(inputs[i]->getName()); - for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI) - AI->setName(outputs[i]->getName()+".out"); + if (NumScalarParams) { + ScalarAI = newFunction->arg_begin(); + for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++ScalarAI) + if (!StructValues.contains(inputs[i])) + ScalarAI->setName(inputs[i]->getName()); + for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++ScalarAI) + if (!StructValues.contains(outputs[i])) + ScalarAI->setName(outputs[i]->getName() + ".out"); } // Rewrite branches to basic blocks outside of the loop to new dummy blocks @@ -1126,7 +1141,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plan inputs and allocated memory for outputs - std::vector<Value *> params, StructValues, ReloadOutputs, Reloads; + std::vector<Value *> params, ReloadOutputs, Reloads; + ValueSet StructValues; Module *M = newFunction->getParent(); LLVMContext &Context = M->getContext(); @@ -1134,23 +1150,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, CallInst *call = nullptr; // Add inputs as params, or to be filled into the struct - unsigned ArgNo = 0; + unsigned ScalarInputArgNo = 0; SmallVector<unsigned, 1> SwiftErrorArgs; for (Value *input : inputs) { - if (AggregateArgs) - StructValues.push_back(input); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(input)) + StructValues.insert(input); else { params.push_back(input); if (input->isSwiftError()) - SwiftErrorArgs.push_back(ArgNo); + SwiftErrorArgs.push_back(ScalarInputArgNo); } - ++ArgNo; + ++ScalarInputArgNo; } // Create allocas for the outputs + unsigned ScalarOutputArgNo = 0; for (Value *output : outputs) { - if (AggregateArgs) { - StructValues.push_back(output); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) { + StructValues.insert(output); } else { AllocaInst *alloca = new AllocaInst(output->getType(), DL.getAllocaAddrSpace(), @@ -1158,12 +1175,14 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca); 
params.push_back(alloca); + ++ScalarOutputArgNo; } } StructType *StructArgTy = nullptr; AllocaInst *Struct = nullptr; - if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { + unsigned NumAggregatedInputs = 0; + if (AggregateArgs && !StructValues.empty()) { std::vector<Type *> ArgTypes; for (Value *V : StructValues) ArgTypes.push_back(V->getType()); @@ -1175,14 +1194,18 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, &codeReplacer->getParent()->front().front()); params.push_back(Struct); - for (unsigned i = 0, e = inputs.size(); i != e; ++i) { - Value *Idx[2]; - Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); - GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); - codeReplacer->getInstList().push_back(GEP); - new StoreInst(StructValues[i], GEP, codeReplacer); + // Store aggregated inputs in the struct. + for (unsigned i = 0, e = StructValues.size(); i != e; ++i) { + if (inputs.contains(StructValues[i])) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); + codeReplacer->getInstList().push_back(GEP); + new StoreInst(StructValues[i], GEP, codeReplacer); + NumAggregatedInputs++; + } } } @@ -1205,24 +1228,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError); } - Function::arg_iterator OutputArgBegin = newFunction->arg_begin(); - unsigned FirstOut = inputs.size(); - if (!AggregateArgs) - std::advance(OutputArgBegin, inputs.size()); - - // Reload the outputs passed in by reference. - for (unsigned i = 0, e = outputs.size(); i != e; ++i) { + // Reload the outputs passed in by reference, use the struct if output is in + // the aggregate or reload from the scalar argument. + for (unsigned i = 0, e = outputs.size(), scalarIdx = 0, + aggIdx = NumAggregatedInputs; + i != e; ++i) { Value *Output = nullptr; - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(outputs[i])) { Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx); GetElementPtrInst *GEP = GetElementPtrInst::Create( StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName()); codeReplacer->getInstList().push_back(GEP); Output = GEP; + ++aggIdx; } else { - Output = ReloadOutputs[i]; + Output = ReloadOutputs[scalarIdx]; + ++scalarIdx; } LoadInst *load = new LoadInst(outputs[i]->getType(), Output, outputs[i]->getName() + ".reload", @@ -1304,8 +1327,13 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, // Store the arguments right after the definition of output value. // This should be proceeded after creating exit stubs to be ensure that invoke // result restore will be placed in the outlined function. 
- Function::arg_iterator OAI = OutputArgBegin; - for (unsigned i = 0, e = outputs.size(); i != e; ++i) { + Function::arg_iterator ScalarOutputArgBegin = newFunction->arg_begin(); + std::advance(ScalarOutputArgBegin, ScalarInputArgNo); + Function::arg_iterator AggOutputArgBegin = newFunction->arg_begin(); + std::advance(AggOutputArgBegin, ScalarInputArgNo + ScalarOutputArgNo); + + for (unsigned i = 0, e = outputs.size(), aggIdx = NumAggregatedInputs; i != e; + ++i) { auto *OutI = dyn_cast<Instruction>(outputs[i]); if (!OutI) continue; @@ -1325,23 +1353,27 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, assert((InsertBefore->getFunction() == newFunction || Blocks.count(InsertBefore->getParent())) && "InsertPt should be in new function"); - assert(OAI != newFunction->arg_end() && - "Number of output arguments should match " - "the amount of defined values"); - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(outputs[i])) { + assert(AggOutputArgBegin != newFunction->arg_end() && + "Number of aggregate output arguments should match " + "the number of defined values"); Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), + StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(), InsertBefore); new StoreInst(outputs[i], GEP, InsertBefore); + ++aggIdx; // Since there should be only one struct argument aggregating - // all the output values, we shouldn't increment OAI, which always - // points to the struct argument, in this case. + // all the output values, we shouldn't increment AggOutputArgBegin, which + // always points to the struct argument, in this case. } else { - new StoreInst(outputs[i], &*OAI, InsertBefore); - ++OAI; + assert(ScalarOutputArgBegin != newFunction->arg_end() && + "Number of scalar output arguments should match " + "the number of defined values"); + new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore); + ++ScalarOutputArgBegin; } } @@ -1840,3 +1872,7 @@ bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc, } return false; } + +void CodeExtractor::excludeArgFromAggregate(Value *Arg) { + ExcludeArgsFromAggregate.insert(Arg); +} diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index 91630d876fc8..e73287c060ae 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -122,129 +122,114 @@ isSimpleEnoughValueToCommit(Constant *C, return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL); } -/// Return true if this constant is simple enough for us to understand. In -/// particular, if it is a cast to anything other than from one pointer type to -/// another pointer type, we punt. We basically just support direct accesses to -/// globals and GEP's of globals. This should be kept up to date with -/// CommitValueTo. -static bool isSimpleEnoughPointerToCommit(Constant *C, const DataLayout &DL) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) - // Do not allow weak/*_odr/linkonce linkage or external globals. - return GV->hasUniqueInitializer(); - - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { - // Handle a constantexpr gep. 
- if (CE->getOpcode() == Instruction::GetElementPtr && - isa<GlobalVariable>(CE->getOperand(0)) && - cast<GEPOperator>(CE)->isInBounds()) { - GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. - if (!GV->hasUniqueInitializer()) - return false; +void Evaluator::MutableValue::clear() { + if (auto *Agg = Val.dyn_cast<MutableAggregate *>()) + delete Agg; + Val = nullptr; +} - // The first index must be zero. - ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin())); - if (!CI || !CI->isZero()) return false; +Constant *Evaluator::MutableValue::read(Type *Ty, APInt Offset, + const DataLayout &DL) const { + TypeSize TySize = DL.getTypeStoreSize(Ty); + const MutableValue *V = this; + while (const auto *Agg = V->Val.dyn_cast<MutableAggregate *>()) { + Type *AggTy = Agg->Ty; + Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset); + if (!Index || Index->uge(Agg->Elements.size()) || + !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy))) + return nullptr; + + V = &Agg->Elements[Index->getZExtValue()]; + } - // The remaining indices must be compile-time known integers within the - // notional bounds of the corresponding static array types. - if (!CE->isGEPWithNoNotionalOverIndexing()) - return false; + return ConstantFoldLoadFromConst(V->Val.get<Constant *>(), Ty, Offset, DL); +} - return ConstantFoldLoadThroughGEPConstantExpr( - GV->getInitializer(), CE, - cast<GEPOperator>(CE)->getResultElementType(), DL); - } else if (CE->getOpcode() == Instruction::BitCast && - isa<GlobalVariable>(CE->getOperand(0))) { - // A constantexpr bitcast from a pointer to another pointer is a no-op, - // and we know how to evaluate it by moving the bitcast from the pointer - // operand to the value operand. - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. - return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer(); - } - } +bool Evaluator::MutableValue::makeMutable() { + Constant *C = Val.get<Constant *>(); + Type *Ty = C->getType(); + unsigned NumElements; + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + NumElements = VT->getNumElements(); + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) + NumElements = AT->getNumElements(); + else if (auto *ST = dyn_cast<StructType>(Ty)) + NumElements = ST->getNumElements(); + else + return false; - return false; + MutableAggregate *MA = new MutableAggregate(Ty); + MA->Elements.reserve(NumElements); + for (unsigned I = 0; I < NumElements; ++I) + MA->Elements.push_back(C->getAggregateElement(I)); + Val = MA; + return true; } -/// Apply \p TryLoad to Ptr. If this returns \p nullptr, introspect the -/// pointer's type and walk down through the initial elements to obtain -/// additional pointers to try. Returns the first non-null return value from -/// \p TryLoad, or \p nullptr if the type can't be introspected further. -static Constant * -evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL, - const TargetLibraryInfo *TLI, - std::function<Constant *(Constant *)> TryLoad) { - Constant *Val; - while (!(Val = TryLoad(Ptr))) { - // If Ty is a non-opaque struct, we can convert the pointer to the struct - // into a pointer to its first member. - // FIXME: This could be extended to support arrays as well. 
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType(); - if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque()) - break; - - IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32); - Constant *IdxZero = ConstantInt::get(IdxTy, 0, false); - Constant *const IdxList[] = {IdxZero, IdxZero}; - - Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList); - Ptr = ConstantFoldConstant(Ptr, DL, TLI); +bool Evaluator::MutableValue::write(Constant *V, APInt Offset, + const DataLayout &DL) { + Type *Ty = V->getType(); + TypeSize TySize = DL.getTypeStoreSize(Ty); + MutableValue *MV = this; + while (Offset != 0 || + !CastInst::isBitOrNoopPointerCastable(Ty, MV->getType(), DL)) { + if (MV->Val.is<Constant *>() && !MV->makeMutable()) + return false; + + MutableAggregate *Agg = MV->Val.get<MutableAggregate *>(); + Type *AggTy = Agg->Ty; + Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset); + if (!Index || Index->uge(Agg->Elements.size()) || + !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy))) + return false; + + MV = &Agg->Elements[Index->getZExtValue()]; } - return Val; + + Type *MVType = MV->getType(); + MV->clear(); + if (Ty->isIntegerTy() && MVType->isPointerTy()) + MV->Val = ConstantExpr::getIntToPtr(V, MVType); + else if (Ty->isPointerTy() && MVType->isIntegerTy()) + MV->Val = ConstantExpr::getPtrToInt(V, MVType); + else if (Ty != MVType) + MV->Val = ConstantExpr::getBitCast(V, MVType); + else + MV->Val = V; + return true; } -static Constant *getInitializer(Constant *C) { - auto *GV = dyn_cast<GlobalVariable>(C); - return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr; +Constant *Evaluator::MutableAggregate::toConstant() const { + SmallVector<Constant *, 32> Consts; + for (const MutableValue &MV : Elements) + Consts.push_back(MV.toConstant()); + + if (auto *ST = dyn_cast<StructType>(Ty)) + return ConstantStruct::get(ST, Consts); + if (auto *AT = dyn_cast<ArrayType>(Ty)) + return ConstantArray::get(AT, Consts); + assert(isa<FixedVectorType>(Ty) && "Must be vector"); + return ConstantVector::get(Consts); } /// Return the value that would be computed by a load from P after the stores /// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P, Type *Ty) { - // If this memory location has been recently stored, use the stored value: it - // is the most up-to-date. - auto TryFindMemLoc = [this](Constant *Ptr) { - return MutatedMemory.lookup(Ptr); - }; - - if (Constant *Val = TryFindMemLoc(P)) - return Val; - - // Access it. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { - if (GV->hasDefinitiveInitializer()) - return GV->getInitializer(); + APInt Offset(DL.getIndexTypeSizeInBits(P->getType()), 0); + P = cast<Constant>(P->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true)); + Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(P->getType())); + auto *GV = dyn_cast<GlobalVariable>(P); + if (!GV) return nullptr; - } - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) { - switch (CE->getOpcode()) { - // Handle a constantexpr getelementptr. - case Instruction::GetElementPtr: - if (auto *I = getInitializer(CE->getOperand(0))) - return ConstantFoldLoadThroughGEPConstantExpr(I, CE, Ty, DL); - break; - // Handle a constantexpr bitcast. - case Instruction::BitCast: - // We're evaluating a load through a pointer that was bitcast to a - // different type. See if the "from" pointer has recently been stored. 
- // If it hasn't, we may still be able to find a stored pointer by - // introspecting the type. - Constant *Val = - evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryFindMemLoc); - if (!Val) - Val = getInitializer(CE->getOperand(0)); - if (Val) - return ConstantFoldLoadThroughBitcast( - Val, P->getType()->getPointerElementType(), DL); - break; - } - } + auto It = MutatedMemory.find(GV); + if (It != MutatedMemory.end()) + return It->second.read(Ty, Offset, DL); - return nullptr; // don't know how to evaluate. + if (!GV->hasDefinitiveInitializer()) + return nullptr; + return ConstantFoldLoadFromConst(GV->getInitializer(), Ty, Offset, DL); } static Function *getFunction(Constant *C) { @@ -260,17 +245,10 @@ static Function *getFunction(Constant *C) { Function * Evaluator::getCalleeWithFormalArgs(CallBase &CB, SmallVectorImpl<Constant *> &Formals) { - auto *V = CB.getCalledOperand(); + auto *V = CB.getCalledOperand()->stripPointerCasts(); if (auto *Fn = getFunction(getVal(V))) return getFormalParams(CB, Fn, Formals) ? Fn : nullptr; - - auto *CE = dyn_cast<ConstantExpr>(V); - if (!CE || CE->getOpcode() != Instruction::BitCast || - !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals)) - return nullptr; - - return dyn_cast<Function>( - ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL)); + return nullptr; } bool Evaluator::getFormalParams(CallBase &CB, Function *F, @@ -299,17 +277,13 @@ bool Evaluator::getFormalParams(CallBase &CB, Function *F, /// If call expression contains bitcast then we may need to cast /// evaluated return value to a type of the call expression. -Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) { - ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr); - if (!RV || !CE || CE->getOpcode() != Instruction::BitCast) +Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) { + if (!RV || RV->getType() == ReturnType) return RV; - if (auto *FT = - dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) { - RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL); - if (!RV) - LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); - } + RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL); + if (!RV) + LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); return RV; } @@ -337,68 +311,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, Ptr = FoldedPtr; LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n"); } - // Conservatively, avoid aggregate types. This is because we don't - // want to worry about them partially overlapping other stores. - if (!SI->getValueOperand()->getType()->isSingleValueType() || - !isSimpleEnoughPointerToCommit(Ptr, DL)) { - // If this is too complex for us to commit, reject it. - LLVM_DEBUG( - dbgs() << "Pointer is too complex for us to evaluate store."); + + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = cast<Constant>(Ptr->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true)); + Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(Ptr->getType())); + auto *GV = dyn_cast<GlobalVariable>(Ptr); + if (!GV || !GV->hasUniqueInitializer()) { + LLVM_DEBUG(dbgs() << "Store is not to global with unique initializer: " + << *Ptr << "\n"); return false; } - Constant *Val = getVal(SI->getOperand(0)); - // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. 
+ Constant *Val = getVal(SI->getOperand(0)); if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val << "\n"); return false; } - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { - if (CE->getOpcode() == Instruction::BitCast) { - LLVM_DEBUG(dbgs() - << "Attempting to resolve bitcast on constant ptr.\n"); - // If we're evaluating a store through a bitcast, then we need - // to pull the bitcast off the pointer type and push it onto the - // stored value. In order to push the bitcast onto the stored value, - // a bitcast from the pointer's element type to Val's type must be - // legal. If it's not, we can try introspecting the type to find a - // legal conversion. - - auto TryCastValTy = [&](Constant *P) -> Constant * { - // The conversion is illegal if the store is wider than the - // pointee proposed by `evaluateBitcastFromPtr`, since that would - // drop stores to other struct elements when the caller attempts to - // look through a struct's 0th element. - Type *NewTy = cast<PointerType>(P->getType())->getElementType(); - Type *STy = Val->getType(); - if (DL.getTypeSizeInBits(NewTy) < DL.getTypeSizeInBits(STy)) - return nullptr; - - if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, NewTy, DL)) { - Ptr = P; - return FV; - } - return nullptr; - }; - - Constant *NewVal = - evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryCastValTy); - if (!NewVal) { - LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " - "evaluate.\n"); - return false; - } - - Val = NewVal; - LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); - } - } - - MutatedMemory[Ptr] = Val; + auto Res = MutatedMemory.try_emplace(GV, GV->getInitializer()); + if (!Res.first->second.write(Val, Offset, DL)) + return false; } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) { InstResult = ConstantExpr::get(BO->getOpcode(), getVal(BO->getOperand(0)), @@ -593,7 +529,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) { - InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C); + InstResult = castCallResultIfNeeded(CB.getType(), C); if (!InstResult) return false; LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " @@ -617,7 +553,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, return false; } ValueStack.pop_back(); - InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal); + InstResult = castCallResultIfNeeded(CB.getType(), RetVal); if (RetVal && !InstResult) return false; diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 9bfc73e4ba6c..f8ec8c6ad426 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -66,8 +66,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, for (const Use &U : V->uses()) { const User *UR = U.getUser(); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { - GS.HasNonInstructionUser = true; - // If the result of the constantexpr isn't pointer type, then we won't // know to expect it in various places. Just reject early. if (!isa<PointerType>(CE->getType())) @@ -105,9 +103,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, // value, not an aggregate), keep more specific information about // stores. 
if (GS.StoredType != GlobalStatus::Stored) { - const Value *Ptr = SI->getPointerOperand(); - if (isa<ConstantExpr>(Ptr)) - Ptr = Ptr->stripPointerCasts(); + const Value *Ptr = SI->getPointerOperand()->stripPointerCasts(); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { Value *StoredVal = SI->getOperand(0); @@ -174,12 +170,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, return true; // Any other non-load instruction might take address! } } else if (const Constant *C = dyn_cast<Constant>(UR)) { - GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. if (!isSafeToDestroyConstant(C)) return true; } else { - GS.HasNonInstructionUser = true; // Otherwise must be some other user. return true; } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 997667810580..c9f872f5b7e1 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1185,10 +1185,10 @@ static bool MayContainThrowingOrExitingCall(Instruction *Begin, static AttrBuilder IdentifyValidAttributes(CallBase &CB) { - AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex); - if (AB.empty()) + AttrBuilder AB(CB.getContext(), CB.getAttributes().getRetAttrs()); + if (!AB.hasAttributes()) return AB; - AttrBuilder Valid; + AttrBuilder Valid(CB.getContext()); // Only allow these white listed attributes to be propagated back to the // callee. This is because other attributes may only be valid on the call // itself, i.e. attributes such as signext and zeroext. @@ -1208,7 +1208,7 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { return; AttrBuilder Valid = IdentifyValidAttributes(CB); - if (Valid.empty()) + if (!Valid.hasAttributes()) return; auto *CalledFunction = CB.getCalledFunction(); auto &Context = CalledFunction->getContext(); @@ -1667,7 +1667,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, Module *Mod = CB.getModule(); assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function"); bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV, - IsClaimRV = !IsRetainRV; + IsUnsafeClaimRV = !IsRetainRV; for (auto *RI : Returns) { Value *RetOpnd = objcarc::GetRCIdentityRoot(RI->getOperand(0)); @@ -1694,7 +1694,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // and erase the autoreleaseRV call. // - If retainRV is attached to the call, just erase the autoreleaseRV // call. - if (IsClaimRV) { + if (IsUnsafeClaimRV) { Builder.SetInsertPoint(II); Function *IFn = Intrinsic::getDeclaration(Mod, Intrinsic::objc_release); diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index 668626fef933..72b864dc3e48 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -339,8 +339,10 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI, #ifdef EXPENSIVE_CHECKS // Verify all sub-loops are in LCSSA form already. - for (Loop *SubLoop: L) + for (Loop *SubLoop: L) { + (void)SubLoop; // Silence unused variable warning. 
assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!"); + } #endif SmallVector<BasicBlock *, 8> ExitBlocks; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ecad79b68185..9f33d2f82732 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -492,7 +492,7 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, } } - if (isAllocLikeFn(I, TLI)) + if (isAllocationFn(I, TLI) && isAllocRemovable(cast<CallBase>(I), TLI)) return true; if (CallInst *CI = isFreeCall(I, TLI)) @@ -2189,8 +2189,8 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { return NewCall; } -/// changeToCall - Convert the specified invoke into a normal call. -void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { +// changeToCall - Convert the specified invoke into a normal call. +CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { CallInst *NewCall = createCallMatchingInvoke(II); NewCall->takeName(II); NewCall->insertBefore(II); @@ -2207,6 +2207,7 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { II->eraseFromParent(); if (DTU) DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); + return NewCall; } BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, @@ -3147,11 +3148,6 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128) return false; // Can't do integer/elements > 128 bits. - Type *DemandedTy = ITy; - if (I->hasOneUse()) - if (auto *Trunc = dyn_cast<TruncInst>(I->user_back())) - DemandedTy = Trunc->getType(); - // Try to find all the pieces corresponding to the bswap. bool FoundRoot = false; std::map<Value *, Optional<BitPart>> BPS; @@ -3165,6 +3161,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( "Illegal bit provenance index"); // If the upper bits are zero, then attempt to perform as a truncated op. + Type *DemandedTy = ITy; if (BitProvenance.back() == BitPart::Unset) { while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset) BitProvenance = BitProvenance.drop_back(); diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 69fd110dc3c2..92333408aaef 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -359,7 +359,7 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) { // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, DominatorTree &DT, + unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, unsigned Threshold) { assert(LoopSize > 0 && "Zero loop size is not allowed!"); // Save the PP.PeelCount value set by the target in @@ -370,7 +370,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, return; // Only try to peel innermost loops by default. - // The constraint can be relaxed by the target in TTI.getUnrollingPreferences + // The constraint can be relaxed by the target in TTI.getPeelingPreferences // or by the flag -unroll-allow-loop-nests-peeling. if (!PP.AllowLoopNestsPeeling && !L->isInnermost()) return; @@ -407,8 +407,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, SmallDenseMap<PHINode *, Optional<unsigned> > IterationsToInvariance; // Now go through all Phis to calculate their the number of iterations they // need to become invariants. 
- // Start the max computation with the UP.PeelCount value set by the target - // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count. + // Start the max computation with the PP.PeelCount value set by the target + // in TTI.getPeelingPreferences or by the flag -unroll-peel-count. unsigned DesiredPeelCount = TargetPeelCount; BasicBlock *BackEdge = L->getLoopLatch(); assert(BackEdge && "Loop is not in simplified form?"); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index b0c622b98d5e..9ca1f4f44b97 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -99,6 +99,17 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden, #endif ); +static cl::opt<bool> +UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, + cl::desc("Verify loopinfo after unrolling"), +#ifdef EXPENSIVE_CHECKS + cl::init(true) +#else + cl::init(false) #endif + ); + + /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. @@ -764,6 +775,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // Apply updates to the DomTree. DT = &DTU.getDomTree(); + assert(!UnrollVerifyDomtree || + DT->verify(DominatorTree::VerificationLevel::Fast)); + // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC, @@ -777,6 +791,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (CompletelyUnroll) LI->erase(L); + // LoopInfo should still be valid after unrolling; confirm that. + if (UnrollVerifyLoopInfo) + LI->verify(*DT); + // After complete unrolling most of the blocks should be contained in OuterL. // However, some of them might happen to be out of OuterL (e.g. if they // precede a loop exit). In this case we might need to insert PHI nodes in diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 93157bd87c34..95db2fe8d310 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -1567,7 +1568,9 @@ Value *llvm::addRuntimeChecks( auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); LLVMContext &Ctx = Loc->getContext(); - IRBuilder<> ChkBuilder(Loc); + IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx, + Loc->getModule()->getDataLayout()); + ChkBuilder.SetInsertPoint(Loc); // Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 771b7d25b0f2..f0bf625fa18e 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -70,17 +71,14 @@ void LoopVersioning::versionLoop( "scev.check"); SCEVRuntimeCheck = Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator()); - auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); - - // Discard the SCEV runtime check if it is always true. - if (CI && CI->isZero()) - SCEVRuntimeCheck = nullptr; + IRBuilder<InstSimplifyFolder> Builder( + RuntimeCheckBB->getContext(), + InstSimplifyFolder(RuntimeCheckBB->getModule()->getDataLayout())); if (MemRuntimeCheck && SCEVRuntimeCheck) { - RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, - SCEVRuntimeCheck, "lver.safe"); - if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) - I->insertBefore(RuntimeCheckBB->getTerminator()); + Builder.SetInsertPoint(RuntimeCheckBB->getTerminator()); + RuntimeCheck = + Builder.CreateOr(MemRuntimeCheck, SCEVRuntimeCheck, "lver.safe"); } else RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; @@ -109,8 +107,9 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); - BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm); + Builder.SetInsertPoint(OrigTerm); + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 8dc4702993c3..3d75dd57456d 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -297,7 +297,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, Function *F = OrigBB->getParent(); const DataLayout &DL = F->getParent()->getDataLayout(); - Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType(); + Type *EltTy = SrcAddr->getType()->getPointerElementType(); // Create the a comparison of src and dst, based on which we jump to either // the forward-copy part of the function (if src >= dst) or the backwards-copy diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index bb5ff59cba4b..7c9ab7f6ca2c 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -178,66 +178,30 @@ llvm::getOrCreateSanitizerCtorAndInitFunctions( } void llvm::filterDeadComdatFunctions( - Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) { - // Build a map from the comdat to the number of entries in that comdat we - // think are dead. If this fully covers the comdat group, then the entire - // group is dead. If we find another entry in the comdat group though, we'll - // have to preserve the whole group. 
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered; + SmallVectorImpl<Function *> &DeadComdatFunctions) { + SmallPtrSet<Function *, 32> MaybeDeadFunctions; + SmallPtrSet<Comdat *, 32> MaybeDeadComdats; for (Function *F : DeadComdatFunctions) { - Comdat *C = F->getComdat(); - assert(C && "Expected all input GVs to be in a comdat!"); - ComdatEntriesCovered[C] += 1; + MaybeDeadFunctions.insert(F); + if (Comdat *C = F->getComdat()) + MaybeDeadComdats.insert(C); } - auto CheckComdat = [&](Comdat &C) { - auto CI = ComdatEntriesCovered.find(&C); - if (CI == ComdatEntriesCovered.end()) - return; - - // If this could have been covered by a dead entry, just subtract one to - // account for it. - if (CI->second > 0) { - CI->second -= 1; - return; - } - - // If we've already accounted for all the entries that were dead, the - // entire comdat is alive so remove it from the map. - ComdatEntriesCovered.erase(CI); - }; - - auto CheckAllComdats = [&] { - for (Function &F : M.functions()) - if (Comdat *C = F.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - for (GlobalVariable &GV : M.globals()) - if (Comdat *C = GV.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - for (GlobalAlias &GA : M.aliases()) - if (Comdat *C = GA.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - }; - CheckAllComdats(); - - if (ComdatEntriesCovered.empty()) { - DeadComdatFunctions.clear(); - return; + // Find comdats for which all users are dead now. + SmallPtrSet<Comdat *, 32> DeadComdats; + for (Comdat *C : MaybeDeadComdats) { + auto IsUserDead = [&](GlobalObject *GO) { + auto *F = dyn_cast<Function>(GO); + return F && MaybeDeadFunctions.contains(F); + }; + if (all_of(C->getUsers(), IsUserDead)) + DeadComdats.insert(C); } - // Remove the entries that were not covering. - erase_if(DeadComdatFunctions, [&](GlobalValue *GV) { - return ComdatEntriesCovered.find(GV->getComdat()) == - ComdatEntriesCovered.end(); + // Only keep functions which have no comdat or a dead comdat. + erase_if(DeadComdatFunctions, [&](Function *F) { + Comdat *C = F->getComdat(); + return C && !DeadComdats.contains(C); }); } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 2f2dff6b5f0b..961adf2570a7 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SampleProfileInference.h" +#include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" #include <queue> #include <set> @@ -144,7 +145,7 @@ public: /// A cost of decreasing the entry block's count by one. static constexpr int64_t AuxCostDecEntry = 10; /// A cost of taking an unlikely jump. - static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; + static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30; private: /// Check for existence of an augmenting path with a positive capacity. @@ -236,7 +237,7 @@ private: } } - /// An node in a flow network. + /// A node in a flow network. struct Node { /// The cost of the cheapest path from the source to the current node. 
int64_t Distance; @@ -303,13 +304,10 @@ public: rebalanceUnknownSubgraphs(); } - /// The probability for the first successor of a unknown subgraph - static constexpr double UnknownFirstSuccProbability = 0.5; - private: void joinIsolatedComponents() { // Find blocks that are reachable from the source - auto Visited = std::vector<bool>(NumBlocks(), false); + auto Visited = BitVector(NumBlocks(), false); findReachable(Func.Entry, Visited); // Iterate over all non-reachable blocks and adjust their weights @@ -334,7 +332,7 @@ private: /// Run BFS from a given block along the jumps with a positive flow and mark /// all reachable blocks. - void findReachable(uint64_t Src, std::vector<bool> &Visited) { + void findReachable(uint64_t Src, BitVector &Visited) { if (Visited[Src]) return; std::queue<uint64_t> Queue; @@ -452,44 +450,70 @@ private: uint64_t NumBlocks() const { return Func.Blocks.size(); } - /// Rebalance unknown subgraphs so as each branch splits with probabilities - /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability + /// Rebalance unknown subgraphs so that the flow is split evenly across the + /// outgoing branches of every block of the subgraph. The method iterates over + /// blocks with known weight and identifies unknown subgraphs rooted at the + /// blocks. Then it verifies if flow rebalancing is feasible and applies it. void rebalanceUnknownSubgraphs() { - assert(UnknownFirstSuccProbability >= 0.0 && - UnknownFirstSuccProbability <= 1.0 && - "the share of the unknown successor should be between 0 and 1"); - // Try to find unknown subgraphs from each non-unknown block + // Try to find unknown subgraphs from each block for (uint64_t I = 0; I < Func.Blocks.size(); I++) { auto SrcBlock = &Func.Blocks[I]; - // Do not attempt to find unknown successors from a unknown or a - // zero-flow block - if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0) + // Verify if rebalancing rooted at SrcBlock is feasible + if (!canRebalanceAtRoot(SrcBlock)) continue; - std::vector<FlowBlock *> UnknownSuccs; + // Find an unknown subgraph starting at SrcBlock. Along the way, + // fill in known destinations and intermediate unknown blocks. + std::vector<FlowBlock *> UnknownBlocks; + std::vector<FlowBlock *> KnownDstBlocks; + findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks); + + // Verify if rebalancing of the subgraph is feasible. If the search is + // successful, find the unique destination block (which can be null) FlowBlock *DstBlock = nullptr; - // Find a unknown subgraphs starting at block SrcBlock - if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks, + DstBlock)) continue; - // At the moment, we do not rebalance subgraphs containing cycles among - // unknown blocks - if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + + // We cannot rebalance subgraphs containing cycles among unknown blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks)) continue; // Rebalance the flow - rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs); + rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks); } } - /// Find a unknown subgraph starting at block SrcBlock. - /// If the search is successful, the method sets DstBlock and UnknownSuccs. - bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, - std::vector<FlowBlock *> &UnknownSuccs) { + /// Verify if rebalancing rooted at a given block is possible.
+  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
+    // Do not attempt to find unknown subgraphs from an unknown or a
+    // zero-flow block
+    if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+      return false;
+
+    // Do not attempt to process subgraphs from a block w/o unknown successors
+    bool HasUnknownSuccs = false;
+    for (auto Jump : SrcBlock->SuccJumps) {
+      if (Func.Blocks[Jump->Target].UnknownWeight) {
+        HasUnknownSuccs = true;
+        break;
+      }
+    }
+    if (!HasUnknownSuccs)
+      return false;
+
+    return true;
+  }
+
+  /// Find an unknown subgraph starting at block SrcBlock. The method sets
+  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
+  void findUnknownSubgraph(const FlowBlock *SrcBlock,
+                           std::vector<FlowBlock *> &KnownDstBlocks,
+                           std::vector<FlowBlock *> &UnknownBlocks) {
     // Run BFS from SrcBlock and make sure all paths are going through unknown
     // blocks and end at a non-unknown DstBlock
-    auto Visited = std::vector<bool>(NumBlocks(), false);
+    auto Visited = BitVector(NumBlocks(), false);
     std::queue<uint64_t> Queue;
-    DstBlock = nullptr;
     Queue.push(SrcBlock->Index);
     Visited[SrcBlock->Index] = true;
@@ -498,52 +522,105 @@ private:
       Queue.pop();
       // Process blocks reachable from Block
       for (auto Jump : Block.SuccJumps) {
+        // If Jump can be ignored, skip it
+        if (ignoreJump(SrcBlock, nullptr, Jump))
+          continue;
+
         uint64_t Dst = Jump->Target;
+        // If Dst has been visited, skip Jump
         if (Visited[Dst])
           continue;
+        // Process block Dst
         Visited[Dst] = true;
         if (!Func.Blocks[Dst].UnknownWeight) {
-          // If we see non-unique non-unknown block reachable from SrcBlock,
-          // stop processing and skip rebalancing
-          FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
-          if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
-            return false;
-          DstBlock = CandidateDstBlock;
+          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
         } else {
           Queue.push(Dst);
-          UnknownSuccs.push_back(&Func.Blocks[Dst]);
+          UnknownBlocks.push_back(&Func.Blocks[Dst]);
         }
       }
     }
+  }
+  /// Verify if rebalancing of the subgraph is feasible. If the checks are
+  /// successful, set the unique destination block, DstBlock (can be null).
+  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
+                            const std::vector<FlowBlock *> &KnownDstBlocks,
+                            const std::vector<FlowBlock *> &UnknownBlocks,
+                            FlowBlock *&DstBlock) {
     // If the list of unknown blocks is empty, we don't need rebalancing
-    if (UnknownSuccs.empty())
+    if (UnknownBlocks.empty())
       return false;
-    // If all reachable nodes from SrcBlock are unknown, skip rebalancing
-    if (DstBlock == nullptr)
+
+    // If there are multiple known sinks, we can't rebalance
+    if (KnownDstBlocks.size() > 1)
       return false;
-    // If any of the unknown blocks is an exit block, skip rebalancing
-    for (auto Block : UnknownSuccs) {
-      if (Block->isExit())
+    DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
+
+    // Verify sinks of the subgraph
+    for (auto Block : UnknownBlocks) {
+      if (Block->SuccJumps.empty()) {
+        // If there are multiple (known and unknown) sinks, we can't rebalance
+        if (DstBlock != nullptr)
+          return false;
+        continue;
+      }
+      size_t NumIgnoredJumps = 0;
+      for (auto Jump : Block->SuccJumps) {
+        if (ignoreJump(SrcBlock, DstBlock, Jump))
+          NumIgnoredJumps++;
+      }
+      // If there is a non-sink block in UnknownBlocks with all jumps ignored,
+      // then we can't rebalance
+      if (NumIgnoredJumps == Block->SuccJumps.size())
         return false;
     }
     return true;
   }
+  /// Decide whether the Jump is ignored while processing an unknown subgraph
+  /// rooted at basic block SrcBlock with the destination block, DstBlock.
+  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+                  const FlowJump *Jump) {
+    // Ignore unlikely jumps with zero flow
+    if (Jump->IsUnlikely && Jump->Flow == 0)
+      return true;
+
+    auto JumpSource = &Func.Blocks[Jump->Source];
+    auto JumpTarget = &Func.Blocks[Jump->Target];
+
+    // Do not ignore jumps coming into DstBlock
+    if (DstBlock != nullptr && JumpTarget == DstBlock)
+      return false;
+
+    // Ignore jumps out of SrcBlock to known blocks
+    if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock)
+      return true;
+
+    // Ignore jumps to known blocks with zero flow
+    if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0)
+      return true;
+
+    return false;
+  }
+
   /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
-  /// UnknownSuccs in the topological order (so that all jumps are "forward").
-  bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
-                         std::vector<FlowBlock *> &UnknownSuccs) {
+  /// UnknownBlocks in the topological order (so that all jumps are "forward").
+ bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownBlocks) { // Extract local in-degrees in the considered subgraph auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0); - for (auto Jump : SrcBlock->SuccJumps) { - LocalInDegree[Jump->Target]++; - } - for (uint64_t I = 0; I < UnknownSuccs.size(); I++) { - for (auto Jump : UnknownSuccs[I]->SuccJumps) { + auto fillInDegree = [&](const FlowBlock *Block) { + for (auto Jump : Block->SuccJumps) { + if (ignoreJump(SrcBlock, DstBlock, Jump)) + continue; LocalInDegree[Jump->Target]++; } + }; + fillInDegree(SrcBlock); + for (auto Block : UnknownBlocks) { + fillInDegree(Block); } // A loop containing SrcBlock if (LocalInDegree[SrcBlock->Index] > 0) @@ -553,15 +630,20 @@ private: std::queue<uint64_t> Queue; Queue.push(SrcBlock->Index); while (!Queue.empty()) { - auto &Block = Func.Blocks[Queue.front()]; + FlowBlock *Block = &Func.Blocks[Queue.front()]; Queue.pop(); - // Stop propagation once we reach DstBlock - if (Block.Index == DstBlock->Index) + // Stop propagation once we reach DstBlock, if any + if (DstBlock != nullptr && Block == DstBlock) break; - AcyclicOrder.push_back(&Block); + // Keep an acyclic order of unknown blocks + if (Block->UnknownWeight && Block != SrcBlock) + AcyclicOrder.push_back(Block); + // Add to the queue all successors with zero local in-degree - for (auto Jump : Block.SuccJumps) { + for (auto Jump : Block->SuccJumps) { + if (ignoreJump(SrcBlock, DstBlock, Jump)) + continue; uint64_t Dst = Jump->Target; LocalInDegree[Dst]--; if (LocalInDegree[Dst] == 0) { @@ -572,42 +654,69 @@ private: // If there is a cycle in the subgraph, AcyclicOrder contains only a subset // of all blocks - if (UnknownSuccs.size() + 1 != AcyclicOrder.size()) + if (UnknownBlocks.size() != AcyclicOrder.size()) return false; - UnknownSuccs = AcyclicOrder; + UnknownBlocks = AcyclicOrder; return true; } - /// Rebalance a given subgraph. - void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, - std::vector<FlowBlock *> &UnknownSuccs) { + /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and + /// having UnknownBlocks intermediate blocks. 
+  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
+                                const FlowBlock *DstBlock,
+                                const std::vector<FlowBlock *> &UnknownBlocks) {
     assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
-    assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
-    for (auto Block : UnknownSuccs) {
+    // Distribute flow from the source block
+    uint64_t BlockFlow = 0;
+    // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
+    for (auto Jump : SrcBlock->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      BlockFlow += Jump->Flow;
+    }
+    rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
+
+    // Distribute flow from the remaining blocks
+    for (auto Block : UnknownBlocks) {
+      assert(Block->UnknownWeight && "incorrect unknown subgraph");
+      uint64_t BlockFlow = 0;
       // Block's flow is the sum of incoming flows
-      uint64_t TotalFlow = 0;
-      if (Block == SrcBlock) {
-        TotalFlow = Block->Flow;
-      } else {
-        for (auto Jump : Block->PredJumps) {
-          TotalFlow += Jump->Flow;
-        }
-        Block->Flow = TotalFlow;
+      for (auto Jump : Block->PredJumps) {
+        BlockFlow += Jump->Flow;
       }
+      Block->Flow = BlockFlow;
+      rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
+    }
+  }
-      // Process all successor jumps and update corresponding flow values
-      for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
-        auto Jump = Block->SuccJumps[I];
-        if (I + 1 == Block->SuccJumps.size()) {
-          Jump->Flow = TotalFlow;
-          continue;
-        }
-        uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
-        Jump->Flow = Flow;
-        TotalFlow -= Flow;
-      }
+  /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
+  /// and ending at DstBlock.
+  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+                      const FlowBlock *Block, uint64_t BlockFlow) {
+    // Process all successor jumps and update corresponding flow values
+    size_t BlockDegree = 0;
+    for (auto Jump : Block->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      BlockDegree++;
+    }
+    // If all successor jumps of the block are ignored, skip it
+    if (DstBlock == nullptr && BlockDegree == 0)
+      return;
+    assert(BlockDegree > 0 && "all outgoing jumps are ignored");
+
+    // Each of the Block's successors gets the following amount of flow.
+    // Rounding the value up so that all flow is propagated
+    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
+    for (auto Jump : Block->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      uint64_t Flow = std::min(SuccFlow, BlockFlow);
+      Jump->Flow = Flow;
+      BlockFlow -= Flow;
     }
+    assert(BlockFlow == 0 && "not all flow is propagated");
   }
   /// A constant indicating an arbitrary exit block of a function.
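The rounded-up division in rebalanceBlock guarantees that all of a block's incoming flow is handed back out even when it does not divide evenly among the non-ignored successor jumps; the std::min clamp simply lets the last jump absorb the remainder. A minimal standalone sketch of that distribution, using hypothetical numbers rather than anything taken from the patch:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  void splitFlowExample() {
    uint64_t BlockFlow = 7;   // flow entering the block (hypothetical)
    uint64_t BlockDegree = 3; // non-ignored successor jumps (hypothetical)
    // Ceiling division, mirroring (BlockFlow + BlockDegree - 1) / BlockDegree.
    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree; // == 3
    for (uint64_t I = 0; I < BlockDegree; ++I) {
      uint64_t Flow = std::min(SuccFlow, BlockFlow); // assigns 3, 3, then 1
      BlockFlow -= Flow;
    }
    assert(BlockFlow == 0 && "all flow is propagated");
  }

With seven units of flow and three eligible jumps the jumps receive 3, 3 and 1 units, so the invariant checked by the final assert in rebalanceBlock holds.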
@@ -799,7 +908,7 @@ void verifyWeights(const FlowFunction &Func) { // Run BFS from the source along edges with positive flow std::queue<uint64_t> Queue; - auto Visited = std::vector<bool>(NumBlocks, false); + auto Visited = BitVector(NumBlocks, false); Queue.push(Func.Entry); Visited[Func.Entry] = true; while (!Queue.empty()) { diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index c840ee85795f..5363a851fc27 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -173,7 +173,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) { auto *PtrTy = cast<PointerType>(Ty); if (DL.isNonIntegralPointerType(PtrTy)) { auto *Int8PtrTy = Builder.getInt8PtrTy(PtrTy->getAddressSpace()); - assert(DL.getTypeAllocSize(Int8PtrTy->getElementType()) == 1 && + assert(DL.getTypeAllocSize(Builder.getInt8Ty()) == 1 && "alloc size of i8 must by 1 byte for the GEP to be correct"); auto *GEP = Builder.CreateGEP( Builder.getInt8Ty(), Constant::getNullValue(Int8PtrTy), V, "uglygep"); @@ -471,7 +471,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, // indexes into the array implied by the pointer operand; the rest of // the indices index into the element or field type selected by the // preceding index. - Type *ElTy = PTy->getElementType(); + Type *ElTy = PTy->getNonOpaquePointerElementType(); for (;;) { // If the scale size is not 0, attempt to factor out a scale for // array indexing. @@ -640,8 +640,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Value *Casted = V; if (V->getType() != PTy) Casted = InsertNoopCastOfTo(Casted, PTy); - Value *GEP = Builder.CreateGEP(PTy->getElementType(), Casted, GepIndices, - "scevgep"); + Value *GEP = Builder.CreateGEP(PTy->getNonOpaquePointerElementType(), + Casted, GepIndices, "scevgep"); Ops.push_back(SE.getUnknown(GEP)); } @@ -1671,7 +1671,7 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { return Builder.CreateSExt(V, Ty); } -Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { +Value *SCEVExpander::expandSMaxExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands()-2; i >= 0; --i) { @@ -1700,7 +1700,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { return LHS; } -Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { +Value *SCEVExpander::expandUMaxExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands()-2; i >= 0; --i) { @@ -1729,7 +1729,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { return LHS; } -Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { +Value *SCEVExpander::expandSMinExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands() - 2; i >= 0; --i) { @@ -1758,7 +1758,7 @@ Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { return LHS; } -Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { +Value *SCEVExpander::expandUMinExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands() - 2; i >= 0; --i) { @@ -1787,6 +1787,40 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { return LHS; } +Value 
*SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { + return expandSMaxExpr(S); +} + +Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { + return expandUMaxExpr(S); +} + +Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { + return expandSMinExpr(S); +} + +Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { + return expandUMinExpr(S); +} + +Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) { + SmallVector<Value *> Ops; + for (const SCEV *Op : S->operands()) + Ops.emplace_back(expand(Op)); + + Value *SaturationPoint = + MinMaxIntrinsic::getSaturationPoint(Intrinsic::umin, S->getType()); + + SmallVector<Value *> OpIsZero; + for (Value *Op : ArrayRef<Value *>(Ops).drop_back()) + OpIsZero.emplace_back(Builder.CreateICmpEQ(Op, SaturationPoint)); + + Value *AnyOpIsZero = Builder.CreateLogicalOr(OpIsZero); + + Value *NaiveUMin = expandUMinExpr(S); + return Builder.CreateSelect(AnyOpIsZero, SaturationPoint, NaiveUMin); +} + Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, Instruction *IP, bool Root) { setInsertPoint(IP); @@ -1809,8 +1843,8 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { // instruction. Instruction *Tmp; if (Inst->getType()->isIntegerTy()) - Tmp = - cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user")); + Tmp = cast<Instruction>(Builder.CreateIntToPtr( + Inst, Inst->getType()->getPointerTo(), "tmp.lcssa.user")); else { assert(Inst->getType()->isPointerTy()); Tmp = cast<Instruction>(Builder.CreatePtrToInt( @@ -1947,22 +1981,14 @@ Value *SCEVExpander::expand(const SCEV *S) { if (VO.second) { if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { - Type *Ety = Vty->getPointerElementType(); int64_t Offset = VO.second->getSExtValue(); - int64_t ESize = SE.getTypeSizeInBits(Ety); - if ((Offset * 8) % ESize == 0) { - ConstantInt *Idx = - ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); - V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); - } else { - ConstantInt *Idx = - ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); - } + ConstantInt *Idx = + ConstantInt::getSigned(VO.second->getType(), -Offset); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); } else { V = Builder.CreateSub(V, VO.second); } @@ -2271,10 +2297,27 @@ template<typename T> static InstructionCost costAndCollectOperands( case scSMaxExpr: case scUMaxExpr: case scSMinExpr: - case scUMinExpr: { + case scUMinExpr: + case scSequentialUMinExpr: { // FIXME: should this ask the cost for Intrinsic's? + // The reduction tree. Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 1); Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1, 0, 2); + switch (S->getSCEVType()) { + case scSequentialUMinExpr: { + // The safety net against poison. + // FIXME: this is broken. + Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 0); + Cost += ArithCost(Instruction::Or, + S->getNumOperands() > 2 ? 
S->getNumOperands() - 2 : 0); + Cost += CmpSelCost(Instruction::Select, 1, 0, 1); + break; + } + default: + assert(!isa<SCEVSequentialMinMaxExpr>(S) && + "Unhandled SCEV expression type?"); + break; + } break; } case scAddRecExpr: { @@ -2362,7 +2405,7 @@ bool SCEVExpander::isHighCostExpansionHelper( case scConstant: { // Only evalulate the costs of constants when optimizing for size. if (CostKind != TargetTransformInfo::TCK_CodeSize) - return 0; + return false; const APInt &Imm = cast<SCEVConstant>(S)->getAPInt(); Type *Ty = S->getType(); Cost += TTI.getIntImmCostInst( @@ -2399,7 +2442,8 @@ bool SCEVExpander::isHighCostExpansionHelper( case scUMaxExpr: case scSMaxExpr: case scUMinExpr: - case scSMinExpr: { + case scSMinExpr: + case scSequentialUMinExpr: { assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 && "Nary expr should have more than 1 operand."); // The simple nary expr will require one less op (or pair of ops) @@ -2490,49 +2534,73 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero); Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue); - // Get the backedge taken count and truncate or extended to the AR type. - Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty); - // Compute |Step| * Backedge - Value *MulV, *OfMul; - if (Step->isOne()) { - // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't - // needed, there is never an overflow, so to avoid artificially inflating - // the cost of the check, directly emit the optimized IR. - MulV = TruncTripCount; - OfMul = ConstantInt::getFalse(MulV->getContext()); - } else { - auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), - Intrinsic::umul_with_overflow, Ty); - CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); - MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); - OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); - } - // Compute: - // Start + |Step| * Backedge < Start - // Start - |Step| * Backedge > Start - Value *Add = nullptr, *Sub = nullptr; - if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) { - StartValue = InsertNoopCastOfTo( - StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace())); - Value *NegMulV = Builder.CreateNeg(MulV); - Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV); - Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV); - } else { - Add = Builder.CreateAdd(StartValue, MulV); - Sub = Builder.CreateSub(StartValue, MulV); - } - - Value *EndCompareGT = Builder.CreateICmp( - Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue); + // 1. Start + |Step| * Backedge < Start + // 2. Start - |Step| * Backedge > Start + // + // And select either 1. or 2. depending on whether step is positive or + // negative. If Step is known to be positive or negative, only create + // either 1. or 2. + auto ComputeEndCheck = [&]() -> Value * { + // Checking <u 0 is always false. + if (!Signed && Start->isZero() && SE.isKnownPositive(Step)) + return ConstantInt::getFalse(Loc->getContext()); + + // Get the backedge taken count and truncate or extended to the AR type. + Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty); + + Value *MulV, *OfMul; + if (Step->isOne()) { + // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't + // needed, there is never an overflow, so to avoid artificially inflating + // the cost of the check, directly emit the optimized IR. 
+ MulV = TruncTripCount; + OfMul = ConstantInt::getFalse(MulV->getContext()); + } else { + auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), + Intrinsic::umul_with_overflow, Ty); + CallInst *Mul = + Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); + OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); + } - Value *EndCompareLT = Builder.CreateICmp( - Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue); + Value *Add = nullptr, *Sub = nullptr; + bool NeedPosCheck = !SE.isKnownNegative(Step); + bool NeedNegCheck = !SE.isKnownPositive(Step); + + if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) { + StartValue = InsertNoopCastOfTo( + StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace())); + Value *NegMulV = Builder.CreateNeg(MulV); + if (NeedPosCheck) + Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV); + if (NeedNegCheck) + Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV); + } else { + if (NeedPosCheck) + Add = Builder.CreateAdd(StartValue, MulV); + if (NeedNegCheck) + Sub = Builder.CreateSub(StartValue, MulV); + } - // Select the answer based on the sign of Step. - Value *EndCheck = - Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT); + Value *EndCompareLT = nullptr; + Value *EndCompareGT = nullptr; + Value *EndCheck = nullptr; + if (NeedPosCheck) + EndCheck = EndCompareLT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue); + if (NeedNegCheck) + EndCheck = EndCompareGT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue); + if (NeedPosCheck && NeedNegCheck) { + // Select the answer based on the sign of Step. + EndCheck = Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT); + } + return Builder.CreateOr(EndCheck, OfMul); + }; + Value *EndCheck = ComputeEndCheck(); // If the backedge taken count type is larger than the AR type, // check that we don't drop any bits by truncating it. If we are @@ -2548,7 +2616,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck); } - return Builder.CreateOr(EndCheck, OfMul); + return EndCheck; } Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred, @@ -2578,17 +2646,16 @@ Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred, Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union, Instruction *IP) { - auto *BoolType = IntegerType::get(IP->getContext(), 1); - Value *Check = ConstantInt::getNullValue(BoolType); - // Loop over all checks in this set. + SmallVector<Value *> Checks; for (auto Pred : Union->getPredicates()) { - auto *NextCheck = expandCodeForPredicate(Pred, IP); + Checks.push_back(expandCodeForPredicate(Pred, IP)); Builder.SetInsertPoint(IP); - Check = Builder.CreateOr(Check, NextCheck); } - return Check; + if (Checks.empty()) + return ConstantInt::getFalse(IP->getContext()); + return Builder.CreateOr(Checks); } Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) { @@ -2720,13 +2787,8 @@ void SCEVExpanderCleaner::cleanup() { // Remove sets with value handles. Expander.clear(); - // Sort so that earlier instructions do not dominate later instructions. - stable_sort(InsertedInstructions, [this](Instruction *A, Instruction *B) { - return DT.dominates(B, A); - }); // Remove all inserted instructions. 
- for (Instruction *I : InsertedInstructions) { - + for (Instruction *I : reverse(InsertedInstructions)) { #ifndef NDEBUG assert(all_of(I->users(), [&InsertedSet](Value *U) { diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 1046998c26de..335ac03ccb52 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -2052,109 +2052,119 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB, if (ScanIdx == 0) return false; - // Okay, we *could* sink last ScanIdx instructions. But how many can we - // actually sink before encountering instruction that is unprofitable to sink? - auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) { - unsigned NumPHIdValues = 0; - for (auto *I : *LRI) - for (auto *V : PHIOperands[I]) { - if (!InstructionsToSink.contains(V)) - ++NumPHIdValues; - // FIXME: this check is overly optimistic. We may end up not sinking - // said instruction, due to the very same profitability check. - // See @creating_too_many_phis in sink-common-code.ll. - } - LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n"); - unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size(); - if ((NumPHIdValues % UnconditionalPreds.size()) != 0) + bool followedByDeoptOrUnreachable = IsBlockFollowedByDeoptOrUnreachable(BB); + + if (!followedByDeoptOrUnreachable) { + // Okay, we *could* sink last ScanIdx instructions. But how many can we + // actually sink before encountering instruction that is unprofitable to + // sink? + auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) { + unsigned NumPHIdValues = 0; + for (auto *I : *LRI) + for (auto *V : PHIOperands[I]) { + if (!InstructionsToSink.contains(V)) + ++NumPHIdValues; + // FIXME: this check is overly optimistic. We may end up not sinking + // said instruction, due to the very same profitability check. + // See @creating_too_many_phis in sink-common-code.ll. + } + LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n"); + unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size(); + if ((NumPHIdValues % UnconditionalPreds.size()) != 0) NumPHIInsts++; - return NumPHIInsts <= 1; - }; + return NumPHIInsts <= 1; + }; - // We've determined that we are going to sink last ScanIdx instructions, - // and recorded them in InstructionsToSink. Now, some instructions may be - // unprofitable to sink. But that determination depends on the instructions - // that we are going to sink. - - // First, forward scan: find the first instruction unprofitable to sink, - // recording all the ones that are profitable to sink. - // FIXME: would it be better, after we detect that not all are profitable. - // to either record the profitable ones, or erase the unprofitable ones? - // Maybe we need to choose (at runtime) the one that will touch least instrs? - LRI.reset(); - int Idx = 0; - SmallPtrSet<Value *, 4> InstructionsProfitableToSink; - while (Idx < ScanIdx) { - if (!ProfitableToSinkInstruction(LRI)) { - // Too many PHIs would be created. - LLVM_DEBUG( - dbgs() << "SINK: stopping here, too many PHIs would be created!\n"); - break; + // We've determined that we are going to sink last ScanIdx instructions, + // and recorded them in InstructionsToSink. Now, some instructions may be + // unprofitable to sink. But that determination depends on the instructions + // that we are going to sink. 
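    // For illustration only (hypothetical counts, not part of this patch): with
    // UnconditionalPreds.size() == 2 and three PHI'd operand values that are not
    // themselves being sunk, NumPHIInsts = ceil(3 / 2) = 2, which is above the
    // <= 1 threshold, so ProfitableToSinkInstruction reports the instruction as
    // unprofitable and the forward scan below stops at it.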
+ + // First, forward scan: find the first instruction unprofitable to sink, + // recording all the ones that are profitable to sink. + // FIXME: would it be better, after we detect that not all are profitable. + // to either record the profitable ones, or erase the unprofitable ones? + // Maybe we need to choose (at runtime) the one that will touch least + // instrs? + LRI.reset(); + int Idx = 0; + SmallPtrSet<Value *, 4> InstructionsProfitableToSink; + while (Idx < ScanIdx) { + if (!ProfitableToSinkInstruction(LRI)) { + // Too many PHIs would be created. + LLVM_DEBUG( + dbgs() << "SINK: stopping here, too many PHIs would be created!\n"); + break; + } + InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end()); + --LRI; + ++Idx; } - InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end()); - --LRI; - ++Idx; - } - // If no instructions can be sunk, early-return. - if (Idx == 0) - return false; + // If no instructions can be sunk, early-return. + if (Idx == 0) + return false; - // Did we determine that (only) some instructions are unprofitable to sink? - if (Idx < ScanIdx) { - // Okay, some instructions are unprofitable. - ScanIdx = Idx; - InstructionsToSink = InstructionsProfitableToSink; - - // But, that may make other instructions unprofitable, too. - // So, do a backward scan, do any earlier instructions become unprofitable? - assert(!ProfitableToSinkInstruction(LRI) && - "We already know that the last instruction is unprofitable to sink"); - ++LRI; - --Idx; - while (Idx >= 0) { - // If we detect that an instruction becomes unprofitable to sink, - // all earlier instructions won't be sunk either, - // so preemptively keep InstructionsProfitableToSink in sync. - // FIXME: is this the most performant approach? - for (auto *I : *LRI) - InstructionsProfitableToSink.erase(I); - if (!ProfitableToSinkInstruction(LRI)) { - // Everything starting with this instruction won't be sunk. - ScanIdx = Idx; - InstructionsToSink = InstructionsProfitableToSink; - } + // Did we determine that (only) some instructions are unprofitable to sink? + if (Idx < ScanIdx) { + // Okay, some instructions are unprofitable. + ScanIdx = Idx; + InstructionsToSink = InstructionsProfitableToSink; + + // But, that may make other instructions unprofitable, too. + // So, do a backward scan, do any earlier instructions become + // unprofitable? + assert( + !ProfitableToSinkInstruction(LRI) && + "We already know that the last instruction is unprofitable to sink"); ++LRI; --Idx; + while (Idx >= 0) { + // If we detect that an instruction becomes unprofitable to sink, + // all earlier instructions won't be sunk either, + // so preemptively keep InstructionsProfitableToSink in sync. + // FIXME: is this the most performant approach? + for (auto *I : *LRI) + InstructionsProfitableToSink.erase(I); + if (!ProfitableToSinkInstruction(LRI)) { + // Everything starting with this instruction won't be sunk. + ScanIdx = Idx; + InstructionsToSink = InstructionsProfitableToSink; + } + ++LRI; + --Idx; + } } - } - // If no instructions can be sunk, early-return. - if (ScanIdx == 0) - return false; + // If no instructions can be sunk, early-return. + if (ScanIdx == 0) + return false; + } bool Changed = false; if (HaveNonUnconditionalPredecessors) { - // It is always legal to sink common instructions from unconditional - // predecessors. However, if not all predecessors are unconditional, - // this transformation might be pessimizing. 
So as a rule of thumb, - // don't do it unless we'd sink at least one non-speculatable instruction. - // See https://bugs.llvm.org/show_bug.cgi?id=30244 - LRI.reset(); - int Idx = 0; - bool Profitable = false; - while (Idx < ScanIdx) { - if (!isSafeToSpeculativelyExecute((*LRI)[0])) { - Profitable = true; - break; + if (!followedByDeoptOrUnreachable) { + // It is always legal to sink common instructions from unconditional + // predecessors. However, if not all predecessors are unconditional, + // this transformation might be pessimizing. So as a rule of thumb, + // don't do it unless we'd sink at least one non-speculatable instruction. + // See https://bugs.llvm.org/show_bug.cgi?id=30244 + LRI.reset(); + int Idx = 0; + bool Profitable = false; + while (Idx < ScanIdx) { + if (!isSafeToSpeculativelyExecute((*LRI)[0])) { + Profitable = true; + break; + } + --LRI; + ++Idx; } - --LRI; - ++Idx; + if (!Profitable) + return false; } - if (!Profitable) - return false; LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n"); // We have a conditional edge and we're going to sink some instructions. @@ -4935,14 +4945,13 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); - unsigned Bits = Cond->getType()->getIntegerBitWidth(); KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) // bits are in the condition value. - unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1; - unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits; + unsigned MaxSignificantBitsInCond = + ComputeMaxSignificantBits(Cond, DL, 0, AC, SI); // Gather dead cases. SmallVector<ConstantInt *, 8> DeadCases; @@ -4973,8 +4982,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, bool HasDefault = !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); const unsigned NumUnknownBits = - Bits - (Known.Zero | Known.One).countPopulation(); - assert(NumUnknownBits <= Bits); + Known.getBitWidth() - (Known.Zero | Known.One).countPopulation(); + assert(NumUnknownBits <= Known.getBitWidth()); if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { @@ -5796,10 +5805,9 @@ static void reuseTableCompare( for (auto ValuePair : Values) { Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(), ValuePair.second, CmpOp1, true); - if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst)) + if (!CaseConst || CaseConst == DefaultConst || + (CaseConst != TrueConst && CaseConst != FalseConst)) return; - assert((CaseConst == TrueConst || CaseConst == FalseConst) && - "Expect true or false as compare result."); } // Check if the branch instruction dominates the phi node. 
It's a simple diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 02727a3dbf9c..e02d02a05752 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -602,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { Align MemSetAlign = CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne(); CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign); - AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); + AttrBuilder ArgAttrs(CI->getContext(), CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); copyFlags(*CI, NewCI); @@ -2515,8 +2515,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, } else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) { // sprintf(dest, "%s", str) -> stpcpy(dest, str) - dest // Handle mismatched pointer types (goes away with typeless pointers?). - V = B.CreatePointerCast(V, Dest->getType()); - Value *PtrDiff = B.CreatePtrDiff(V, Dest); + V = B.CreatePointerCast(V, B.getInt8PtrTy()); + Dest = B.CreatePointerCast(Dest, B.getInt8PtrTy()); + Value *PtrDiff = B.CreatePtrDiff(B.getInt8Ty(), V, Dest); return B.CreateIntCast(PtrDiff, CI->getType(), false); } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index b822db938af8..8947303674ee 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -398,13 +398,17 @@ Value *Mapper::mapValue(const Value *V) { SmallVector<ValueAsMetadata *, 4> MappedArgs; for (auto *VAM : AL->getArgs()) { // Map both Local and Constant VAMs here; they will both ultimately - // be mapped via mapValue (apart from constants when we have no - // module level changes, which have an identity mapping). + // be mapped via mapValue. The exceptions are constants when we have no + // module level changes and locals when they have no existing mapped + // value and RF_IgnoreMissingLocals is set; these have identity + // mappings. if ((Flags & RF_NoModuleLevelChanges) && isa<ConstantAsMetadata>(VAM)) { MappedArgs.push_back(VAM); } else if (Value *LV = mapValue(VAM->getValue())) { MappedArgs.push_back( LV == VAM->getValue() ? VAM : ValueAsMetadata::get(LV)); + } else if ((Flags & RF_IgnoreMissingLocals) && isa<LocalAsMetadata>(VAM)) { + MappedArgs.push_back(VAM); } else { // If we cannot map the value, set the argument as undef. MappedArgs.push_back(ValueAsMetadata::get( diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 5a4a2f0924f6..97c2acb7d4c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -698,8 +698,9 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { ChainInstrs.push_back(&I); continue; } - if (I.mayThrow()) { - LLVM_DEBUG(dbgs() << "LSV: Found may-throw operation: " << I << '\n'); + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) { + LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: " + << I << '\n'); break; } if (I.mayReadOrWriteMemory()) @@ -853,13 +854,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) { (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - // Make sure all the users of a vector are constant-index extracts. 
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { - const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); - return EEI && isa<ConstantInt>(EEI->getOperand(1)); - })) - continue; - // Save the load locations. const ChainID ID = getChainID(Ptr); LoadRefs[ID].push_back(LI); @@ -900,12 +894,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) { (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { - const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); - return EEI && isa<ConstantInt>(EEI->getOperand(1)); - })) - continue; - // Save store location. const ChainID ID = getChainID(Ptr); StoreRefs[ID].push_back(SI); @@ -1289,52 +1277,32 @@ bool Vectorizer::vectorizeLoadChain( Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); propagateMetadata(LI, Chain); - if (VecLoadTy) { - SmallVector<Instruction *, 16> InstrsToErase; - - unsigned VecWidth = VecLoadTy->getNumElements(); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - for (auto Use : Chain[I]->users()) { - // All users of vector loads are ExtractElement instructions with - // constant indices, otherwise we would have bailed before now. - Instruction *UI = cast<Instruction>(Use); - unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); - unsigned NewIdx = Idx + I * VecWidth; - Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), - UI->getName()); - if (V->getType() != UI->getType()) - V = Builder.CreateBitCast(V, UI->getType()); - - // Replace the old instruction. - UI->replaceAllUsesWith(V); - InstrsToErase.push_back(UI); - } + for (unsigned I = 0, E = Chain.size(); I != E; ++I) { + Value *CV = Chain[I]; + Value *V; + if (VecLoadTy) { + // Extract a subvector using shufflevector. + unsigned VecWidth = VecLoadTy->getNumElements(); + auto Mask = + llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth)); + V = Builder.CreateShuffleVector(LI, Mask, CV->getName()); + } else { + V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); } - // Bitcast might not be an Instruction, if the value being loaded is a - // constant. In that case, no need to reorder anything. - if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) - reorder(BitcastInst); - - for (auto I : InstrsToErase) - I->eraseFromParent(); - } else { - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *CV = Chain[I]; - Value *V = - Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); - if (V->getType() != CV->getType()) { - V = Builder.CreateBitOrPointerCast(V, CV->getType()); - } - - // Replace the old instruction. - CV->replaceAllUsesWith(V); + if (V->getType() != CV->getType()) { + V = Builder.CreateBitOrPointerCast(V, CV->getType()); } - if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) - reorder(BitcastInst); + // Replace the old instruction. + CV->replaceAllUsesWith(V); } + // Bitcast might not be an Instruction, if the value being loaded is a + // constant. In that case, no need to reorder anything. 
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) + reorder(BitcastInst); + eraseInstructions(Chain); ++NumVectorInstructions; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4747f34fcc62..d11f4146b590 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -470,10 +470,11 @@ public: /// on, while the old loop will be used as the scalar remainder. Control flow /// is generated around the vectorized (and scalar epilogue) loops consisting /// of various checks and bypasses. Return the pre-header block of the new - /// loop. - /// In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + /// loop and the start value for the canonical induction, if it is != 0. The + /// latter is the case when vectorizing the epilogue loop. In the case of + /// epilogue vectorization, this function is overriden to handle the more + /// complex control flow around the loops. + virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, @@ -507,10 +508,10 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, VPValue *Def, - VPTransformState &State); + /// the corresponding type. \p CanonicalIV is the scalar value generated for + /// the canonical induction variable. + void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, + VPTransformState &State, Value *CanonicalIV); /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, @@ -556,6 +557,10 @@ public: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + // Returns the resume value (bc.merge.rdx) for a reduction as + // generated by fixReduction. + PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); + protected: friend class LoopVectorizationPlanner; @@ -573,16 +578,18 @@ protected: Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock); - /// Create a new induction variable inside L. - PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, - Value *Step, Instruction *DL); + /// Introduce a conditional branch (on true, condition to be set later) at the + /// end of the header=latch connecting it to itself (across the backedge) and + /// to the exit block of \p L. + void createHeaderBranch(Loop *L); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); /// Create the exit value of first order recurrences in the middle block and /// update their users. - void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); + void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, + VPTransformState &State); /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); @@ -606,14 +613,6 @@ protected: /// represented as. 
void truncateToMinimalBitwidths(VPTransformState &State); - /// This function adds - /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) - /// to each vector element of Val. The sequence starts at StartIndex. - /// \p Opcode is relevant for FP induction variable. - virtual Value * - getStepVector(Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and /// \p EntryVal is the value from the original loop that maps to the steps. @@ -640,9 +639,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// Generate a shuffle sequence that will reverse the vector Vec. - virtual Value *reverseVector(Value *Vec); - /// Returns (and creates if needed) the original loop trip count. Value *getOrCreateTripCount(Loop *NewLoop); @@ -685,14 +681,13 @@ protected: Loop *createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). + /// in the scalar epilogue, from where the vectorized loop left off. /// In cases where the loop skeleton is more complicated (eg. epilogue /// vectorization) and the resume values can come from an additional bypass /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, Value *VectorTripCount, + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -795,12 +790,6 @@ protected: /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector<BasicBlock *, 4> LoopBypassBlocks; - /// The new Induction variable which was added to the new block. - PHINode *Induction = nullptr; - - /// The induction variable of the old basic block. - PHINode *OldInduction = nullptr; - /// Store instructions that were predicated. SmallVector<Instruction *, 4> PredicatedInstructions; @@ -838,6 +827,11 @@ protected: /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; + + // Holds the resume values for reductions in the loops, used to set the + // correct start value of reduction PHIs when vectorizing the epilogue. + SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> + ReductionResumeValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -856,10 +850,6 @@ public: private: Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector( - Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; - Value *reverseVector(Value *Vec) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -909,14 +899,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. 
- BasicBlock *createVectorizedLoopSkeleton() final override { + std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton() final override { return createEpilogueVectorizedLoopSkeleton(); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + virtual std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -944,7 +936,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -973,7 +966,8 @@ public: EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } +namespace llvm { + /// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, - int64_t Step) { +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, + int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } -namespace llvm { - /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); @@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // will lead to gather/scatter instructions, which don't need to be // handled. if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || - isa<VPInterleaveRecipe>(CurRec)) + isa<VPInterleaveRecipe>(CurRec) || + isa<VPCanonicalIVPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, } } +PHINode *InnerLoopVectorizer::getReductionResumeValue( + const RecurrenceDescriptor &RdxDesc) { + auto It = ReductionResumeValues.find(&RdxDesc); + assert(It != ReductionResumeValues.end() && + "Expected to find a resume value for the reduction."); + return It->second; +} + namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -1556,13 +1559,16 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. 
- bool isLegalGatherOrScatter(Value *V) { + bool isLegalGatherOrScatter(Value *V, + ElementCount VF = ElementCount::getFixed(1)) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); + if (VF.isVector()) + Ty = VectorType::get(Ty, VF); return (LI && TTI.isLegalMaskedGather(Ty, Align)) || (SI && TTI.isLegalMaskedScatter(Ty, Align)); } @@ -1577,16 +1583,17 @@ public: } /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. - bool isScalarWithPredication(Instruction *I) const; + /// predication when vectorizing \p I with vectorization factor \p VF. Such + /// instructions include conditional stores and instructions that may divide + /// by zero. + bool isScalarWithPredication(Instruction *I, ElementCount VF) const; // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. + // \p VF is the vectorization factor that will be used to vectorize \p I. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + bool isPredicatedInst(Instruction *I, ElementCount VF, + bool IsKnownUniform = false) { // When we know the load is uniform and the original scalar loop was not // predicated we don't need to mark it as a predicated instruction. Any // vectorised blocks created when tail-folding are something artificial we @@ -1602,7 +1609,7 @@ public: // instructions. if (isa<LoadInst>(I) || isa<StoreInst>(I)) return Legal->isMaskRequired(I); - return isScalarWithPredication(I); + return isScalarWithPredication(I, VF); } /// Returns true if \p I is a memory instruction with consecutive memory @@ -1794,7 +1801,7 @@ private: /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I); + bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -2078,8 +2085,8 @@ public: /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { - SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); - SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); + SCEVExpanderCleaner SCEVCleaner(SCEVExp); + SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); if (!SCEVCheckCond) SCEVCleaner.markResultUsed(); @@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilder<> &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. 
+ auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + Type *InitVecValSTy = STy; + if (STy->isFloatingPointTy()) { + InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, VPTransformState &State) { @@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = - getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // placement of all induction updates. 
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); - auto *ICmp = cast<Instruction>(Br->getCondition()); - LastInduction->moveBefore(ICmp); + LastInduction->moveBefore(Br); LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); @@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, - const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, - VPValue *Def, - VPTransformState &State) { +void InnerLoopVectorizer::widenIntOrFpInduction( + PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, + Value *CanonicalIV) { + Value *Start = Def->getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = Def->getInductionDescriptor(); + TruncInst *Trunc = Def->getTruncInst(); IRBuilder<> &Builder = State.Builder; - assert((IV->getType()->isIntegerTy() || IV != OldInduction) && - "Primary induction variable must have an integer type"); assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(!State.VF.isZero() && "VF must be non-zero"); // The value from the original loop to which we are mapping the new induction // variable. @@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // induction variable and step. Otherwise, derive these values from the // induction descriptor. auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = Induction; - if (IV != OldInduction) { - ScalarIV = IV->getType()->isIntegerTy() - ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) - : Builder.CreateCast(Instruction::SIToFP, Induction, - IV->getType()); + Value *ScalarIV = CanonicalIV; + Type *NeededType = IV->getType(); + if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { + ScalarIV = + NeededType->isIntegerTy() + ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) + : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, State.CFG.PrevBB); ScalarIV->setName("offset.idx"); @@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) StartIdx = @@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = - getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); + getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), + State.VF, State.Builder); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isZero() || State.VF.isScalar()) { + if (State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); - CreateSplatIV(ScalarIV, Step); + Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), + Step->getType()->getScalarSizeInBits()); + + Instruction::BinaryOps IncOp = ID.getInductionOpcode(); + if (IncOp == Instruction::BinaryOpsEnd) + IncOp = Instruction::Add; + for (unsigned Part = 0; Part < UF; ++Part) { + Value *StartIdx = ConstantInt::get(ScalarTy, Part); + Instruction::BinaryOps MulOp = Instruction::Mul; + if (Step->getType()->isFloatingPointTy()) { + StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); + MulOp = Instruction::FMul; + } + + Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); + State.set(Def, EntryPart, Part); + if (Trunc) { + assert(!Step->getType()->isFloatingPointTy() && + "fp inductions shouldn't be truncated"); + addMetadata(EntryPart, Trunc); + } + } return; } @@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } -Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - SmallVector<Constant *, 8> Indices; - - // Create a vector of consecutive numbers from zero to VF. - VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; - if (STy->isFloatingPointTy()) { - InitVecValSTy = - IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); - InitVecValVTy = VectorType::get(InitVecValSTy, VLen); - } - Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. 
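For VF == 1 the recipe no longer goes through a splat; each unroll part P directly gets ScalarIV combined with P * Step, using mul/add for integer inductions and fmul plus the FP induction opcode otherwise. A scalar sketch of the integer case, producing the UF per-part values:

#include <cstdint>
#include <vector>

// Part P of a scalar (VF == 1) widened induction: ScalarIV + P * Step.
std::vector<int64_t> scalarInductionParts(int64_t ScalarIV, int64_t Step,
                                          unsigned UF) {
  std::vector<int64_t> Parts(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Parts[Part] = ScalarIV + (int64_t)Part * Step;
  return Parts;
}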
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, @@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, State.set(Def, VectorValue, Instance.Part); } -Value *InnerLoopVectorizer::reverseVector(Value *Vec) { - assert(Vec->getType()->isVectorTy() && "Invalid type"); - return Builder.CreateVectorReverse(Vec, "reverse"); -} - // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). @@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } if (Group->isReverse()) - StridedVec = reverseVector(StridedVec); + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); State.set(VPDefs[J], StridedVec, Part); } @@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *StoredVec = State.get(StoredValues[i], Part); if (Group->isReverse()) - StoredVec = reverseVector(StoredVec); + StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); // If this member has different type, cast it to a unified type. @@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, - Value *End, Value *Step, - Instruction *DL) { +void InnerLoopVectorizer::createHeaderBranch(Loop *L) { BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - // As we're just creating this loop, it's possible no latch exists - // yet. If so, use the header as this will be a single block loop. - if (!Latch) - Latch = Header; - - IRBuilder<> B(&*Header->getFirstInsertionPt()); - Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); - setDebugLocFromInst(OldInst, &B); - auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); + assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - B.SetInsertPoint(Latch->getTerminator()); + IRBuilder<> B(Header->getTerminator()); + Instruction *OldInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); setDebugLocFromInst(OldInst, &B); - // Create i+1 and fill the PHINode. - // - // If the tail is not folded, we know that End - Start >= Step (either - // statically or through the minimum iteration checks). We also know that both - // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + - // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned - // overflows and we can mark the induction increment as NUW. - Value *Next = B.CreateAdd(Induction, Step, "index.next", - /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); - Induction->addIncoming(Start, L->getLoopPreheader()); - Induction->addIncoming(Next, Latch); - // Create the compare. - Value *ICmp = B.CreateICmpEQ(Next, End); - B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + // Connect the header to the exit and header blocks and replace the old + // terminator. 
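reverseVector has been folded away; the interleave-group code now calls Builder.CreateVectorReverse directly, which emits a lane-reversing shuffle (or the vector.reverse intrinsic for scalable types). The permutation it performs, as a trivial standalone sketch:

#include <algorithm>
#include <vector>

// Lane I of the result is lane (N - 1 - I) of the input, i.e. the shuffle
// mask <N-1, ..., 1, 0> that "reverse" expands to for fixed-width vectors.
template <typename T> std::vector<T> reverseLanes(std::vector<T> Vec) {
  std::reverse(Vec.begin(), Vec.end());
  return Vec;
}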
+ B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. - Latch->getTerminator()->eraseFromParent(); - - return Induction; + Header->getTerminator()->eraseFromParent(); } Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { @@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - assert(!VF.isScalable() && - "Tail folding not yet supported for scalable vectors"); + Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); TC = Builder.CreateAdd( - TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); + TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); + + Value *VectorTripCount = getOrCreateVectorTripCount(L); + assert(VectorTripCount && L && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues( // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. + Instruction *OldInduction = Legal->getPrimaryInduction(); for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - Optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - if (VectorizedLoopID.hasValue()) { - L->setLoopID(VectorizedLoopID.getValue()); - - // Do not setAlreadyVectorized if loop attributes have been defined - // explicitly. - return LoopVectorPreHeader; - } - - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE, TTI); - Hints.setAlreadyVectorized(); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); LI->verify(*DT); @@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. 
The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // faster. emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, IdxTy, VF, UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); + createInductionResumeValues(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, - VPTransformState &State) { +void InnerLoopVectorizer::fixFirstOrderRecurrence( + VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, : Builder.CreateZExt(ReducedPartRdx, PhiTy); } + PHINode *ResumePhi = + dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. 
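In the getOrCreateVectorTripCount hunk above, the tail-folded round-up now uses a runtime VF * UF (getRuntimeVF) instead of a fixed constant, so the same expression works for scalable vectors: n.rnd.up = TC + (NumLanes - 1). Assuming the rest of that function (outside the hunk) still subtracts n.rnd.up urem NumLanes, the resulting vector trip count is the smallest multiple of VF * UF covering TC. A scalar sketch under that assumption:

#include <cstdint>

// Vector trip count when folding the tail: round TC up to a multiple of
// NumLanes (the runtime value of VF * UF, assumed to be a power of two).
uint64_t tailFoldedVectorTripCount(uint64_t TC, uint64_t NumLanes) {
  uint64_t RndUp = TC + (NumLanes - 1); // "n.rnd.up"
  return RndUp - (RndUp % NumLanes);    // assumed "n.vec" computation
}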
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); + else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + // Set the resume value for this reduction + ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. switch (II.getKind()) { @@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, if (Cost->isScalarAfterVectorization(P, State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. @@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Type *PhiType = II.getStep()->getType(); // Build a pointer phi - Value *ScalarStartValue = II.getStartValue(); + Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); // A pointer induction, performed by using a gep @@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, ElementCount VF) const { if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; switch(I->getOpcode()) { @@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); + Type *VTy = Ty; + if (VF.isVector()) + VTy = VectorType::get(Ty, VF); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - TTI.isLegalMaskedGather(Ty, Alignment)) + TTI.isLegalMaskedGather(VTy, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - TTI.isLegalMaskedScatter(Ty, Alignment)); + TTI.isLegalMaskedScatter(VTy, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // If the instruction is a store located in a predicated block, it will be // scalarized. 
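The predecessor loop above decides, per incoming edge of bc.merge.rdx, whether to use the reduced partial result (edge from the middle block), the value carried over from the main vector loop's resume phi (epilogue vectorization), or the original reduction start value (any other bypass block). The same selection as a plain function, with a hypothetical enum standing in for the basic-block tests:

#include <cstdint>

enum class IncomingKind { MiddleBlock, ResumePhiBlock, OtherBypass };

// Incoming value of bc.merge.rdx for one predecessor of the scalar preheader.
int64_t mergeRdxIncoming(IncomingKind Kind, int64_t ReducedPartRdx,
                         int64_t ResumeValue, int64_t ReductionStartValue) {
  switch (Kind) {
  case IncomingKind::MiddleBlock:
    return ReducedPartRdx;      // result of this vector loop
  case IncomingKind::ResumePhiBlock:
    return ResumeValue;         // carried over from the main vector loop
  case IncomingKind::OtherBypass:
    return ReductionStartValue; // reduction not executed on this path
  }
  return ReductionStartValue;
}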
- if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors, don't use tail folding as this is currently not yet - // supported. The code is likely to have ended up here if the tripcount is - // low, in which case it makes sense not to use scalable vectors. - if (MaxFactors.ScalableVF.isVector()) + // For scalable vectors don't use tail folding for low trip counts or + // optimizing for code size. We only permit this if the user has explicitly + // requested it. + if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && + ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && + MaxFactors.ScalableVF.isVector()) MaxFactors.ScalableVF = ElementCount::getScalable(0); // If we don't know the precise trip count, or if the trip count that we @@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { - return Legal->isFirstOrderRecurrence(&Phi) || - Legal->isReductionVariable(&Phi); - })) + if (any_of(L.getHeader()->phis(), + [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are @@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - for (Type *T : ElementTypesInLoop) { - MinWidth = std::min<unsigned>( - MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); - MaxWidth = std::max<unsigned>( - MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + // For in-loop reductions, no element types are added to ElementTypesInLoop + // if there are no loads/stores in the loop. In this case, check through the + // reduction variables to determine the maximum width. + if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { + // Reset MaxWidth so that we can find the smallest type used by recurrences + // in the loop. + MaxWidth = -1U; + for (auto &PhiDescriptorPair : Legal->getReductionVars()) { + const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; + // When finding the min width used by the recurrence we need to account + // for casts on the input operands of the recurrence. 
+ MaxWidth = std::min<unsigned>( + MaxWidth, std::min<unsigned>( + RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), + RdxDesc.getRecurrenceType()->getScalarSizeInBits())); + } + } else { + for (Type *T : ElementTypesInLoop) { + MinWidth = std::min<unsigned>( + MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MaxWidth = std::max<unsigned>( + MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + } } return {MinWidth, MaxWidth}; } @@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *ST = dyn_cast<StoreInst>(&I)) T = ST->getValueOperand()->getType(); - // Ignore loaded pointer types and stored pointer types that are not - // vectorizable. - // - // FIXME: The check here attempts to predict whether a load or store will - // be vectorized. We only know this for certain after a VF has - // been selected. Here, we assume that if an access can be - // vectorized, it will be. We should also look at extending this - // optimization to non-pointer types. - // - if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) - continue; + assert(T->isSized() && + "Expected the load/store/recurrence type to be sized"); ElementTypesInLoop.insert(T); } @@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, + ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && - "Expecting a scalar emulated instruction"); + assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate); @@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { if (!blockNeedsPredicationForAnyReason(BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && + if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. @@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. 
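getSmallestAndWidestTypes gains a fallback: when the loop has no loads or stores but does contain reductions, the width bound comes from the recurrence descriptors (taking the narrower of the recurrence type width and the minimum cast width on its inputs) instead of from ElementTypesInLoop. A standalone sketch of both paths, operating directly on bit widths:

#include <algorithm>
#include <utility>
#include <vector>

// Returns {MinWidth, MaxWidth} in bits. ElementWidths come from the types
// collected in the loop; RecurrenceWidths are min(cast width, recurrence type
// width) per reduction and are only consulted when no element types exist.
std::pair<unsigned, unsigned>
smallestAndWidestTypes(const std::vector<unsigned> &ElementWidths,
                       const std::vector<unsigned> &RecurrenceWidths) {
  unsigned MinWidth = ~0u, MaxWidth = 8;
  if (ElementWidths.empty() && !RecurrenceWidths.empty()) {
    MaxWidth = ~0u; // find the narrowest type used by any recurrence
    for (unsigned W : RecurrenceWidths)
      MaxWidth = std::min(MaxWidth, W);
  } else {
    for (unsigned W : ElementWidths) {
      MinWidth = std::min(MinWidth, W);
      MaxWidth = std::max(MaxWidth, W);
    }
  }
  return {MinWidth, MaxWidth};
}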
- if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(I, VF)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); - if (useEmulatedMaskMemRefHack(I)) + if (useEmulatedMaskMemRefHack(I, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract InstructionCost Cost; if (isa<StoreInst>(&I) && VF.isScalable() && - isLegalGatherOrScatter(&I)) { + isLegalGatherOrScatter(&I, VF)) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { @@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I) + isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); @@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, VF)) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void AddRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. 
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = + S && S->getString().startswith("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(i)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, @@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - State.TripCount = ILV.getOrCreateTripCount(nullptr); - State.CanonicalIV = ILV.Induction; + Value *CanonicalIVStartValue; + std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = + ILV.createVectorizedLoopSkeleton(); ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. + BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + Optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + + Loop *L = LI->getLoopFor(State.CFG.PrevBB); + if (VectorizedLoopID.hasValue()) + L->setLoopID(VectorizedLoopID.getValue()); + else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + // Disable runtime unrolling when vectorizing the epilogue loop. + if (CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); + // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(State); @@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( } } -Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // When unrolling and the VF is 1, we only need to add a simple scalar. 
- Type *Ty = Val->getType(); - assert(!Ty->isVectorTy() && "Val must be a scalar"); - - if (Ty->isFloatingPointTy()) { - // Floating-point operations inherit FMF via the builder's flags. - Value *MulOp = Builder.CreateFMul(StartIdx, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp); - } - return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); -} - -static void AddRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().startswith("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(i)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton(""); @@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); - Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. 
- return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * +std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton("vec.epilog."); @@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopBypassBlocks.push_back(EPI.MemSafetyCheck); LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + // The vec.epilog.iter.check block may contain Phi nodes from reductions which + // merge control-flow from the latch block and the middle block. Update the + // incoming values here and move the Phi into the preheader. + SmallVector<PHINode *, 4> PhisInBlock; + for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) + PhisInBlock.push_back(&Phi); + + for (PHINode *Phi : PhisInBlock) { + Phi->replaceIncomingBlockWith( + VecEpilogueIterationCountCheck->getSinglePredecessor(), + VecEpilogueIterationCountCheck); + Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); + if (EPI.SCEVSafetyCheck) + Phi->removeIncomingValue(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + Phi->removeIncomingValue(EPI.MemSafetyCheck); + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); + } + // Generate a resume induction for the vector epilogue and put it in the // vector epilogue preheader Type *IdxTy = Legal->getWidestInductionType(); @@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail @@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; } BasicBlock * @@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV in the header block. 
- VPValue *IV = nullptr; - if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); - else { - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); - auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); - IV = IVRecipe; - } + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - bool TailFolded = !CM.isScalarEpilogueAllowed(); - - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { - // While ActiveLaneMask is a binary op that consumes the loop tripcount - // as a second argument, we only pass the IV here and extract the - // tripcount from the transform state where codegen of the VP instructions - // happen. - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + if (CM.TTI.emitGetActiveLaneMask()) { + VPValue *TC = Plan->getOrCreateTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); } else { + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { // scalarization is profitable or it is predicated. 
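createBlockInMask now always builds the header mask from the canonical IV: it widens it with VPWidenCanonicalIVRecipe and then emits either active.lane.mask(IV, TC) or the compare IV <= BTC. Per lane both forms test the same condition, since BTC is TC - 1. A scalar sketch of one lane of that predicate, assuming no overflow:

#include <cstdint>

// Header-block mask for one lane when folding the tail by masking.
// active.lane.mask(IV, TC) sets lane L iff IV + L < TC, which matches the
// ICmpULE form (IV + L) <= BTC with BTC = TC - 1.
bool headerMaskLane(uint64_t CanonicalIV, uint64_t Lane, uint64_t TripCount) {
  uint64_t LaneIV = CanonicalIV + Lane; // lane of the widened canonical IV
  return LaneIV < TripCount;
}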
auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, void VPRecipeBuilder::fixHeaderPhis() { BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); - for (VPWidenPHIRecipe *R : PhisToFix) { + for (VPHeaderPHIRecipe *R : PhisToFix) { auto *PN = cast<PHINode>(R->getUnderlyingValue()); VPRecipeBase *IncR = getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); @@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, Range); // Even if the instruction is not marked as uniform, there are certain @@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) return toVPRecipeResult(Recipe); - VPWidenPHIRecipe *PhiRecipe = nullptr; + VPHeaderPHIRecipe *PhiRecipe = nullptr; if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { @@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); PhisToFix.push_back(PhiRecipe); } else { - // TODO: record start and backedge value for remaining pointer induction - // phis. + // TODO: record backedge value for remaining pointer induction phis. assert(Phi->getType()->isPointerTy() && "only pointer phis should be handled here"); - PhiRecipe = new VPWidenPHIRecipe(Phi); + assert(Legal->getInductionVars().count(Phi) && + "Not an induction variable"); + InductionDescriptor II = Legal->getInductionVars().lookup(Phi); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + PhiRecipe = new VPWidenPHIRecipe(Phi, Start); } return toVPRecipeResult(PhiRecipe); @@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } +// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a +// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a +// BranchOnCount VPInstruction to the latch. +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, + bool HasNUW, bool IsVPlanNative) { + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); + + auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + if (IsVPlanNative) + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + Header->insert(CanonicalIVPHI, Header->begin()); + + auto *CanonicalIVIncrement = + new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementNUW + : VPInstruction::CanonicalIVIncrement, + {CanonicalIVPHI}, DL); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + + VPBasicBlock *EB = TopRegion->getExitBasicBlock(); + if (IsVPlanNative) { + EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); + EB->setCondBit(nullptr); + } + EB->appendRecipe(CanonicalIVIncrement); + + auto *BranchOnCount = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchOnCount); +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); auto Plan = std::make_unique<VPlan>(TopRegion); + Instruction *DLInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + DLInst ? DLInst->getDebugLoc() : DebugLoc(), + !CM.foldTailByMasking(), false); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); // Now that sink-after is done, move induction recipes for optimized truncates @@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), + true, true); return Plan; } @@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the end of the latch. + // and the live-out instruction of each reduction, at the beginning of the + // dedicated latch block. 
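addCanonicalIVRecipes plants a canonical IV phi that starts at zero, a CanonicalIVIncrement{NUW} of VF * UF in the latch, and a BranchOnCount that exits once the incremented value reaches the vector trip count. In scalar terms the skeleton is a bottom-tested loop (the minimum-iteration checks guarantee at least one vector iteration); a sketch assuming (VectorTripCount - StartIdx) is a non-zero multiple of VF * UF, with StartIdx being zero or, for an epilogue loop, the resume value:

#include <cstdint>
#include <functional>

// Scalar model of the vector loop driven by the canonical IV recipes.
void canonicalIVLoop(uint64_t StartIdx, uint64_t VFxUF,
                     uint64_t VectorTripCount,
                     const std::function<void(uint64_t)> &Body) {
  uint64_t Index = StartIdx;            // VPCanonicalIVPHIRecipe
  do {
    Body(Index);                        // one vector iteration: lanes [Index, Index + VFxUF)
    Index += VFxUF;                     // CanonicalIVIncrement{,NUW}
  } while (Index != VectorTripCount);   // BranchOnCount(Index, VectorTripCount)
}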
if (CM.foldTailByMasking()) { + Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || PhiR->isInLoop()) continue; - Builder.setInsertPoint(LatchVPBB); VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); VPValue *Red = PhiR->getBackedgeValue(); + assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && + "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } @@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), - getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), State); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(getVPSingleValue(), NewLI, Part); + State.set(this, NewLI, Part); } } @@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + + // Ensure that the start values for any VPReductionPHIRecipes are + // updated before vectorising the epilogue loop. + VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { + if (auto *Resume = MainILV.getReductionResumeValue( + ReductionPhi->getRecurrenceDescriptor())) { + VPValue *StartVal = new VPValue(Resume); + BestEpiPlan.addExternalDef(StartVal); + ReductionPhi->setOperand(0, StartVal); + } + } + } + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT); ++LoopsEpilogueVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37ae13666f7a..99c265fc5101 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -435,7 +435,7 @@ struct InstructionsState { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + bool isAltShuffle() const { return AltOp != MainOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -581,7 +581,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. 
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) { +static MemoryLocation getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -1417,7 +1417,11 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - ++HashMap[NumFreeOpsHash.Hash].first; + auto It = HashMap.find(NumFreeOpsHash.Hash); + if (It == HashMap.end()) + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + else + ++It->second.first; } } // Select the lane with the minimum counter. @@ -2019,9 +2023,7 @@ private: } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { - return getOpcode() != getAltOpcode(); - } + bool isAltShuffle() const { return MainOp != AltOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -2519,12 +2521,11 @@ private: SD->IsScheduled = true; LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ScheduleData *BundleMember = SD; - while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { + if (BundleMember->Inst != BundleMember->OpValue) continue; - } + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2589,7 +2590,6 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } - BundleMember = BundleMember->NextInBundle; } } @@ -2618,6 +2618,10 @@ private: } } + /// Build a bundle from the ScheduleData nodes corresponding to the + /// scalar instruction for each lane. + ScheduleData *buildBundle(ArrayRef<Value *> VL); + /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. @@ -3040,7 +3044,7 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. - DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; + DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; @@ -3051,6 +3055,29 @@ void BoUpSLP::reorderTopToBottom() { const std::unique_ptr<TreeEntry> &TE) { if (Optional<OrdersType> CurrentOrder = getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + // Do not include ordering for nodes used in the alt opcode vectorization, + // better to reorder them during bottom-to-top stage. If follow the order + // here, it causes reordering of the whole graph though actually it is + // profitable just to reorder the subgraph that starts from the alternate + // opcode vectorization node. Such nodes already end-up with the shuffle + // instruction and it is just enough to change this shuffle rather than + // rotate the scalars for the whole graph. 
+ unsigned Cnt = 0; + const TreeEntry *UserTE = TE.get(); + while (UserTE && Cnt < RecursionMaxDepth) { + if (UserTE->UserTreeIndices.size() != 1) + break; + if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { + return EI.UserTE->State == TreeEntry::Vectorize && + EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; + })) + return; + if (UserTE->UserTreeIndices.empty()) + UserTE = nullptr; + else + UserTE = UserTE->UserTreeIndices.back().UserTE; + ++Cnt; + } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -3066,7 +3093,7 @@ void BoUpSLP::reorderTopToBottom() { // Try to find the most profitable order. We just are looking for the most // used order and reorder scalar elements in the nodes according to this // mostly used order. - const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond(); + ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); // All operands are reordered and used only in this node - propagate the // most used order to the user node. MapVector<OrdersType, unsigned, @@ -4459,6 +4486,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, CurrentOrder.clear(); return false; } + if (ShouldKeepOrder) + CurrentOrder.clear(); return ShouldKeepOrder; } @@ -7202,6 +7231,33 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleSeq.clear(); } +BoUpSLP::ScheduleData * +BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { + ScheduleData *Bundle = nullptr; + ScheduleData *PrevInBundle = nullptr; + for (Value *V : VL) { + ScheduleData *BundleMember = getScheduleData(V); + assert(BundleMember && + "no ScheduleData for bundle member " + "(maybe not in same basic block)"); + assert(BundleMember->isSchedulingEntity() && + "bundle member already part of other bundle"); + if (PrevInBundle) { + PrevInBundle->NextInBundle = BundleMember; + } else { + Bundle = BundleMember; + } + BundleMember->UnscheduledDepsInBundle = 0; + Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; + + // Group the instructions to a bundle. + BundleMember->FirstInBundle = Bundle; + PrevInBundle = BundleMember; + } + assert(Bundle && "Failed to find schedule bundle"); + return Bundle; +} + // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. Optional<BoUpSLP::ScheduleData *> @@ -7214,12 +7270,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; - ScheduleData *PrevInBundle = nullptr; - ScheduleData *Bundle = nullptr; - bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); - auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, + auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, ScheduleData *Bundle) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to @@ -7263,39 +7316,28 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Otherwise the compiler may crash trying to incorrectly calculate // dependencies and emit instruction in the wrong order at the actual // scheduling. 
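The new buildBundle helper factors the bundle-forming loop out of tryScheduleBundle: members are chained through NextInBundle, every member points at the bundle head via FirstInBundle, and the members' unscheduled-dependency counts are accumulated on the head while their own counters are cleared. A simplified standalone model of that linking, with field names mirroring the ones in the patch (the real ScheduleData carries much more state):

#include <cassert>
#include <vector>

struct SchedNode {
  SchedNode *FirstInBundle = nullptr; // bundle head, the scheduling entity
  SchedNode *NextInBundle = nullptr;  // next lane of the same bundle
  int UnscheduledDeps = 0;            // outstanding deps of this node
  int UnscheduledDepsInBundle = 0;    // aggregate, tracked on the head only
};

SchedNode *buildBundle(const std::vector<SchedNode *> &Lanes) {
  SchedNode *Bundle = nullptr, *Prev = nullptr;
  for (SchedNode *Member : Lanes) {
    if (Prev)
      Prev->NextInBundle = Member;   // chain the lanes
    else
      Bundle = Member;               // first lane becomes the head
    Member->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += Member->UnscheduledDeps;
    Member->FirstInBundle = Bundle;  // group under one scheduling entity
    Prev = Member;
  }
  assert(Bundle && "empty bundle");
  return Bundle;
}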
- TryScheduleBundle(/*ReSchedule=*/false, nullptr); + TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr); return None; } } + bool ReSchedule = false; for (Value *V : VL) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); - if (BundleMember->IsScheduled) { - // A bundle member was scheduled as single instruction before and now - // needs to be scheduled as part of the bundle. We just get rid of the - // existing schedule. - LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember - << " was already scheduled\n"); - ReSchedule = true; - } - assert(BundleMember->isSchedulingEntity() && - "bundle member already part of other bundle"); - if (PrevInBundle) { - PrevInBundle->NextInBundle = BundleMember; - } else { - Bundle = BundleMember; - } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; - - // Group the instructions to a bundle. - BundleMember->FirstInBundle = Bundle; - PrevInBundle = BundleMember; + if (!BundleMember->IsScheduled) + continue; + // A bundle member was scheduled as single instruction before and now + // needs to be scheduled as part of the bundle. We just get rid of the + // existing schedule. + LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember + << " was already scheduled\n"); + ReSchedule = true; } - assert(Bundle && "Failed to find schedule bundle"); - TryScheduleBundle(ReSchedule, Bundle); + + auto *Bundle = buildBundle(VL); + TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return None; @@ -7464,20 +7506,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, while (!WorkList.empty()) { ScheduleData *SD = WorkList.pop_back_val(); - - ScheduleData *BundleMember = SD; - while (BundleMember) { + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { assert(isInSchedulingRegion(BundleMember)); - if (!BundleMember->hasValidDependencies()) { - - LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember - << "\n"); - BundleMember->Dependencies = 0; - BundleMember->resetUnscheduledDeps(); + if (BundleMember->hasValidDependencies()) + continue; - // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember + << "\n"); + BundleMember->Dependencies = 0; + BundleMember->resetUnscheduledDeps(); + + // Handle def-use chain dependencies. 
+ if (BundleMember->OpValue != BundleMember->Inst) { + ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + for (User *U : BundleMember->Inst->users()) { + assert(isa<Instruction>(U) && + "user of instruction must be instruction"); + ScheduleData *UseSD = getScheduleData(U); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; @@ -7486,89 +7541,69 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa<Instruction>(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. - BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } - } } + } - // Handle the memory dependencies. - ScheduleData *DepDest = BundleMember->NextLoadStore; - if (DepDest) { - Instruction *SrcInst = BundleMember->Inst; - MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; - unsigned DistToSrc = 1; - - while (DepDest) { - assert(isInSchedulingRegion(DepDest)); - - // We have two limits to reduce the complexity: - // 1) AliasedCheckLimit: It's a small limit to reduce calls to - // SLP->isAliased (which is the expensive part in this loop). - // 2) MaxMemDepDistance: It's for very large blocks and it aborts - // the whole loop (even if the loop is fast, it's quadratic). - // It's important for the loop break condition (see below) to - // check this limit even between two read-only instructions. - if (DistToSrc >= MaxMemDepDistance || - ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || - SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { - - // We increment the counter only if the locations are aliased - // (instead of counting all alias checks). This gives a better - // balance between reduced runtime and accurate dependencies. - numAliased++; - - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } - } - DepDest = DepDest->NextLoadStore; - - // Example, explaining the loop break condition: Let's assume our - // starting instruction is i0 and MaxMemDepDistance = 3. - // - // +--------v--v--v - // i0,i1,i2,i3,i4,i5,i6,i7,i8 - // +--------^--^--^ - // - // MaxMemDepDistance let us stop alias-checking at i3 and we add - // dependencies from i0 to i3,i4,.. (even if they are not aliased). 
- // Previously we already added dependencies from i3 to i6,i7,i8 - // (because of MaxMemDepDistance). As we added a dependency from - // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 - // and we can abort this loop at i6. - if (DistToSrc >= 2 * MaxMemDepDistance) - break; - DistToSrc++; + // Handle the memory dependencies (if any). + ScheduleData *DepDest = BundleMember->NextLoadStore; + if (!DepDest) + continue; + Instruction *SrcInst = BundleMember->Inst; + assert(SrcInst->mayReadOrWriteMemory() && + "NextLoadStore list for non memory effecting bundle?"); + MemoryLocation SrcLoc = getLocation(SrcInst); + bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; + + for ( ; DepDest; DepDest = DepDest->NextLoadStore) { + assert(isInSchedulingRegion(DepDest)); + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } - BundleMember = BundleMember->NextInBundle; } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); @@ -7638,8 +7673,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. 
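The dependency walk above keeps the two complexity limits explained in its comments: AliasedCheckLimit caps how many times the expensive alias query runs, and MaxMemDepDistance caps the distance over which precise checking is attempted, with the walk stopping entirely at twice that distance because the remaining dependencies are transitive. A self-contained toy of the same pattern (names and default limits are illustrative; the read/write distinction and bundle bookkeeping of the real code are omitted, and MayAlias is a placeholder for SLP->isAliased):

    #include <cstddef>
    #include <functional>
    #include <utility>
    #include <vector>

    // Record dependencies from node Src to the memory nodes that follow it,
    // limiting both the number of alias queries and the checking distance.
    std::vector<std::pair<std::size_t, std::size_t>>
    addMemoryDeps(std::size_t Src, std::size_t NumNodes,
                  const std::function<bool(std::size_t, std::size_t)> &MayAlias,
                  unsigned AliasedCheckLimit = 4,
                  unsigned MaxMemDepDistance = 160) {
      std::vector<std::pair<std::size_t, std::size_t>> Deps;
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;
      for (std::size_t Dst = Src + 1; Dst < NumNodes; ++Dst) {
        // Beyond MaxMemDepDistance, or once the alias budget is spent, add the
        // dependency conservatively instead of querying the oracle again.
        if (DistToSrc >= MaxMemDepDistance || NumAliased >= AliasedCheckLimit ||
            MayAlias(Src, Dst)) {
          ++NumAliased;
          Deps.emplace_back(Src, Dst);
        }
        // Dependencies added conservatively past MaxMemDepDistance are
        // transitive with those added earlier from that point, so stop at
        // twice the distance.
        if (DistToSrc >= 2 * MaxMemDepDistance)
          break;
        ++DistToSrc;
      }
      return Deps;
    }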
- ScheduleData *BundleMember = picked; - while (BundleMember) { + for (ScheduleData *BundleMember = picked; BundleMember; + BundleMember = BundleMember->NextInBundle) { Instruction *pickedInst = BundleMember->Inst; if (pickedInst->getNextNode() != LastScheduledInst) { BS->BB->getInstList().remove(pickedInst); @@ -7647,7 +7682,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { pickedInst); } LastScheduledInst = pickedInst; - BundleMember = BundleMember->NextInBundle; } BS->schedule(picked, ReadyInsts); @@ -8045,8 +8079,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { + LLVM_DEBUG( + dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); return false; + } // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) @@ -8693,7 +8730,6 @@ class HorizontalReduction { static RecurKind getRdxKind(Instruction *I) { assert(I && "Expected instruction for reduction matching"); - TargetTransformInfo::ReductionFlags RdxFlags; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -8767,7 +8803,6 @@ class HorizontalReduction { return RecurKind::None; } - TargetTransformInfo::ReductionFlags RdxFlags; switch (Pred) { default: return RecurKind::None; @@ -9206,7 +9241,7 @@ private: auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false, CostKind); + /*IsUnsigned=*/false, CostKind); CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9571,8 +9606,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); - // Aggregate value is unlikely to be processed in vector register, we need to - // extract scalars into scalar registers, so NeedExtraction is set true. + // Aggregate value is unlikely to be processed in vector register. return tryToVectorizeList(BuildVectorOpds, R); } @@ -9598,7 +9632,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, function_ref<unsigned(T *)> Limit, function_ref<bool(T *, T *)> Comparator, function_ref<bool(T *, T *)> AreCompatible, - function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool LimitForRegisterSize) { bool Changed = false; // Sort by type, parent, operands. @@ -9627,7 +9661,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // same/alternate ops only, this may result in some extra final // vectorization. if (NumElts > 1 && - TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { // Success start over because instructions might have been changed. Changed = true; } else if (NumElts < Limit(*IncIt) && @@ -9638,7 +9672,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // Final attempt to vectorize instructions with the same types. 
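tryToVectorizeSequence above (whose callback parameter is renamed to TryToVectorizeHelper) operates on a worklist that has already been sorted so that compatible entries are adjacent: it repeatedly takes a run of entries compatible with the run's first element and hands runs longer than one to the callback. A generic sketch of that run-splitting pattern (the signatures are illustrative, not the LLVM ones):

    #include <cstddef>
    #include <vector>

    // Split a sorted worklist into maximal runs whose members are compatible
    // with the run's first element, and hand each run with more than one
    // element to the vectorization callback. Returns true if anything changed.
    template <typename T, typename CompatFn, typename VectorizeFn>
    bool forEachCompatibleRun(const std::vector<T> &Sorted,
                              CompatFn AreCompatible, VectorizeFn TryVectorize) {
      bool Changed = false;
      std::size_t I = 0;
      while (I < Sorted.size()) {
        std::size_t J = I + 1;
        while (J < Sorted.size() && AreCompatible(Sorted[J], Sorted[I]))
          ++J;
        if (J - I > 1)
          Changed |= TryVectorize(&Sorted[I], J - I);
        I = J;
      }
      return Changed;
    }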
if (Candidates.size() > 1 && (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) { + if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) { // Success start over because instructions might have been changed. Changed = true; } else if (LimitForRegisterSize) { @@ -9649,7 +9683,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) ++SameTypeIt; unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts), + if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts), /*LimitForRegisterSize=*/false)) Changed = true; It = SameTypeIt; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 65857f034210..e5dded3c0f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -59,7 +59,7 @@ class VPRecipeBuilder { /// Cross-iteration reduction & first-order recurrence phis for which we need /// to add the incoming value from the backedge after all recipes have been /// created. - SmallVector<VPWidenPHIRecipe *, 4> PhisToFix; + SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix; /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d9e71663cd2..a96c122db2a9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State, // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. - Value *ScalarTC = State.TripCount; + Value *ScalarTC = State.get(getOperand(1), Part); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); @@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State, } break; } + + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + Value *Next = nullptr; + if (Part == 0) { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCount: { + if (Part != 0) + break; + // First create the compare. + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *Cond = Builder.CreateICmpEQ(IV, TC); + + // Now create the branch. 
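The ActiveLaneMask lowering above now takes the scalar trip count from an explicit second operand and builds the predicate type from the possibly scalable VF instead of a fixed vector type. As a reminder of the intrinsic's semantics, llvm.get.active.lane.mask(%base, %n) sets lane i of the result to (base + i <u n); for example, with VF = 4, base = 8 and a trip count of 11 the mask is <1, 1, 1, 0>, masking off the single out-of-bounds lane.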
+ auto *Plan = getParent()->getPlan(); + VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath && + "empty entry block only expected in VPlanNativePath"); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + // TODO: Once the exit block is modeled in VPlan, use it instead of going + // through State.CFG.LastBB. + BasicBlock *Exit = + cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0); + + Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; + case VPInstruction::BranchOnCount: + O << "branch-on-count "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { FMF = FMFNew; } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. -void VPlan::execute(VPTransformState *State) { - // -1. Check if the backedge taken count is needed, and if so build it. +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State) { + // Check if the trip count is needed, and if so build it. + if (TripCount && TripCount->getNumUsers()) { + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(TripCount, TripCountV, Part); + } + + // Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { - Value *TC = State->TripCount; - IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); - auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + auto *TCMO = Builder.CreateSub(TripCountV, + ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); - auto VF = State->VF; + auto VF = State.VF; Value *VTCMO = VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); - for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) - State->set(BackedgeTakenCount, VTCMO, Part); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(BackedgeTakenCount, VTCMO, Part); } + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(&VectorTripCount, VectorTripCountV, Part); + + // When vectorizing the epilogue loop, the canonical induction start value + // needs to be changed from zero to the value after the main vector loop. 
+ if (CanonicalIVStartValue) { + VPValue *VPV = new VPValue(CanonicalIVStartValue); + addExternalDef(VPV); + auto *IV = getCanonicalIV(); + assert(all_of(IV->users(), + [](const VPUser *U) { + auto *VPI = cast<VPInstruction>(U); + return VPI->getOpcode() == + VPInstruction::CanonicalIVIncrement || + VPI->getOpcode() == + VPInstruction::CanonicalIVIncrementNUW; + }) && + "the canonical IV should only be used by its increments when " + "resetting the start value"); + IV->setOperand(0, VPV); + } +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { // 0. Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; @@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Fix the latch value of reduction and first-order recurrences phis in the - // vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); - if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - isa<VPReductionPHIRecipe>(&R))) - continue; - // For first-order recurrences and in-order reduction phis, only a single - // part is generated, which provides the last part from the previous - // iteration. Otherwise all UF parts are generated. - bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - cast<VPReductionPHIRecipe>(&R)->isOrdered(); - unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *VecPhi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); - cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB); - } - } - // Setup branch terminator successors for VPBBs in VPBBsToFix based on // VPBB's successors. for (auto VPBB : State->CFG.VPBBsToFix) { @@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) { // 3. Merge the temporary latch created with the last basic-block filled. BasicBlock *LastBB = State->CFG.PrevBB; + assert(isa<BranchInst>(LastBB->getTerminator()) && + "Expected VPlan CFG to terminate with branch"); + + // Move both the branch and check from LastBB to VectorLatchBB. + auto *LastBranch = cast<BranchInst>(LastBB->getTerminator()); + LastBranch->moveBefore(VectorLatchBB->getTerminator()); + VectorLatchBB->getTerminator()->eraseFromParent(); + // Move condition so it is guaranteed to be next to branch. This is only done + // to avoid excessive test updates. + // TODO: Remove special handling once the increments for all inductions are + // modeled explicitly in VPlan. + cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch); // Connect LastBB to VectorLatchBB to facilitate their merge. - assert((EnableVPlanNativePath || - isa<UnreachableInst>(LastBB->getTerminator())) && - "Expected InnerLoop VPlan CFG to terminate with unreachable"); - assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && - "Expected VPlan CFG to terminate with branch in NativePath"); - LastBB->getTerminator()->eraseFromParent(); BranchInst::Create(VectorLatchBB, LastBB); // Merge LastBB with Latch. 
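The CanonicalIVIncrement cases above advance the scalar index by VF * UF per vector iteration, and BranchOnCount then compares the incremented index with the vector trip count to choose between the exit and the loop header. The step comes from createStepForVF, which this part of the diff only declares (see the VPlan.h hunk further down); a minimal sketch of such a helper, under the assumption that it scales the step by the known minimum VF and by vscale for scalable vectors (the name stepForVF and the body are illustrative, not the patch's implementation):

    #include "llvm/IR/IRBuilder.h"

    // Return Step * VF as an IR value. For scalable vector factors the known
    // minimum element count is additionally scaled by vscale at run time.
    static llvm::Value *stepForVF(llvm::IRBuilder<> &B, llvm::Type *Ty,
                                  llvm::ElementCount VF, int64_t Step) {
      llvm::Constant *MinStep =
          llvm::ConstantInt::get(Ty, Step * VF.getKnownMinValue());
      return VF.isScalable() ? B.CreateVScale(MinStep) : MinStep;
    }

With VF = 4 and UF = 2 this yields index.next = index + 8 on fixed-width targets and index + 8 * vscale on scalable ones.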
@@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) { assert(Merged && "Could not merge last basic block with latch."); VectorLatchBB = LastBB; + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = Entry->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + // TODO: Model their backedge values explicitly. + if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R)) + continue; + + auto *PhiR = cast<VPHeaderPHIRecipe>(&R); + // For canonical IV, first-order recurrences and in-order reduction phis, + // only a single part is generated, which provides the last part from the + // previous iteration. For non-ordered reductions all UF parts are + // generated. + bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || + isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || + cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *Phi = State->get(PhiR, Part); + Value *Val = State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part); + cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB); + } + } + // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, @@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const { O << "VPlan '" << Name << "' {"; + if (VectorTripCount.getNumUsers() > 0) { + O << "\nLive-in "; + VectorTripCount.printAsOperand(O, SlotTracker); + O << " = vector-trip-count\n"; + } + if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { O << "\nLive-in "; BackedgeTakenCount->printAsOperand(O, SlotTracker); @@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, } else O << " " << VPlanIngredient(IV); } +#endif +bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); + auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); + return StartC && StartC->isZero() && StepC && StepC->isOne(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN "; if (!isStore()) { - getVPSingleValue()->printAsOperand(O, SlotTracker); + printAsOperand(O, SlotTracker); O << " = "; } O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; @@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void 
VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.CanonicalIV; + Value *CanonicalIV = State.get(getOperand(0), 0); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - assert(!VF.isScalable() && "the code following assumes non scalables ECs"); Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.getKnownMinValue(), - CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - Indices.push_back( - ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); - // If VF == 1, there is only one iteration in the loop above, thus the - // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = - VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); - // Add the consecutive indices to the vector value. + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); + } Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(this, CanonicalVectorIV, Part); } @@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION"; + O << " = WIDEN-CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) { for (VPRecipeBase &VPI : *VPBB) { - if (isa<VPWidenPHIRecipe>(&VPI)) + if (isa<VPHeaderPHIRecipe>(&VPI)) continue; assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast<VPInstruction>(&VPI); @@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { for (const VPValue *V : Plan.VPExternalDefs) assignSlot(V); + assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f4a1883e35d5..824440f98a8b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -69,6 +69,9 @@ class VPlanSlp; /// vectors it is an expression determined at runtime. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); +/// Return a value for Step multiplied by VF. +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step); + /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. 
The range includes start and excludes end, e.g.,: /// [1, 9) = {1, 2, 4, 8} @@ -198,8 +201,8 @@ struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilder<> &Builder, InnerLoopVectorizer *ILV, VPlan *Plan) - : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV), - Plan(Plan) {} + : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) { + } /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -341,9 +344,6 @@ struct VPTransformState { /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). Value *CanonicalIV = nullptr; - /// Hold the trip count of the scalar loop. - Value *TripCount = nullptr; - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. InnerLoopVectorizer *ILV; @@ -793,6 +793,9 @@ public: SLPLoad, SLPStore, ActiveLaneMask, + CanonicalIVIncrement, + CanonicalIVIncrementNUW, + BranchOnCount, }; private: @@ -833,6 +836,16 @@ public: return R->getVPDefID() == VPRecipeBase::VPInstructionSC; } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPInstruction. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && R->getVPDefID() == VPRecipeBase::VPInstructionSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPInstructionSC; + } + unsigned getOpcode() const { return Opcode; } /// Generate the instruction. @@ -871,6 +884,7 @@ public: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: + case VPInstruction::BranchOnCount: return false; default: return true; @@ -1045,6 +1059,7 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } + const VPValue *getStartValue() const { return getOperand(0); } /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. @@ -1057,66 +1072,65 @@ public: /// Returns the induction descriptor for the recipe. const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } -}; -/// A recipe for handling first order recurrences and pointer inductions. For -/// first-order recurrences, the start value is the first operand of the recipe -/// and the incoming value from the backedge is the second operand. It also -/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all -/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly. -class VPWidenPHIRecipe : public VPRecipeBase, public VPValue { - /// List of incoming blocks. Only used in the VPlan native path. - SmallVector<VPBasicBlock *, 2> IncomingBlocks; + /// Returns true if the induction is canonical, i.e. starting at 0 and + /// incremented by UF * VF (= the original IV is incremented by 1). + bool isCanonical() const; + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + const TruncInst *TruncI = getTruncInst(); + return TruncI ? TruncI->getType() : IV->getType(); + } +}; +/// A pure virtual base class for all recipes modeling header phis, including +/// phis for first order recurrences, pointer inductions and reductions. The +/// start value is the first operand of the recipe and the incoming value from +/// the backedge is the second operand. 
+class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { protected: - VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi, - VPValue *Start = nullptr) + VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi, + VPValue *Start = nullptr) : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) { if (Start) addOperand(Start); } public: - /// Create a VPWidenPHIRecipe for \p Phi - VPWidenPHIRecipe(PHINode *Phi) - : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {} - - /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. - VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) { - addOperand(&Start); - } - - ~VPWidenPHIRecipe() override = default; + ~VPHeaderPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { - return B->getVPDefID() == VPRecipeBase::VPWidenPHISC || + return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || - B->getVPDefID() == VPRecipeBase::VPReductionPHISC; + B->getVPDefID() == VPRecipeBase::VPReductionPHISC || + B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || + B->getVPDefID() == VPRecipeBase::VPWidenPHISC; } static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVWidenPHISC || + return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || - V->getVPValueID() == VPValue::VPVReductionPHISC; + V->getVPValueID() == VPValue::VPVReductionPHISC || + V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || + V->getVPValueID() == VPValue::VPVWidenPHISC; } - /// Generate the phi/select nodes. - void execute(VPTransformState &State) override; + /// Generate the phi nodes. + void execute(VPTransformState &State) override = 0; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + VPSlotTracker &SlotTracker) const override = 0; #endif - /// Returns the start value of the phi, if it is a reduction or first-order - /// recurrence. + /// Returns the start value of the phi, if one is set. VPValue *getStartValue() { return getNumOperands() == 0 ? nullptr : getOperand(0); } - /// Returns the incoming value from the loop backedge, if it is a reduction or - /// first-order recurrence. + /// Returns the incoming value from the loop backedge. VPValue *getBackedgeValue() { return getOperand(1); } @@ -1126,6 +1140,44 @@ public: VPRecipeBase *getBackedgeRecipe() { return cast<VPRecipeBase>(getBackedgeValue()->getDef()); } +}; + +/// A recipe for handling header phis that are widened in the vector loop. +/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are +/// managed in the recipe directly. +class VPWidenPHIRecipe : public VPHeaderPHIRecipe { + /// List of incoming blocks. Only used in the VPlan native path. + SmallVector<VPBasicBlock *, 2> IncomingBlocks; + +public: + /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr) + : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) { + if (Start) + addOperand(Start); + } + + ~VPWidenPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPRecipeBase *B) { + return B->getVPDefID() == VPRecipeBase::VPWidenPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { @@ -1133,27 +1185,27 @@ public: IncomingBlocks.push_back(IncomingBlock); } - /// Returns the \p I th incoming VPValue. - VPValue *getIncomingValue(unsigned I) { return getOperand(I); } - /// Returns the \p I th incoming VPBasicBlock. VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } + + /// Returns the \p I th incoming VPValue. + VPValue *getIncomingValue(unsigned I) { return getOperand(I); } }; /// A recipe for handling first-order recurrence phis. The start value is the /// first operand of the recipe and the incoming value from the backedge is the /// second operand. -struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { +struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start) - : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC, - VPFirstOrderRecurrencePHISC, Phi, &Start) {} + : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC, + VPFirstOrderRecurrencePHISC, Phi, &Start) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } - static inline bool classof(const VPWidenPHIRecipe *D) { - return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC; @@ -1171,7 +1223,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. -class VPReductionPHIRecipe : public VPWidenPHIRecipe { +class VPReductionPHIRecipe : public VPHeaderPHIRecipe { /// Descriptor for the reduction. 
const RecurrenceDescriptor &RdxDesc; @@ -1187,7 +1239,7 @@ public: VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) - : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), + : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } @@ -1198,12 +1250,12 @@ public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVReductionPHISC; } - static inline bool classof(const VPWidenPHIRecipe *R) { - return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; - } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -1601,11 +1653,46 @@ public: #endif }; +/// Canonical scalar induction phi of the vector loop. Starting at the specified +/// start value (either 0 or the resume value when vectorizing the epilogue +/// loop). VPWidenCanonicalIVRecipe represents the vector version of the +/// canonical induction variable. +class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC, + nullptr, StartV), + DL(DL) {} + + ~VPCanonicalIVPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPCanonicalIVPHISC; + } + + /// Generate the canonical scalar induction phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + return getOperand(0)->getLiveInIRValue()->getType(); + } +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: - VPWidenCanonicalIVRecipe() - : VPRecipeBase(VPWidenCanonicalIVSC, {}), + VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV) + : VPRecipeBase(VPWidenCanonicalIVSC, {CanonicalIV}), VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {} ~VPWidenCanonicalIVRecipe() override = default; @@ -1615,6 +1702,16 @@ public: return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPWidenCanonicalIVRecipe. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + } + /// Generate a canonical vector induction variable of the vector loop, with /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and /// step = <VF*UF, VF*UF, ..., VF*UF>. 
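To make the comment above concrete: with a fixed VF = 4, UF = 2 and the scalar canonical IV currently at 0, the widened canonical IV materializes as

    part 0: vec.iv = <0, 1, 2, 3>    (splat of the scalar IV plus offsets 0..3)
    part 1: vec.iv = <4, 5, 6, 7>    (additionally offset by Part * VF = 4)

For scalable VFs the rewritten execute() earlier in this diff builds the per-part offset from a vscale-scaled splat of Part * VF plus a step vector, rather than from a constant vector of lane indices.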
@@ -1625,6 +1722,12 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDef()) + ->getScalarType(); + } }; /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It @@ -2112,10 +2215,17 @@ class VPlan { // (operators '==' and '<'). SetVector<VPValue *> VPExternalDefs; - /// Represents the backedge taken count of the original loop, for folding + /// Represents the trip count of the original loop, for folding /// the tail. + VPValue *TripCount = nullptr; + + /// Represents the backedge taken count of the original loop, for folding + /// the tail. It equals TripCount - 1. VPValue *BackedgeTakenCount = nullptr; + /// Represents the vector trip count. + VPValue VectorTripCount; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2147,12 +2257,18 @@ public: } for (VPValue *VPV : VPValuesToFree) delete VPV; + if (TripCount) + delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) delete Def; } + /// Prepare the plan for execution, setting up the required live-in values. + void prepareToExecute(Value *TripCount, Value *VectorTripCount, + Value *CanonicalIVStartValue, VPTransformState &State); + /// Generate the IR code for this VPlan. void execute(struct VPTransformState *State); @@ -2165,6 +2281,13 @@ public: return Entry; } + /// The trip count of the original loop. + VPValue *getOrCreateTripCount() { + if (!TripCount) + TripCount = new VPValue(); + return TripCount; + } + /// The backedge taken count of the original loop. VPValue *getOrCreateBackedgeTakenCount() { if (!BackedgeTakenCount) @@ -2172,6 +2295,9 @@ public: return BackedgeTakenCount; } + /// The vector trip count. + VPValue &getVectorTripCount() { return VectorTripCount; } + /// Mark the plan to indicate that using Value2VPValue is not safe any /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } @@ -2264,6 +2390,21 @@ public: return !VPV->getDef() || (RepR && RepR->isUniform()); } + /// Returns the VPRegionBlock of the vector loop. + VPRegionBlock *getVectorLoopRegion() { + return cast<VPRegionBlock>(getEntry()); + } + + /// Returns the canonical induction recipe of the vector loop. + VPCanonicalIVPHIRecipe *getCanonicalIV() { + VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); + if (EntryVPBB->empty()) { + // VPlan native path. + EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor()); + } + return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 86ecd6817873..e879a33db6ee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -231,7 +231,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { } // Entry point. The driver function for the predicator. -void VPlanPredicator::predicate(void) { +void VPlanPredicator::predicate() { // Predicate the blocks within Region. 
predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry())); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h index 692afd2978d5..a5db9a54da3c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h @@ -68,7 +68,7 @@ public: VPlanPredicator(VPlan &Plan); /// Predicate Plan's HCFG. - void predicate(void); + void predicate(); }; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d2daf558c2c5..fb5f3d428189 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -324,3 +324,30 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { E.first->eraseFromParent(); } } + +void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) { + VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; + for (VPUser *U : CanonicalIV->users()) { + WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U); + if (WidenNewIV) + break; + } + + if (!WidenNewIV) + return; + + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + + // If the induction recipe is canonical and the types match, use it + // directly. + if (WidenOriginalIV && WidenOriginalIV->isCanonical() && + WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) { + WidenNewIV->replaceAllUsesWith(WidenOriginalIV); + WidenNewIV->eraseFromParent(); + return; + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a82a562d5e35..e74409a86466 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -45,6 +45,10 @@ struct VPlanTransforms { /// in the vectorized loop. There is no need to vectorize the cast - the same /// value can be used for both the phi and casts in the vector loop. static void removeRedundantInductionCasts(VPlan &Plan); + + /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV + /// recipe, if it exists. + static void removeRedundantCanonicalIVs(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index fd92201614df..5296d2b9485c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -96,14 +96,15 @@ public: VPVReplicateSC, VPVWidenSC, VPVWidenCallSC, + VPVWidenCanonicalIVSC, VPVWidenGEPSC, VPVWidenSelectSC, // Phi-like VPValues. Need to be kept together. VPVBlendSC, + VPVCanonicalIVPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, - VPVWidenCanonicalIVSC, VPVWidenIntOrFpInductionSC, VPVPredInstPHI, VPVReductionPHISC, @@ -177,6 +178,7 @@ public: void replaceAllUsesWith(VPValue *New); VPDef *getDef() { return Def; } + const VPDef *getDef() const { return Def; } /// Returns the underlying IR value, if this VPValue is defined outside the /// scope of VPlan. 
Returns nullptr if the VPValue is defined by a VPDef @@ -186,6 +188,11 @@ public: "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); return getUnderlyingValue(); } + const Value *getLiveInIRValue() const { + assert(!getDef() && + "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); + return getUnderlyingValue(); + } }; typedef DenseMap<Value *, VPValue *> Value2VPValueTy; @@ -325,6 +332,7 @@ public: VPReductionSC, VPReplicateSC, VPWidenCallSC, + VPWidenCanonicalIVSC, VPWidenGEPSC, VPWidenMemoryInstructionSC, VPWidenSC, @@ -332,9 +340,9 @@ public: // Phi-like recipes. Need to be kept together. VPBlendSC, + VPCanonicalIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, - VPWidenCanonicalIVSC, VPWidenIntOrFpInductionSC, VPPredInstPHISC, VPReductionPHISC, @@ -403,7 +411,6 @@ public: class VPlan; class VPBasicBlock; -class VPRegionBlock; /// This class can be used to assign consecutive numbers to all VPValues in a /// VPlan and allows querying the numbering for printing, similar to the diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7732d9367985..d36f250995e1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -163,12 +163,32 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { errs() << "VPlan entry block is not a VPBasicBlock\n"; return false; } + + if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) { + errs() << "VPlan vector loop header does not start with a " + "VPCanonicalIVPHIRecipe\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); if (!Exit) { errs() << "VPlan exit block is not a VPBasicBlock\n"; return false; } + if (Exit->empty()) { + errs() << "VPlan vector loop exit must end with BranchOnCount " + "VPInstruction but is empty\n"; + return false; + } + + auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end())); + if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { + errs() << "VPlan vector loop exit must end with BranchOnCount " + "VPInstruction\n"; + return false; + } + for (const VPRegionBlock *Region : VPBlockUtils::blocksOnly<const VPRegionBlock>( depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index c0aedab2fed0..620d388199e0 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -881,7 +881,8 @@ static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy, ConstantRange IdxRange(IntWidth, true); if (isGuaranteedNotToBePoison(Idx, &AC)) { - if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT))) + if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, + true, &AC, CtxI, &DT))) return ScalarizationResult::safe(); return ScalarizationResult::unsafe(); } |
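The removeRedundantCanonicalIVs transform added above only does something when a widened canonical IV is actually requested (the canonical IV has a VPWidenCanonicalIVRecipe user, for instance when the tail is folded by masking) and the loop's own primary induction is already canonical, i.e. starts at 0 and steps by 1 with a matching scalar type. In that case the widened original IV stands in for the WIDEN-CANONICAL-INDUCTION recipe, which is erased. An illustrative source loop of that shape (not taken from the patch or its tests):

    // The primary induction variable i starts at 0 and is incremented by 1, so
    // its widened form can double as the widened canonical induction variable.
    void saxpy(float *x, const float *y, float a, int n) {
      for (int i = 0; i < n; ++i)
        x[i] = a * x[i] + y[i];
    }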
