author     Dimitry Andric <dim@FreeBSD.org>  2020-07-26 19:36:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2020-07-26 19:36:28 +0000
commit     cfca06d7963fa0909f90483b42a6d7d194d01e08
tree       209fb2a2d68f8f277793fc8df46c753d31bc853b  /llvm/lib/Transforms/Utils
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b
Diffstat (limited to 'llvm/lib/Transforms/Utils')
54 files changed, 8811 insertions, 2193 deletions
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp new file mode 100644 index 0000000000000..84a66e1e96d2c --- /dev/null +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -0,0 +1,246 @@ +//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utility function to lower a printf call into a series of device +// library calls on the AMDGPU target. +// +// WARNING: This file knows about certain library functions. It recognizes them +// by name, and hardwires knowledge of their semantics. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" + +#include <iostream> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-emit-printf" + +static bool isCString(const Value *Arg) { + auto Ty = Arg->getType(); + auto PtrTy = dyn_cast<PointerType>(Ty); + if (!PtrTy) + return false; + + auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType()); + if (!IntTy) + return false; + + return IntTy->getBitWidth() == 8; +} + +static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) { + auto Int64Ty = Builder.getInt64Ty(); + auto Ty = Arg->getType(); + + if (auto IntTy = dyn_cast<IntegerType>(Ty)) { + switch (IntTy->getBitWidth()) { + case 32: + return Builder.CreateZExt(Arg, Int64Ty); + case 64: + return Arg; + } + } + + if (Ty->getTypeID() == Type::DoubleTyID) { + return Builder.CreateBitCast(Arg, Int64Ty); + } + + if (isa<PointerType>(Ty)) { + return Builder.CreatePtrToInt(Arg, Int64Ty); + } + + llvm_unreachable("unexpected type"); +} + +static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) { + auto Int64Ty = Builder.getInt64Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty); + return Builder.CreateCall(Fn, Version); +} + +static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs, + Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3, + Value *Arg4, Value *Arg5, Value *Arg6, + bool IsLast) { + auto Int64Ty = Builder.getInt64Ty(); + auto Int32Ty = Builder.getInt32Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty, + Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty, + Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty); + auto IsLastValue = Builder.getInt32(IsLast); + auto NumArgsValue = Builder.getInt32(NumArgs); + return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3, + Arg4, Arg5, Arg6, IsLastValue}); +} + +static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool IsLast) { + auto Arg0 = fitArgInto64Bits(Builder, Arg); + auto Zero = Builder.getInt64(0); + return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero, + Zero, IsLast); +} + +// The device library does not provide strlen, so we build our own loop +// here. While we are at it, we also include the terminating null in the length. 
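(Illustrative note, not part of the patch: the IR loop built below computes the following scalar function. A minimal standalone C++ sketch with hypothetical names, assuming the same conventions, a null pointer yields length zero and the terminating null is counted.)

#include <cstdint>

// Sketch of the value getStrlenWithNull() materializes in IR: zero for a
// null pointer, otherwise the string length plus one for the terminator.
static uint64_t strlenWithNull(const char *Str) {
  if (!Str)                      // early-return path, feeds the zero PHI
    return 0;
  const char *Ptr = Str;
  while (*Ptr != '\0')           // the strlen.while loop
    ++Ptr;
  return static_cast<uint64_t>(Ptr - Str) + 1; // end minus begin, plus one
}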
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) { + auto *Prev = Builder.GetInsertBlock(); + Module *M = Prev->getModule(); + + auto CharZero = Builder.getInt8(0); + auto One = Builder.getInt64(1); + auto Zero = Builder.getInt64(0); + auto Int64Ty = Builder.getInt64Ty(); + + // The length is either zero for a null pointer, or the computed value for an + // actual string. We need a join block for a phi that represents the final + // value. + // + // Strictly speaking, the zero does not matter since + // __ockl_printf_append_string_n ignores the length if the pointer is null. + BasicBlock *Join = nullptr; + if (Prev->getTerminator()) { + Join = Prev->splitBasicBlock(Builder.GetInsertPoint(), + "strlen.join"); + Prev->getTerminator()->eraseFromParent(); + } else { + Join = BasicBlock::Create(M->getContext(), "strlen.join", + Prev->getParent()); + } + BasicBlock *While = + BasicBlock::Create(M->getContext(), "strlen.while", + Prev->getParent(), Join); + BasicBlock *WhileDone = BasicBlock::Create( + M->getContext(), "strlen.while.done", + Prev->getParent(), Join); + + // Emit an early return for when the pointer is null. + Builder.SetInsertPoint(Prev); + auto CmpNull = + Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType())); + BranchInst::Create(Join, While, CmpNull, Prev); + + // Entry to the while loop. + Builder.SetInsertPoint(While); + + auto PtrPhi = Builder.CreatePHI(Str->getType(), 2); + PtrPhi->addIncoming(Str, Prev); + auto PtrNext = Builder.CreateGEP(PtrPhi, One); + PtrPhi->addIncoming(PtrNext, While); + + // Condition for the while loop. + auto Data = Builder.CreateLoad(PtrPhi); + auto Cmp = Builder.CreateICmpEQ(Data, CharZero); + Builder.CreateCondBr(Cmp, WhileDone, While); + + // Add one to the computed length. + Builder.SetInsertPoint(WhileDone, WhileDone->begin()); + auto Begin = Builder.CreatePtrToInt(Str, Int64Ty); + auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty); + auto Len = Builder.CreateSub(End, Begin); + Len = Builder.CreateAdd(Len, One); + + // Final join. + BranchInst::Create(Join, WhileDone); + Builder.SetInsertPoint(Join, Join->begin()); + auto LenPhi = Builder.CreatePHI(Len->getType(), 2); + LenPhi->addIncoming(Len, WhileDone); + LenPhi->addIncoming(Zero, Prev); + + return LenPhi; +} + +static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str, + Value *Length, bool isLast) { + auto Int64Ty = Builder.getInt64Ty(); + auto CharPtrTy = Builder.getInt8PtrTy(); + auto Int32Ty = Builder.getInt32Ty(); + auto M = Builder.GetInsertBlock()->getModule(); + auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty, + Int64Ty, CharPtrTy, Int64Ty, Int32Ty); + auto IsLastInt32 = Builder.getInt32(isLast); + return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32}); +} + +static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool IsLast) { + auto Length = getStrlenWithNull(Builder, Arg); + return callAppendStringN(Builder, Desc, Arg, Length, IsLast); +} + +static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool SpecIsCString, bool IsLast) { + if (SpecIsCString && isCString(Arg)) { + return appendString(Builder, Desc, Arg, IsLast); + } + // If the format specifies a string but the argument is not, the frontend will + // have printed a warning. We just rely on undefined behaviour and send the + // argument anyway. 
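(Illustrative note, not part of the patch: a host-side sketch of the call sequence this lowering emits for a call such as printf("%d: %s\n", 42, name). The stub functions below only mirror the shapes of the device-library entry points named above; they are assumptions for illustration, not their real implementations.)

#include <cstdint>
#include <cstring>

// Stubs shaped like the device-library entry points used by this lowering.
static uint64_t printf_begin_stub(uint64_t Version) { return Version; }
static uint64_t append_args_stub(uint64_t Desc, uint32_t NumArgs, uint64_t A0,
                                 uint64_t A1, uint64_t A2, uint64_t A3,
                                 uint64_t A4, uint64_t A5, uint64_t A6,
                                 uint32_t IsLast) {
  return Desc; // the real function packs the arguments into a hostcall buffer
}
static uint64_t append_string_stub(uint64_t Desc, const char *Str,
                                   uint64_t Length, uint32_t IsLast) {
  return Desc;
}

int main() {
  const char *Fmt = "%d: %s\n";
  const char *Name = "world";
  uint64_t Desc = printf_begin_stub(0);
  // The format string goes first; its length includes the terminating null.
  Desc = append_string_stub(Desc, Fmt, std::strlen(Fmt) + 1, 0);
  // A scalar argument is widened to 64 bits and appended on its own.
  Desc = append_args_stub(Desc, 1, 42, 0, 0, 0, 0, 0, 0, 0);
  // A "%s" argument is appended as a string; it is also the last piece.
  Desc = append_string_stub(Desc, Name, std::strlen(Name) + 1, 1);
  return 0;
}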
+ return appendArg(Builder, Desc, Arg, IsLast); +} + +// Scan the format string to locate all specifiers, and mark the ones that +// specify a string, i.e, the "%s" specifier with optional '*' characters. +static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) { + StringRef Str; + if (!getConstantStringInfo(Fmt, Str) || Str.empty()) + return; + + static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn"; + size_t SpecPos = 0; + // Skip the first argument, the format string. + unsigned ArgIdx = 1; + + while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) { + if (Str[SpecPos + 1] == '%') { + SpecPos += 2; + continue; + } + auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos); + if (SpecEnd == StringRef::npos) + return; + auto Spec = Str.slice(SpecPos, SpecEnd + 1); + ArgIdx += Spec.count('*'); + if (Str[SpecEnd] == 's') { + BV.set(ArgIdx); + } + SpecPos = SpecEnd + 1; + ++ArgIdx; + } +} + +Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, + ArrayRef<Value *> Args) { + auto NumOps = Args.size(); + assert(NumOps >= 1); + + auto Fmt = Args[0]; + SparseBitVector<8> SpecIsCString; + locateCStrings(SpecIsCString, Fmt); + + auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0)); + Desc = appendString(Builder, Desc, Fmt, NumOps == 1); + + // FIXME: This invokes hostcall once for each argument. We can pack up to + // seven scalar printf arguments in a single hostcall. See the signature of + // callAppendArgs(). + for (unsigned int i = 1; i != NumOps; ++i) { + bool IsLast = i == NumOps - 1; + bool IsCString = SpecIsCString.test(i); + Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast); + } + + return Builder.CreateTrunc(Desc, Builder.getInt32Ty()); +} diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp new file mode 100644 index 0000000000000..7ff73fcdada79 --- /dev/null +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -0,0 +1,618 @@ +//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "assume-builder" + +#include "llvm/Transforms/Utils/AssumeBundleBuilder.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumeBundleQueries.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DebugCounter.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +cl::opt<bool> ShouldPreserveAllAttributes( + "assume-preserve-all", cl::init(false), cl::Hidden, + cl::desc("enable preservation of all attrbitues. 
even those that are " + "unlikely to be usefull")); + +cl::opt<bool> EnableKnowledgeRetention( + "enable-knowledge-retention", cl::init(false), cl::Hidden, + cl::desc( + "enable preservation of attributes throughout code transformation")); + +STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder"); +STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built"); +STATISTIC(NumAssumesMerged, + "Number of assume merged by the assume simplify pass"); +STATISTIC(NumAssumesRemoved, + "Number of assume removed by the assume simplify pass"); + +DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter", + "Controls which assumes gets created"); + +namespace { + +bool isUsefullToPreserve(Attribute::AttrKind Kind) { + switch (Kind) { + case Attribute::NonNull: + case Attribute::Alignment: + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: + case Attribute::Cold: + return true; + default: + return false; + } +} + +/// This function will try to transform the given knowledge into a more +/// canonical one. the canonical knowledge maybe the given one. +RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) { + switch (RK.AttrKind) { + default: + return RK; + case Attribute::NonNull: + RK.WasOn = GetUnderlyingObject(RK.WasOn, M->getDataLayout()); + return RK; + case Attribute::Alignment: { + Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) { + if (auto *GEP = dyn_cast<GEPOperator>(Strip)) + RK.ArgValue = + MinAlign(RK.ArgValue, + GEP->getMaxPreservedAlignment(M->getDataLayout()).value()); + }); + RK.WasOn = V; + return RK; + } + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: { + int64_t Offset = 0; + Value *V = GetPointerBaseWithConstantOffset( + RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false); + if (Offset < 0) + return RK; + RK.ArgValue = RK.ArgValue + Offset; + RK.WasOn = V; + } + } + return RK; +} + +/// This class contain all knowledge that have been gather while building an +/// llvm.assume and the function to manipulate it. 
+struct AssumeBuilderState { + Module *M; + + using MapKey = std::pair<Value *, Attribute::AttrKind>; + SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap; + Instruction *InstBeingRemoved = nullptr; + AssumptionCache* AC = nullptr; + DominatorTree* DT = nullptr; + + AssumeBuilderState(Module *M, Instruction *I = nullptr, + AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr) + : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {} + + bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) { + if (!InstBeingRemoved || !RK.WasOn) + return false; + bool HasBeenPreserved = false; + Use* ToUpdate = nullptr; + getKnowledgeForValue( + RK.WasOn, {RK.AttrKind}, AC, + [&](RetainedKnowledge RKOther, Instruction *Assume, + const CallInst::BundleOpInfo *Bundle) { + if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT)) + return false; + if (RKOther.ArgValue >= RK.ArgValue) { + HasBeenPreserved = true; + return true; + } else if (isValidAssumeForContext(InstBeingRemoved, Assume, + DT)) { + HasBeenPreserved = true; + IntrinsicInst *Intr = cast<IntrinsicInst>(Assume); + ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument]; + return true; + } + return false; + }); + if (ToUpdate) + ToUpdate->set( + ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue)); + return HasBeenPreserved; + } + + bool isKnowledgeWorthPreserving(RetainedKnowledge RK) { + if (!RK) + return false; + if (!RK.WasOn) + return true; + if (RK.WasOn->getType()->isPointerTy()) { + Value *UnderlyingPtr = GetUnderlyingObject(RK.WasOn, M->getDataLayout()); + if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr)) + return false; + } + if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) { + if (Arg->hasAttribute(RK.AttrKind) && + (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) || + Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue)) + return false; + return true; + } + if (auto *Inst = dyn_cast<Instruction>(RK.WasOn)) + if (wouldInstructionBeTriviallyDead(Inst)) { + if (RK.WasOn->use_empty()) + return false; + Use *SingleUse = RK.WasOn->getSingleUndroppableUse(); + if (SingleUse && SingleUse->getUser() == InstBeingRemoved) + return false; + } + return true; + } + + void addKnowledge(RetainedKnowledge RK) { + RK = canonicalizedKnowledge(RK, M); + + if (!isKnowledgeWorthPreserving(RK)) + return; + + if (tryToPreserveWithoutAddingAssume(RK)) + return; + MapKey Key{RK.WasOn, RK.AttrKind}; + auto Lookup = AssumedKnowledgeMap.find(Key); + if (Lookup == AssumedKnowledgeMap.end()) { + AssumedKnowledgeMap[Key] = RK.ArgValue; + return; + } + assert(((Lookup->second == 0 && RK.ArgValue == 0) || + (Lookup->second != 0 && RK.ArgValue != 0)) && + "inconsistent argument value"); + + /// This is only desirable because for all attributes taking an argument + /// higher is better. 
+ Lookup->second = std::max(Lookup->second, RK.ArgValue); + } + + void addAttribute(Attribute Attr, Value *WasOn) { + if (Attr.isTypeAttribute() || Attr.isStringAttribute() || + (!ShouldPreserveAllAttributes && + !isUsefullToPreserve(Attr.getKindAsEnum()))) + return; + unsigned AttrArg = 0; + if (Attr.isIntAttribute()) + AttrArg = Attr.getValueAsInt(); + addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn}); + } + + void addCall(const CallBase *Call) { + auto addAttrList = [&](AttributeList AttrList) { + for (unsigned Idx = AttributeList::FirstArgIndex; + Idx < AttrList.getNumAttrSets(); Idx++) + for (Attribute Attr : AttrList.getAttributes(Idx)) + addAttribute(Attr, Call->getArgOperand(Idx - 1)); + for (Attribute Attr : AttrList.getFnAttributes()) + addAttribute(Attr, nullptr); + }; + addAttrList(Call->getAttributes()); + if (Function *Fn = Call->getCalledFunction()) + addAttrList(Fn->getAttributes()); + } + + IntrinsicInst *build() { + if (AssumedKnowledgeMap.empty()) + return nullptr; + if (!DebugCounter::shouldExecute(BuildAssumeCounter)) + return nullptr; + Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + LLVMContext &C = M->getContext(); + SmallVector<OperandBundleDef, 8> OpBundle; + for (auto &MapElem : AssumedKnowledgeMap) { + SmallVector<Value *, 2> Args; + if (MapElem.first.first) + Args.push_back(MapElem.first.first); + + /// This is only valid because for all attribute that currently exist a + /// value of 0 is useless. and should not be preserved. + if (MapElem.second) + Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()), + MapElem.second)); + OpBundle.push_back(OperandBundleDefT<Value *>( + std::string(Attribute::getNameFromAttrKind(MapElem.first.second)), + Args)); + NumBundlesInAssumes++; + } + NumAssumeBuilt++; + return cast<IntrinsicInst>(CallInst::Create( + FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle)); + } + + void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType, + MaybeAlign MA) { + unsigned DerefSize = MemInst->getModule() + ->getDataLayout() + .getTypeStoreSize(AccType) + .getKnownMinSize(); + if (DerefSize != 0) { + addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer}); + if (!NullPointerIsDefined(MemInst->getFunction(), + Pointer->getType()->getPointerAddressSpace())) + addKnowledge({Attribute::NonNull, 0u, Pointer}); + } + if (MA.valueOrOne() > 1) + addKnowledge( + {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer}); + } + + void addInstruction(Instruction *I) { + if (auto *Call = dyn_cast<CallBase>(I)) + return addCall(Call); + if (auto *Load = dyn_cast<LoadInst>(I)) + return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(), + Load->getAlign()); + if (auto *Store = dyn_cast<StoreInst>(I)) + return addAccessedPtr(I, Store->getPointerOperand(), + Store->getValueOperand()->getType(), + Store->getAlign()); + // TODO: Add support for the other Instructions. + // TODO: Maybe we should look around and merge with other llvm.assume. 
+ } +}; + +} // namespace + +IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) { + if (!EnableKnowledgeRetention) + return nullptr; + AssumeBuilderState Builder(I->getModule()); + Builder.addInstruction(I); + return Builder.build(); +} + +void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC, + DominatorTree *DT) { + if (!EnableKnowledgeRetention || I->isTerminator()) + return; + AssumeBuilderState Builder(I->getModule(), I, AC, DT); + Builder.addInstruction(I); + if (IntrinsicInst *Intr = Builder.build()) { + Intr->insertBefore(I); + if (AC) + AC->registerAssumption(Intr); + } +} + +namespace { + +struct AssumeSimplify { + Function &F; + AssumptionCache &AC; + DominatorTree *DT; + LLVMContext &C; + SmallDenseSet<IntrinsicInst *> CleanupToDo; + StringMapEntry<uint32_t> *IgnoreTag; + SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume; + bool MadeChange = false; + + AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT, + LLVMContext &C) + : F(F), AC(AC), DT(DT), C(C), + IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {} + + void buildMapping(bool FilterBooleanArgument) { + BBToAssume.clear(); + for (Value *V : AC.assumptions()) { + if (!V) + continue; + IntrinsicInst *Assume = cast<IntrinsicInst>(V); + if (FilterBooleanArgument) { + auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0)); + if (!Arg || Arg->isZero()) + continue; + } + BBToAssume[Assume->getParent()].push_back(Assume); + } + + for (auto &Elem : BBToAssume) { + llvm::sort(Elem.second, + [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) { + return LHS->comesBefore(RHS); + }); + } + } + + /// Remove all asumes in CleanupToDo if there boolean argument is true and + /// ForceCleanup is set or the assume doesn't hold valuable knowledge. + void RunCleanup(bool ForceCleanup) { + for (IntrinsicInst *Assume : CleanupToDo) { + auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0)); + if (!Arg || Arg->isZero() || + (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume))) + continue; + MadeChange = true; + if (ForceCleanup) + NumAssumesMerged++; + else + NumAssumesRemoved++; + Assume->eraseFromParent(); + } + CleanupToDo.clear(); + } + + /// Remove knowledge stored in assume when it is already know by an attribute + /// or an other assume. This can when valid update an existing knowledge in an + /// attribute or an other assume. 
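(Illustrative note, not part of the patch: the redundancy rule applied below, reduced to plain C++ with made-up names. Knowledge for a given (value, attribute kind) pair is dropped when an already known entry carries an argument at least as large, for example dereferenceable(8) adds nothing once dereferenceable(16) is known.)

#include <cstdio>
#include <map>
#include <string>
#include <utility>

using Key = std::pair<std::string, std::string>; // (value name, attribute kind)

// Returns true if the new fact adds information, false if it is redundant.
static bool addKnowledge(std::map<Key, unsigned> &Known, const Key &K,
                         unsigned Arg) {
  auto It = Known.find(K);
  if (It != Known.end() && It->second >= Arg)
    return false;               // an equal or stronger fact already exists
  Known[K] = Arg;               // record the new or strictly stronger fact
  return true;
}

int main() {
  std::map<Key, unsigned> Known;
  std::printf("%d\n", addKnowledge(Known, {"%ptr", "dereferenceable"}, 16)); // 1
  std::printf("%d\n", addKnowledge(Known, {"%ptr", "dereferenceable"}, 8));  // 0
  return 0;
}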
+ void dropRedundantKnowledge() { + struct MapValue { + IntrinsicInst *Assume; + unsigned ArgValue; + CallInst::BundleOpInfo *BOI; + }; + buildMapping(false); + SmallDenseMap<std::pair<Value *, Attribute::AttrKind>, + SmallVector<MapValue, 2>, 16> + Knowledge; + for (BasicBlock *BB : depth_first(&F)) + for (Value *V : BBToAssume[BB]) { + if (!V) + continue; + IntrinsicInst *Assume = cast<IntrinsicInst>(V); + for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) { + auto RemoveFromAssume = [&]() { + CleanupToDo.insert(Assume); + if (BOI.Begin != BOI.End) { + Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn]; + U->set(UndefValue::get(U->get()->getType())); + } + BOI.Tag = IgnoreTag; + }; + if (BOI.Tag == IgnoreTag) { + CleanupToDo.insert(Assume); + continue; + } + RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI); + if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) { + bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind); + if (HasSameKindAttr) + if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) || + Arg->getAttribute(RK.AttrKind).getValueAsInt() >= + RK.ArgValue) { + RemoveFromAssume(); + continue; + } + if (isValidAssumeForContext( + Assume, &*F.getEntryBlock().getFirstInsertionPt()) || + Assume == &*F.getEntryBlock().getFirstInsertionPt()) { + if (HasSameKindAttr) + Arg->removeAttr(RK.AttrKind); + Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue)); + MadeChange = true; + RemoveFromAssume(); + continue; + } + } + auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}]; + for (MapValue &Elem : Lookup) { + if (!isValidAssumeForContext(Elem.Assume, Assume, DT)) + continue; + if (Elem.ArgValue >= RK.ArgValue) { + RemoveFromAssume(); + continue; + } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) { + Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set( + ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue)); + MadeChange = true; + RemoveFromAssume(); + continue; + } + } + Lookup.push_back({Assume, RK.ArgValue, &BOI}); + } + } + } + + using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator; + + /// Merge all Assumes from Begin to End in and insert the resulting assume as + /// high as possible in the basicblock. + void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) { + if (Begin == End || std::next(Begin) == End) + return; + /// Provide no additional information so that AssumeBuilderState doesn't + /// try to do any punning since it already has been done better. + AssumeBuilderState Builder(F.getParent()); + + /// For now it is initialized to the best value it could have + Instruction *InsertPt = BB->getFirstNonPHI(); + if (isa<LandingPadInst>(InsertPt)) + InsertPt = InsertPt->getNextNode(); + for (IntrinsicInst *I : make_range(Begin, End)) { + CleanupToDo.insert(I); + for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) { + RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI); + if (!RK) + continue; + Builder.addKnowledge(RK); + if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn)) + if (I->getParent() == InsertPt->getParent() && + (InsertPt->comesBefore(I) || InsertPt == I)) + InsertPt = I->getNextNode(); + } + } + + /// Adjust InsertPt if it is before Begin, since mergeAssumes only + /// guarantees we can place the resulting assume between Begin and End. 
+ if (InsertPt->comesBefore(*Begin)) + for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator(); + It != E; --It) + if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) { + InsertPt = It->getNextNode(); + break; + } + IntrinsicInst *MergedAssume = Builder.build(); + if (!MergedAssume) + return; + MadeChange = true; + MergedAssume->insertBefore(InsertPt); + AC.registerAssumption(MergedAssume); + } + + /// Merge assume when they are in the same BasicBlock and for all instruction + /// between them isGuaranteedToTransferExecutionToSuccessor returns true. + void mergeAssumes() { + buildMapping(true); + + SmallVector<MergeIterator, 4> SplitPoints; + for (auto &Elem : BBToAssume) { + SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second; + if (AssumesInBB.size() < 2) + continue; + /// AssumesInBB is already sorted by order in the block. + + BasicBlock::iterator It = AssumesInBB.front()->getIterator(); + BasicBlock::iterator E = AssumesInBB.back()->getIterator(); + SplitPoints.push_back(AssumesInBB.begin()); + MergeIterator LastSplit = AssumesInBB.begin(); + for (; It != E; ++It) + if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) { + for (; (*LastSplit)->comesBefore(&*It); ++LastSplit) + ; + if (SplitPoints.back() != LastSplit) + SplitPoints.push_back(LastSplit); + } + SplitPoints.push_back(AssumesInBB.end()); + for (auto SplitIt = SplitPoints.begin(); + SplitIt != std::prev(SplitPoints.end()); SplitIt++) { + mergeRange(Elem.first, *SplitIt, *(SplitIt + 1)); + } + SplitPoints.clear(); + } + } +}; + +bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) { + AssumeSimplify AS(F, *AC, DT, F.getContext()); + + /// Remove knowledge that is already known by a dominating other assume or an + /// attribute. + AS.dropRedundantKnowledge(); + + /// Remove assume that are empty. + AS.RunCleanup(false); + + /// Merge assume in the same basicblock when possible. + AS.mergeAssumes(); + + /// Remove assume that were merged. + AS.RunCleanup(true); + return AS.MadeChange; +} + +} // namespace + +PreservedAnalyses AssumeSimplifyPass::run(Function &F, + FunctionAnalysisManager &AM) { + if (!EnableKnowledgeRetention) + return PreservedAnalyses::all(); + simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F), + AM.getCachedResult<DominatorTreeAnalysis>(F)); + return PreservedAnalyses::all(); +} + +namespace { +class AssumeSimplifyPassLegacyPass : public FunctionPass { +public: + static char ID; + + AssumeSimplifyPassLegacyPass() : FunctionPass(ID) { + initializeAssumeSimplifyPassLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F) || !EnableKnowledgeRetention) + return false; + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + return simplifyAssumes(F, &AC, DTWP ? 
&DTWP->getDomTree() : nullptr); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + + AU.setPreservesAll(); + } +}; +} // namespace + +char AssumeSimplifyPassLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify", + "Assume Simplify", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify", + "Assume Simplify", false, false) + +FunctionPass *llvm::createAssumeSimplifyPass() { + return new AssumeSimplifyPassLegacyPass(); +} + +PreservedAnalyses AssumeBuilderPass::run(Function &F, + FunctionAnalysisManager &AM) { + AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F); + DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F); + for (Instruction &I : instructions(F)) + salvageKnowledge(&I, AC, DT); + return PreservedAnalyses::all(); +} + +namespace { +class AssumeBuilderPassLegacyPass : public FunctionPass { +public: + static char ID; + + AssumeBuilderPassLegacyPass() : FunctionPass(ID) { + initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + for (Instruction &I : instructions(F)) + salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr); + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + + AU.setPreservesAll(); + } +}; +} // namespace + +char AssumeBuilderPassLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder", + "Assume Builder", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder", + "Assume Builder", false, false) diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index c9eb4abfa21ae..085d91031cf90 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -153,7 +153,8 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, } } -bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { +bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI, + MemorySSAUpdater *MSSAU) { // Recursively deleting a PHI may cause multiple PHIs to be deleted // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete. 
SmallVector<WeakTrackingVH, 8> PHIs; @@ -163,7 +164,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool Changed = false; for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*())) - Changed |= RecursivelyDeleteDeadPHINode(PN, TLI); + Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU); return Changed; } @@ -314,6 +315,31 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, return true; } +bool llvm::MergeBlockSuccessorsIntoGivenBlocks( + SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU, + LoopInfo *LI) { + assert(!MergeBlocks.empty() && "MergeBlocks should not be empty"); + + bool BlocksHaveBeenMerged = false; + while (!MergeBlocks.empty()) { + BasicBlock *BB = *MergeBlocks.begin(); + BasicBlock *Dest = BB->getSingleSuccessor(); + if (Dest && (!L || L->contains(Dest))) { + BasicBlock *Fold = Dest->getUniquePredecessor(); + (void)Fold; + if (MergeBlockIntoPredecessor(Dest, DTU, LI)) { + assert(Fold == BB && + "Expecting BB to be unique predecessor of the Dest block"); + MergeBlocks.erase(Dest); + BlocksHaveBeenMerged = true; + } else + MergeBlocks.erase(BB); + } else + MergeBlocks.erase(BB); + } + return BlocksHaveBeenMerged; +} + /// Remove redundant instructions within sequences of consecutive dbg.value /// instructions. This is done using a backward scan to keep the last dbg.value /// describing a specific variable/fragment. @@ -505,7 +531,8 @@ llvm::SplitAllCriticalEdges(Function &F, unsigned NumBroken = 0; for (BasicBlock &BB : F) { Instruction *TI = BB.getTerminator(); - if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI)) + if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) && + !isa<CallBrInst>(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (SplitCriticalEdge(TI, i, Options)) ++NumBroken; @@ -900,9 +927,25 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, Pred->getInstList().insert(NewRet->getIterator(), NewBC); *i = NewBC; } + + Instruction *NewEV = nullptr; + if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) { + V = EVI->getOperand(0); + NewEV = EVI->clone(); + if (NewBC) { + NewBC->setOperand(0, NewEV); + Pred->getInstList().insert(NewBC->getIterator(), NewEV); + } else { + Pred->getInstList().insert(NewRet->getIterator(), NewEV); + *i = NewEV; + } + } + if (PHINode *PN = dyn_cast<PHINode>(V)) { if (PN->getParent() == BB) { - if (NewBC) + if (NewEV) { + NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred)); + } else if (NewBC) NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred)); else *i = PN->getIncomingValueForBlock(Pred); @@ -1084,3 +1127,247 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, } return BI->getCondition(); } + +// After creating a control flow hub, the operands of PHINodes in an outgoing +// block Out no longer match the predecessors of that block. Predecessors of Out +// that are incoming blocks to the hub are now replaced by just one edge from +// the hub. To match this new control flow, the corresponding values from each +// PHINode must now be moved a new PHINode in the first guard block of the hub. +// +// This operation cannot be performed with SSAUpdater, because it involves one +// new use: If the block Out is in the list of Incoming blocks, then the newly +// created PHI in the Hub will use itself along that edge from Out to Hub. 
+static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock, + const SetVector<BasicBlock *> &Incoming, + BasicBlock *FirstGuardBlock) { + auto I = Out->begin(); + while (I != Out->end() && isa<PHINode>(I)) { + auto Phi = cast<PHINode>(I); + auto NewPhi = + PHINode::Create(Phi->getType(), Incoming.size(), + Phi->getName() + ".moved", &FirstGuardBlock->back()); + for (auto In : Incoming) { + Value *V = UndefValue::get(Phi->getType()); + if (In == Out) { + V = NewPhi; + } else if (Phi->getBasicBlockIndex(In) != -1) { + V = Phi->removeIncomingValue(In, false); + } + NewPhi->addIncoming(V, In); + } + assert(NewPhi->getNumIncomingValues() == Incoming.size()); + if (Phi->getNumOperands() == 0) { + Phi->replaceAllUsesWith(NewPhi); + I = Phi->eraseFromParent(); + continue; + } + Phi->addIncoming(NewPhi, GuardBlock); + ++I; + } +} + +using BBPredicates = DenseMap<BasicBlock *, PHINode *>; +using BBSetVector = SetVector<BasicBlock *>; + +// Redirects the terminator of the incoming block to the first guard +// block in the hub. The condition of the original terminator (if it +// was conditional) and its original successors are returned as a +// tuple <condition, succ0, succ1>. The function additionally filters +// out successors that are not in the set of outgoing blocks. +// +// - condition is non-null iff the branch is conditional. +// - Succ1 is non-null iff the sole/taken target is an outgoing block. +// - Succ2 is non-null iff condition is non-null and the fallthrough +// target is an outgoing block. +static std::tuple<Value *, BasicBlock *, BasicBlock *> +redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock, + const BBSetVector &Outgoing) { + auto Branch = cast<BranchInst>(BB->getTerminator()); + auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr; + + BasicBlock *Succ0 = Branch->getSuccessor(0); + BasicBlock *Succ1 = nullptr; + Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr; + + if (Branch->isUnconditional()) { + Branch->setSuccessor(0, FirstGuardBlock); + assert(Succ0); + } else { + Succ1 = Branch->getSuccessor(1); + Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr; + assert(Succ0 || Succ1); + if (Succ0 && !Succ1) { + Branch->setSuccessor(0, FirstGuardBlock); + } else if (Succ1 && !Succ0) { + Branch->setSuccessor(1, FirstGuardBlock); + } else { + Branch->eraseFromParent(); + BranchInst::Create(FirstGuardBlock, BB); + } + } + + assert(Succ0 || Succ1); + return std::make_tuple(Condition, Succ0, Succ1); +} + +// Capture the existing control flow as guard predicates, and redirect +// control flow from every incoming block to the first guard block in +// the hub. +// +// There is one guard predicate for each outgoing block OutBB. The +// predicate is a PHINode with one input for each InBB which +// represents whether the hub should transfer control flow to OutBB if +// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub +// evaluates them in the same order as the Outgoing set-vector, and +// control branches to the first outgoing block whose predicate +// evaluates to true. +static void convertToGuardPredicates( + BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates, + SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming, + const BBSetVector &Outgoing) { + auto &Context = Incoming.front()->getContext(); + auto BoolTrue = ConstantInt::getTrue(Context); + auto BoolFalse = ConstantInt::getFalse(Context); + + // The predicate for the last outgoing is trivially true, and so we + // process only the first N-1 successors. 
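(Illustrative note, not part of the patch: a plain C++ model, with made-up names, of the dispatch order these guard predicates encode. The hub tests the guards in the order of the Outgoing set and branches to the first outgoing block whose predicate is true; the last outgoing block needs no predicate because it is the fall-through.)

#include <cstdio>
#include <vector>

// Given predicates for the first N-1 outgoing blocks, return the index of
// the block the hub transfers control to.
static int dispatch(const std::vector<bool> &GuardPredicates, int NumOutgoing) {
  for (int I = 0; I + 1 < NumOutgoing; ++I)
    if (GuardPredicates[I])
      return I;                  // first true predicate wins
  return NumOutgoing - 1;        // trivially-true last block
}

int main() {
  std::printf("%d\n", dispatch({false, true}, 3));  // prints 1
  std::printf("%d\n", dispatch({false, false}, 3)); // prints 2
  return 0;
}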
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { + auto Out = Outgoing[i]; + LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n"); + auto Phi = + PHINode::Create(Type::getInt1Ty(Context), Incoming.size(), + StringRef("Guard.") + Out->getName(), FirstGuardBlock); + GuardPredicates[Out] = Phi; + } + + for (auto In : Incoming) { + Value *Condition; + BasicBlock *Succ0; + BasicBlock *Succ1; + std::tie(Condition, Succ0, Succ1) = + redirectToHub(In, FirstGuardBlock, Outgoing); + + // Optimization: Consider an incoming block A with both successors + // Succ0 and Succ1 in the set of outgoing blocks. The predicates + // for Succ0 and Succ1 complement each other. If Succ0 is visited + // first in the loop below, control will branch to Succ0 using the + // corresponding predicate. But if that branch is not taken, then + // control must reach Succ1, which means that the predicate for + // Succ1 is always true. + bool OneSuccessorDone = false; + for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { + auto Out = Outgoing[i]; + auto Phi = GuardPredicates[Out]; + if (Out != Succ0 && Out != Succ1) { + Phi->addIncoming(BoolFalse, In); + continue; + } + // Optimization: When only one successor is an outgoing block, + // the predicate is always true. + if (!Succ0 || !Succ1 || OneSuccessorDone) { + Phi->addIncoming(BoolTrue, In); + continue; + } + assert(Succ0 && Succ1); + OneSuccessorDone = true; + if (Out == Succ0) { + Phi->addIncoming(Condition, In); + continue; + } + auto Inverted = invertCondition(Condition); + DeletionCandidates.push_back(Condition); + Phi->addIncoming(Inverted, In); + } + } +} + +// For each outgoing block OutBB, create a guard block in the Hub. The +// first guard block was already created outside, and available as the +// first element in the vector of guard blocks. +// +// Each guard block terminates in a conditional branch that transfers +// control to the corresponding outgoing block or the next guard +// block. The last guard block has two outgoing blocks as successors +// since the condition for the final outgoing block is trivially +// true. So we create one less block (including the first guard block) +// than the number of outgoing blocks. +static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks, + Function *F, const BBSetVector &Outgoing, + BBPredicates &GuardPredicates, StringRef Prefix) { + for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) { + GuardBlocks.push_back( + BasicBlock::Create(F->getContext(), Prefix + ".guard", F)); + } + assert(GuardBlocks.size() == GuardPredicates.size()); + + // To help keep the loop simple, temporarily append the last + // outgoing block to the list of guard blocks. + GuardBlocks.push_back(Outgoing.back()); + + for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) { + auto Out = Outgoing[i]; + assert(GuardPredicates.count(Out)); + BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out], + GuardBlocks[i]); + } + + // Remove the last block from the guard list. 
+ GuardBlocks.pop_back(); +} + +BasicBlock *llvm::CreateControlFlowHub( + DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks, + const BBSetVector &Incoming, const BBSetVector &Outgoing, + const StringRef Prefix) { + auto F = Incoming.front()->getParent(); + auto FirstGuardBlock = + BasicBlock::Create(F->getContext(), Prefix + ".guard", F); + + SmallVector<DominatorTree::UpdateType, 16> Updates; + if (DTU) { + for (auto In : Incoming) { + for (auto Succ : successors(In)) { + if (Outgoing.count(Succ)) + Updates.push_back({DominatorTree::Delete, In, Succ}); + } + Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock}); + } + } + + BBPredicates GuardPredicates; + SmallVector<WeakVH, 8> DeletionCandidates; + convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates, + Incoming, Outgoing); + + GuardBlocks.push_back(FirstGuardBlock); + createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix); + + // Update the PHINodes in each outgoing block to match the new control flow. + for (int i = 0, e = GuardBlocks.size(); i != e; ++i) { + reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock); + } + reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock); + + if (DTU) { + int NumGuards = GuardBlocks.size(); + assert((int)Outgoing.size() == NumGuards + 1); + for (int i = 0; i != NumGuards - 1; ++i) { + Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]}); + Updates.push_back( + {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]}); + } + Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], + Outgoing[NumGuards - 1]}); + Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], + Outgoing[NumGuards]}); + DTU->applyUpdates(Updates); + } + + for (auto I : DeletionCandidates) { + if (I->use_empty()) + if (auto Inst = dyn_cast_or_null<Instruction>(I)) + Inst->eraseFromParent(); + } + + return FirstGuardBlock; +} diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 008cea333e6b3..39fb504cf7b75 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -150,14 +150,51 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, // it in this generic function. if (DestBB->isEHPad()) return nullptr; - // Don't split the non-fallthrough edge from a callbr. - if (isa<CallBrInst>(TI) && SuccNum > 0) - return nullptr; - if (Options.IgnoreUnreachableDests && isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime())) return nullptr; + auto *LI = Options.LI; + SmallVector<BasicBlock *, 4> LoopPreds; + // Check if extra modifications will be required to preserve loop-simplify + // form after splitting. If it would require splitting blocks with IndirectBr + // terminators, bail out if preserving loop-simplify form is requested. + if (LI) { + if (Loop *TIL = LI->getLoopFor(TIBB)) { + + // The only that we can break LoopSimplify form by splitting a critical + // edge is if after the split there exists some edge from TIL to DestBB + // *and* the only edge into DestBB from outside of TIL is that of + // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB + // is the new exit block and it has no non-loop predecessors. If the + // second isn't true, then DestBB was not in LoopSimplify form prior to + // the split as it had a non-loop predecessor. 
In both of these cases, + // the predecessor must be directly in TIL, not in a subloop, or again + // LoopSimplify doesn't hold. + for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E; + ++I) { + BasicBlock *P = *I; + if (P == TIBB) + continue; // The new block is known. + if (LI->getLoopFor(P) != TIL) { + // No need to re-simplify, it wasn't to start with. + LoopPreds.clear(); + break; + } + LoopPreds.push_back(P); + } + // Loop-simplify form can be preserved, if we can split all in-loop + // predecessors. + if (any_of(LoopPreds, [](BasicBlock *Pred) { + return isa<IndirectBrInst>(Pred->getTerminator()); + })) { + if (Options.PreserveLoopSimplify) + return nullptr; + LoopPreds.clear(); + } + } + } + // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), TIBB->getName() + "." + DestBB->getName() + "_crit_edge"); @@ -165,14 +202,14 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, BranchInst *NewBI = BranchInst::Create(DestBB, NewBB); NewBI->setDebugLoc(TI->getDebugLoc()); - // Branch to the new block, breaking the edge. - TI->setSuccessor(SuccNum, NewBB); - // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); Function::iterator FBBI = TIBB->getIterator(); F.getBasicBlockList().insert(++FBBI, NewBB); + // Branch to the new block, breaking the edge. + TI->setSuccessor(SuccNum, NewBB); + // If there are any PHI nodes in DestBB, we need to update them so that they // merge incoming values from NewBB instead of from TIBB. { @@ -212,7 +249,6 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, // If we have nothing to update, just return. auto *DT = Options.DT; auto *PDT = Options.PDT; - auto *LI = Options.LI; auto *MSSAU = Options.MSSAU; if (MSSAU) MSSAU->wireOldPredecessorsToNewImmediatePredecessor( @@ -281,28 +317,6 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); } - // The only that we can break LoopSimplify form by splitting a critical - // edge is if after the split there exists some edge from TIL to DestBB - // *and* the only edge into DestBB from outside of TIL is that of - // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB - // is the new exit block and it has no non-loop predecessors. If the - // second isn't true, then DestBB was not in LoopSimplify form prior to - // the split as it had a non-loop predecessor. In both of these cases, - // the predecessor must be directly in TIL, not in a subloop, or again - // LoopSimplify doesn't hold. - SmallVector<BasicBlock *, 4> LoopPreds; - for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E; - ++I) { - BasicBlock *P = *I; - if (P == NewBB) - continue; // The new block is known. - if (LI->getLoopFor(P) != TIL) { - // No need to re-simplify, it wasn't to start with. - LoopPreds.clear(); - break; - } - LoopPreds.push_back(P); - } if (!LoopPreds.empty()) { assert(!DestBB->isEHPad() && "We don't split edges to EH pads!"); BasicBlock *NewExitBB = SplitBlockPredecessors( @@ -388,13 +402,20 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, if (FirstNonPHI->isEHPad() || Target->isLandingPad()) continue; + // Remember edge probabilities if needed. 
+ SmallVector<BranchProbability, 4> EdgeProbabilities; + if (ShouldUpdateAnalysis) { + EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors()); + for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors(); + I < E; ++I) + EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I)); + BPI->eraseBlock(Target); + } + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); if (ShouldUpdateAnalysis) { // Copy the BFI/BPI from Target to BodyBlock. - for (unsigned I = 0, E = BodyBlock->getTerminator()->getNumSuccessors(); - I < E; ++I) - BPI->setEdgeProbability(BodyBlock, I, - BPI->getEdgeProbability(Target, I)); + BPI->setEdgeProbability(BodyBlock, EdgeProbabilities); BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency()); } // It's possible Target was its own successor through an indirectbr. @@ -423,7 +444,6 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, BlockFrequency NewBlockFreqForTarget = BFI->getBlockFreq(Target) - BlockFreqForDirectSucc; BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency()); - BPI->eraseBlock(Target); } // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 71316ce8f7583..c64ad147fdfec 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -378,6 +378,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; + case LibFunc_aligned_alloc: + Changed |= setDoesNotThrow(F); + Changed |= setRetDoesNotAlias(F); + return Changed; case LibFunc_bcopy: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); @@ -819,14 +823,14 @@ StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, //- Emit LibCalls ------------------------------------------------------------// -Value *llvm::castToCStr(Value *V, IRBuilder<> &B) { +Value *llvm::castToCStr(Value *V, IRBuilderBase &B) { unsigned AS = V->getType()->getPointerAddressSpace(); return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr"); } static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType, ArrayRef<Type *> ParamTypes, - ArrayRef<Value *> Operands, IRBuilder<> &B, + ArrayRef<Value *> Operands, IRBuilderBase &B, const TargetLibraryInfo *TLI, bool IsVaArgs = false) { if (!TLI->has(TheLibFunc)) @@ -844,20 +848,20 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType, return CI; } -Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, +Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context), B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI); } -Value *llvm::emitStrDup(Value *Ptr, IRBuilder<> &B, +Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI); } -Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, +Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); @@ -865,7 +869,7 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI); } -Value 
*llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, +Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); return emitLibCall( @@ -874,28 +878,28 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI); } -Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, +Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr}, {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI); } -Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilder<> &B, +Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr}, {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI); } -Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, +Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()}, {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI); } -Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, +Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()}, @@ -903,7 +907,7 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, } Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, - IRBuilder<> &B, const DataLayout &DL, + IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_memcpy_chk)) return nullptr; @@ -926,7 +930,7 @@ Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, return CI; } -Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B, +Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); return emitLibCall( @@ -935,7 +939,7 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B, {castToCStr(Ptr, B), Val, Len}, B, TLI); } -Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, +Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); return emitLibCall( @@ -944,7 +948,7 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI); } -Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, +Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVMContext &Context = B.GetInsertBlock()->getContext(); return emitLibCall( @@ -954,7 +958,7 @@ Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, } Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len, - IRBuilder<> &B, const TargetLibraryInfo *TLI) { + 
IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall( LibFunc_memccpy, B.getInt8PtrTy(), {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()}, @@ -962,7 +966,7 @@ Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len, } Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt, - ArrayRef<Value *> VariadicArgs, IRBuilder<> &B, + ArrayRef<Value *> VariadicArgs, IRBuilderBase &B, const TargetLibraryInfo *TLI) { SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)}; Args.insert(Args.end(), VariadicArgs.begin(), VariadicArgs.end()); @@ -972,7 +976,7 @@ Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt, } Value *llvm::emitSPrintf(Value *Dest, Value *Fmt, - ArrayRef<Value *> VariadicArgs, IRBuilder<> &B, + ArrayRef<Value *> VariadicArgs, IRBuilderBase &B, const TargetLibraryInfo *TLI) { SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)}; Args.insert(Args.end(), VariadicArgs.begin(), VariadicArgs.end()); @@ -981,28 +985,28 @@ Value *llvm::emitSPrintf(Value *Dest, Value *Fmt, /*IsVaArgs=*/true); } -Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilder<> &B, +Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(), {B.getInt8PtrTy(), B.getInt8PtrTy()}, {castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI); } -Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B, +Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_strlcpy, Size->getType(), {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()}, {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI); } -Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B, +Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_strlcat, Size->getType(), {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()}, {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI); } -Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B, +Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(), {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()}, @@ -1010,7 +1014,7 @@ Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B, } Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList, - IRBuilder<> &B, const TargetLibraryInfo *TLI) { + IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall( LibFunc_vsnprintf, B.getInt32Ty(), {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()}, @@ -1018,7 +1022,7 @@ Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList, } Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList, - IRBuilder<> &B, const TargetLibraryInfo *TLI) { + IRBuilderBase &B, const TargetLibraryInfo *TLI) { return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(), {B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()}, {castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI); @@ -1040,7 +1044,7 @@ static void appendTypeSuffix(Value *Op, StringRef &Name, } static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, - IRBuilder<> &B, + IRBuilderBase &B, const AttributeList &Attrs) { assert((Name != "") && 
"Must specify Name to emitUnaryFloatFnCall"); @@ -1062,7 +1066,7 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, return CI; } -Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, +Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { SmallString<20> NameBuffer; appendTypeSuffix(Op, Name, NameBuffer); @@ -1072,7 +1076,7 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn, IRBuilder<> &B, + LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. StringRef Name = getFloatFnName(TLI, Op->getType(), @@ -1082,7 +1086,7 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, } static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, - StringRef Name, IRBuilder<> &B, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); @@ -1105,7 +1109,8 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, } Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilder<> &B, const AttributeList &Attrs) { + IRBuilderBase &B, + const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); SmallString<20> NameBuffer; @@ -1117,7 +1122,7 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, const TargetLibraryInfo *TLI, LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn, IRBuilder<> &B, + LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. 
StringRef Name = getFloatFnName(TLI, Op1->getType(), @@ -1126,7 +1131,7 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); } -Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, +Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_putchar)) return nullptr; @@ -1149,7 +1154,7 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, return CI; } -Value *llvm::emitPutS(Value *Str, IRBuilder<> &B, +Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_puts)) return nullptr; @@ -1166,7 +1171,7 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B, return CI; } -Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B, +Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_fputc)) return nullptr; @@ -1187,27 +1192,7 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B, return CI; } -Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B, - const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputc_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - StringRef FPutcUnlockedName = TLI->getName(LibFunc_fputc_unlocked); - FunctionCallee F = M->getOrInsertFunction(FPutcUnlockedName, B.getInt32Ty(), - B.getInt32Ty(), File->getType()); - if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutcUnlockedName, *TLI); - Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari"); - CallInst *CI = B.CreateCall(F, {Char, File}, FPutcUnlockedName); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} - -Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B, +Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_fputs)) return nullptr; @@ -1226,26 +1211,7 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B, return CI; } -Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B, - const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputs_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - StringRef FPutsUnlockedName = TLI->getName(LibFunc_fputs_unlocked); - FunctionCallee F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(), - B.getInt8PtrTy(), File->getType()); - if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutsUnlockedName, *TLI); - CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsUnlockedName); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} - -Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, +Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_fwrite)) return nullptr; @@ -1269,7 +1235,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, return CI; } -Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL, +Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_malloc)) return nullptr; @@ -1290,7 +1256,7 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> 
&B, const DataLayout &DL, } Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs, - IRBuilder<> &B, const TargetLibraryInfo &TLI) { + IRBuilderBase &B, const TargetLibraryInfo &TLI) { if (!TLI.has(LibFunc_calloc)) return nullptr; @@ -1309,88 +1275,3 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs, return CI; } - -Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, - IRBuilder<> &B, const DataLayout &DL, - const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fwrite_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - LLVMContext &Context = B.GetInsertBlock()->getContext(); - StringRef FWriteUnlockedName = TLI->getName(LibFunc_fwrite_unlocked); - FunctionCallee F = M->getOrInsertFunction( - FWriteUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); - - if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FWriteUnlockedName, *TLI); - CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File}); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} - -Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B, - const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fgetc_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - StringRef FGetCUnlockedName = TLI->getName(LibFunc_fgetc_unlocked); - FunctionCallee F = M->getOrInsertFunction(FGetCUnlockedName, B.getInt32Ty(), - File->getType()); - if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FGetCUnlockedName, *TLI); - CallInst *CI = B.CreateCall(F, File, FGetCUnlockedName); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} - -Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File, - IRBuilder<> &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fgets_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - StringRef FGetSUnlockedName = TLI->getName(LibFunc_fgets_unlocked); - FunctionCallee F = - M->getOrInsertFunction(FGetSUnlockedName, B.getInt8PtrTy(), - B.getInt8PtrTy(), B.getInt32Ty(), File->getType()); - inferLibFuncAttributes(M, FGetSUnlockedName, *TLI); - CallInst *CI = - B.CreateCall(F, {castToCStr(Str, B), Size, File}, FGetSUnlockedName); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} - -Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, - IRBuilder<> &B, const DataLayout &DL, - const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fread_unlocked)) - return nullptr; - - Module *M = B.GetInsertBlock()->getModule(); - LLVMContext &Context = B.GetInsertBlock()->getContext(); - StringRef FReadUnlockedName = TLI->getName(LibFunc_fread_unlocked); - FunctionCallee F = M->getOrInsertFunction( - FReadUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); - - if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FReadUnlockedName, *TLI); - CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File}); - - if (const Function *Fn = - dyn_cast<Function>(F.getCallee()->stripPointerCasts())) - CI->setCallingConv(Fn->getCallingConv()); - return CI; -} 
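The library-call emitter helpers above (declared in llvm/Transforms/Utils/BuildLibCalls.h) only widen their builder parameter from IRBuilder<> to the IRBuilderBase base class, so a concrete IRBuilder<> still binds to them. A minimal caller-side sketch, not taken from this commit (the pass context and the OldCall/Str/TLI values are assumed), of using emitPutS after the change:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

using namespace llvm;

// Rewrite a call that is known to print a plain C string followed by a
// newline as puts(Str). Assumes OldCall returns i32, as printf does.
static bool replaceWithPutS(CallInst *OldCall, Value *Str,
                            const TargetLibraryInfo &TLI) {
  IRBuilder<> B(OldCall);               // binds to the IRBuilderBase & parameter
  Value *PutS = emitPutS(Str, B, &TLI); // nullptr if the target has no puts()
  if (!PutS)
    return false;
  OldCall->replaceAllUsesWith(PutS);    // both calls produce an i32
  OldCall->eraseFromParent();
  return true;
}

Taking IRBuilderBase presumably lets these helpers be shared by every IRBuilder instantiation regardless of its folder or inserter, rather than being tied to the default IRBuilder<> template arguments.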
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 9a6761040bd89..833d04210629d 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -213,9 +213,8 @@ bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { return false; // Do not visit nodes that have been visited already. We return true because // it means that we couldn't find any value that doesn't look hash-like. - if (Visited.find(I) != Visited.end()) + if (!Visited.insert(I).second) return true; - Visited.insert(I); return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { // Ignore undef values as they probably don't affect the division // operands. @@ -264,6 +263,7 @@ QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", MainBB->getParent(), SuccessorBB); IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); + Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); @@ -287,6 +287,7 @@ QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", MainBB->getParent(), SuccessorBB); IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); + Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); @@ -312,6 +313,7 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, BasicBlock *PhiBB) { IRBuilder<> Builder(PhiBB, PhiBB->begin()); + Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); QuoPhi->addIncoming(LHS.Quotient, LHS.BB); QuoPhi->addIncoming(RHS.Quotient, RHS.BB); @@ -328,6 +330,7 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { assert((Op1 || Op2) && "Nothing to check"); IRBuilder<> Builder(MainBB, MainBB->end()); + Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); Value *OrV; if (Op1 && Op2) @@ -396,6 +399,9 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { isa<ConstantInt>(BCI->getOperand(0))) return None; + IRBuilder<> Builder(MainBB, MainBB->end()); + Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); + if (DividendShort && !isSignedOp()) { // If the division is unsigned and Dividend is known to be short, then // either @@ -418,7 +424,6 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { Long.Remainder = Dividend; QuotRemWithBB Fast = createFastBB(SuccessorBB); QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); - IRBuilder<> Builder(MainBB, MainBB->end()); Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); return Result; @@ -435,7 +440,6 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, DivisorShort ? 
nullptr : Divisor); - IRBuilder<> Builder(MainBB, MainBB->end()); Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); return Result; } diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp new file mode 100644 index 0000000000000..52e859361c598 --- /dev/null +++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp @@ -0,0 +1,167 @@ +//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides interfaces used to manipulate a call graph, regardless +/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +bool CallGraphUpdater::finalize() { + if (!DeadFunctionsInComdats.empty()) { + filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(), + DeadFunctionsInComdats); + DeadFunctions.append(DeadFunctionsInComdats.begin(), + DeadFunctionsInComdats.end()); + } + + if (CG) { + // First remove all references, e.g., outgoing via called functions. This is + // necessary as we can delete functions that have circular references. + for (Function *DeadFn : DeadFunctions) { + DeadFn->removeDeadConstantUsers(); + CallGraphNode *DeadCGN = (*CG)[DeadFn]; + DeadCGN->removeAllCalledFunctions(); + CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN); + DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType())); + } + + // Then remove the node and function from the module. + for (Function *DeadFn : DeadFunctions) { + CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn); + assert(DeadCGN->getNumReferences() == 0 && + "References should have been handled by now"); + delete CG->removeFunctionFromModule(DeadCGN); + } + } else { + // This is the code path for the new lazy call graph and for the case where + // no call graph was provided. + for (Function *DeadFn : DeadFunctions) { + DeadFn->removeDeadConstantUsers(); + DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType())); + + if (LCG && !ReplacedFunctions.count(DeadFn)) { + // Taken mostly from the inliner: + LazyCallGraph::Node &N = LCG->get(*DeadFn); + auto *DeadSCC = LCG->lookupSCC(N); + assert(DeadSCC && DeadSCC->size() == 1 && + &DeadSCC->begin()->getFunction() == DeadFn); + auto &DeadRC = DeadSCC->getOuterRefSCC(); + + FunctionAnalysisManager &FAM = + AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG) + .getManager(); + + FAM.clear(*DeadFn, DeadFn->getName()); + AM->clear(*DeadSCC, DeadSCC->getName()); + LCG->removeDeadFunction(*DeadFn); + + // Mark the relevant parts of the call graph as invalid so we don't + // visit them. + UR->InvalidatedSCCs.insert(DeadSCC); + UR->InvalidatedRefSCCs.insert(&DeadRC); + } + + // The function is now really dead and detached from everything.
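+      // Usage sketch (illustrative only, not part of this file): a CGSCC pass
+      // typically queues dead functions while it runs and lets finalize() do
+      // the erasure, roughly:
+      //   CallGraphUpdater CGUpdater;
+      //   CGUpdater.initialize(LCG, SCC, AM, UR); // or the CallGraph overload
+      //   ...
+      //   CGUpdater.removeFunction(DeadF);        // only queues DeadF
+      //   ...
+      //   bool Changed = CGUpdater.finalize();    // queued functions erased here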
+ DeadFn->eraseFromParent(); + } + } + + bool Changed = !DeadFunctions.empty(); + DeadFunctionsInComdats.clear(); + DeadFunctions.clear(); + return Changed; +} + +void CallGraphUpdater::reanalyzeFunction(Function &Fn) { + if (CG) { + CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn); + OldCGN->removeAllCalledFunctions(); + CG->populateCallGraphNode(OldCGN); + } else if (LCG) { + LazyCallGraph::Node &N = LCG->get(Fn); + LazyCallGraph::SCC *C = LCG->lookupSCC(N); + updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM); + } +} + +void CallGraphUpdater::registerOutlinedFunction(Function &NewFn) { + if (CG) + CG->addToCallGraph(&NewFn); + else if (LCG) + LCG->addNewFunctionIntoSCC(NewFn, *SCC); +} + +void CallGraphUpdater::removeFunction(Function &DeadFn) { + DeadFn.deleteBody(); + DeadFn.setLinkage(GlobalValue::ExternalLinkage); + if (DeadFn.hasComdat()) + DeadFunctionsInComdats.push_back(&DeadFn); + else + DeadFunctions.push_back(&DeadFn); + + // For the old call graph we remove the function from the SCC right away. + if (CG && !ReplacedFunctions.count(&DeadFn)) { + CallGraphNode *DeadCGN = (*CG)[&DeadFn]; + DeadCGN->removeAllCalledFunctions(); + CGSCC->DeleteNode(DeadCGN); + } +} + +void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) { + OldFn.removeDeadConstantUsers(); + ReplacedFunctions.insert(&OldFn); + if (CG) { + // Update the call graph for the newly promoted function. + CallGraphNode *OldCGN = (*CG)[&OldFn]; + CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn); + NewCGN->stealCalledFunctionsFrom(OldCGN); + CG->ReplaceExternalCallEdge(OldCGN, NewCGN); + + // And update the SCC we're iterating as well. + CGSCC->ReplaceNode(OldCGN, NewCGN); + } else if (LCG) { + // Directly substitute the functions in the call graph. + LazyCallGraph::Node &OldLCGN = LCG->get(OldFn); + SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn); + } + removeFunction(OldFn); +} + +bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) { + // This is only necessary in the (old) CG. + if (!CG) + return true; + + Function *Caller = OldCS.getCaller(); + CallGraphNode *NewCalleeNode = + CG->getOrInsertFunction(NewCS.getCalledFunction()); + CallGraphNode *CallerNode = (*CG)[Caller]; + if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) { + return CR.first && *CR.first == &OldCS; + })) + return false; + CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode); + return true; +} + +void CallGraphUpdater::removeCallSite(CallBase &CS) { + // This is only necessary in the (old) CG. + if (!CG) + return; + + Function *Caller = CS.getCaller(); + CallGraphNode *CallerNode = (*CG)[Caller]; + CallerNode->removeCallEdgeFor(CS); +} diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index f04d76e70c0da..5a47c1fd0b6cb 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -12,7 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CallPromotionUtils.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -158,32 +161,31 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst, /// %t1 = bitcast i32 %t0 to ... 
/// br label %normal_dst /// -static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) { +static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) { // Save the users of the calling instruction. These uses will be changed to // use the bitcast after we create it. SmallVector<User *, 16> UsersToUpdate; - for (User *U : CS.getInstruction()->users()) + for (User *U : CB.users()) UsersToUpdate.push_back(U); // Determine an appropriate location to create the bitcast for the return // value. The location depends on if we have a call or invoke instruction. Instruction *InsertBefore = nullptr; - if (auto *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) + if (auto *Invoke = dyn_cast<InvokeInst>(&CB)) InsertBefore = &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front(); else - InsertBefore = &*std::next(CS.getInstruction()->getIterator()); + InsertBefore = &*std::next(CB.getIterator()); // Bitcast the return value to the correct type. - auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "", - InsertBefore); + auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore); if (RetBitCast) *RetBitCast = Cast; // Replace all the original uses of the calling instruction with the bitcast. for (User *U : UsersToUpdate) - U->replaceUsesOfWith(CS.getInstruction(), Cast); + U->replaceUsesOfWith(&CB, Cast); } /// Predicate and clone the given call site. @@ -253,26 +255,91 @@ static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) { /// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] /// br %normal_dst /// -static Instruction *versionCallSite(CallSite CS, Value *Callee, - MDNode *BranchWeights) { - - IRBuilder<> Builder(CS.getInstruction()); - Instruction *OrigInst = CS.getInstruction(); +/// An indirect musttail call is processed slightly differently in that: +/// 1. No merge block needed for the orginal and the cloned callsite, since +/// either one ends the flow. No phi node is needed either. +/// 2. The return statement following the original call site is duplicated too +/// and placed immediately after the cloned call site per the IR convention. +/// +/// For example, the musttail call instruction below: +/// +/// orig_bb: +/// %t0 = musttail call i32 %ptr() +/// ... +/// +/// Is replaced by the following: +/// +/// cond_bb: +/// %cond = icmp eq i32 ()* %ptr, @func +/// br i1 %cond, %then_bb, %orig_bb +/// +/// then_bb: +/// ; The clone of the original call instruction is placed in the "then" +/// ; block. It is not yet promoted. +/// %t1 = musttail call i32 %ptr() +/// ret %t1 +/// +/// orig_bb: +/// ; The original call instruction stays in its original block. +/// %t0 = musttail call i32 %ptr() +/// ret %t0 +static CallBase &versionCallSite(CallBase &CB, Value *Callee, + MDNode *BranchWeights) { + + IRBuilder<> Builder(&CB); + CallBase *OrigInst = &CB; BasicBlock *OrigBlock = OrigInst->getParent(); // Create the compare. The called value and callee must have the same type to // be compared. - if (CS.getCalledValue()->getType() != Callee->getType()) - Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType()); - auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee); + if (CB.getCalledOperand()->getType() != Callee->getType()) + Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType()); + auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee); + + if (OrigInst->isMustTailCall()) { + // Create an if-then structure. 
The original instruction stays in its block, + // and a clone of the original instruction is placed in the "then" block. + Instruction *ThenTerm = + SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights); + BasicBlock *ThenBlock = ThenTerm->getParent(); + ThenBlock->setName("if.true.direct_targ"); + CallBase *NewInst = cast<CallBase>(OrigInst->clone()); + NewInst->insertBefore(ThenTerm); + + // Place a clone of the optional bitcast after the new call site. + Value *NewRetVal = NewInst; + auto Next = OrigInst->getNextNode(); + if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) { + assert(BitCast->getOperand(0) == OrigInst && + "bitcast following musttail call must use the call"); + auto NewBitCast = BitCast->clone(); + NewBitCast->replaceUsesOfWith(OrigInst, NewInst); + NewBitCast->insertBefore(ThenTerm); + NewRetVal = NewBitCast; + Next = BitCast->getNextNode(); + } + + // Place a clone of the return instruction after the new call site. + ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next); + assert(Ret && "musttail call must precede a ret with an optional bitcast"); + auto NewRet = Ret->clone(); + if (Ret->getReturnValue()) + NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal); + NewRet->insertBefore(ThenTerm); + + // A return instructions is terminating, so we don't need the terminator + // instruction just created. + ThenTerm->eraseFromParent(); + + return *NewInst; + } // Create an if-then-else structure. The original instruction is moved into // the "else" block, and a clone of the original instruction is placed in the // "then" block. Instruction *ThenTerm = nullptr; Instruction *ElseTerm = nullptr; - SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm, - BranchWeights); + SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights); BasicBlock *ThenBlock = ThenTerm->getParent(); BasicBlock *ElseBlock = ElseTerm->getParent(); BasicBlock *MergeBlock = OrigInst->getParent(); @@ -281,7 +348,7 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, ElseBlock->setName("if.false.orig_indirect"); MergeBlock->setName("if.end.icp"); - Instruction *NewInst = OrigInst->clone(); + CallBase *NewInst = cast<CallBase>(OrigInst->clone()); OrigInst->moveBefore(ElseTerm); NewInst->insertBefore(ThenTerm); @@ -313,18 +380,18 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, // Create a phi node for the returned value of the call site. createRetPHINode(OrigInst, NewInst, MergeBlock, Builder); - return NewInst; + return *NewInst; } -bool llvm::isLegalToPromote(CallSite CS, Function *Callee, +bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason) { - assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted"); + assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted"); auto &DL = Callee->getParent()->getDataLayout(); // Check the return type. The callee's return value type must be bitcast // compatible with the call site's type. - Type *CallRetTy = CS.getInstruction()->getType(); + Type *CallRetTy = CB.getType(); Type *FuncRetTy = Callee->getReturnType(); if (CallRetTy != FuncRetTy) if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) { @@ -336,9 +403,12 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, // The number of formal arguments of the callee. unsigned NumParams = Callee->getFunctionType()->getNumParams(); + // The number of actual arguments in the call. 
+ unsigned NumArgs = CB.arg_size(); + // Check the number of arguments. The callee and call site must agree on the // number of arguments. - if (CS.arg_size() != NumParams && !Callee->isVarArg()) { + if (NumArgs != NumParams && !Callee->isVarArg()) { if (FailureReason) *FailureReason = "The number of arguments mismatch"; return false; @@ -347,9 +417,10 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, // Check the argument types. The callee's formal argument types must be // bitcast compatible with the corresponding actual argument types of the call // site. - for (unsigned I = 0; I < NumParams; ++I) { + unsigned I = 0; + for (; I < NumParams; ++I) { Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I); - Type *ActualTy = CS.getArgument(I)->getType(); + Type *ActualTy = CB.getArgOperand(I)->getType(); if (FormalTy == ActualTy) continue; if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) { @@ -358,35 +429,43 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, return false; } } + for (; I < NumArgs; I++) { + // Vararg functions can have more arguments than paramters. + assert(Callee->isVarArg()); + if (CB.paramHasAttr(I, Attribute::StructRet)) { + *FailureReason = "SRet arg to vararg function"; + return false; + } + } return true; } -Instruction *llvm::promoteCall(CallSite CS, Function *Callee, - CastInst **RetBitCast) { - assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted"); +CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, + CastInst **RetBitCast) { + assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted"); // Set the called function of the call site to be the given callee (but don't // change the type). - cast<CallBase>(CS.getInstruction())->setCalledOperand(Callee); + CB.setCalledOperand(Callee); // Since the call site will no longer be direct, we must clear metadata that // is only appropriate for indirect calls. This includes !prof and !callees // metadata. - CS.getInstruction()->setMetadata(LLVMContext::MD_prof, nullptr); - CS.getInstruction()->setMetadata(LLVMContext::MD_callees, nullptr); + CB.setMetadata(LLVMContext::MD_prof, nullptr); + CB.setMetadata(LLVMContext::MD_callees, nullptr); // If the function type of the call site matches that of the callee, no // additional work is required. - if (CS.getFunctionType() == Callee->getFunctionType()) - return CS.getInstruction(); + if (CB.getFunctionType() == Callee->getFunctionType()) + return CB; // Save the return types of the call site and callee. - Type *CallSiteRetTy = CS.getInstruction()->getType(); + Type *CallSiteRetTy = CB.getType(); Type *CalleeRetTy = Callee->getReturnType(); // Change the function type of the call site the match that of the callee. - CS.mutateFunctionType(Callee->getFunctionType()); + CB.mutateFunctionType(Callee->getFunctionType()); // Inspect the arguments of the call site. If an argument's type doesn't // match the corresponding formal argument's type in the callee, bitcast it @@ -395,19 +474,18 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee, auto CalleeParamNum = CalleeType->getNumParams(); LLVMContext &Ctx = Callee->getContext(); - const AttributeList &CallerPAL = CS.getAttributes(); + const AttributeList &CallerPAL = CB.getAttributes(); // The new list of argument attributes. 
SmallVector<AttributeSet, 4> NewArgAttrs; bool AttributeChanged = false; for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) { - auto *Arg = CS.getArgument(ArgNo); + auto *Arg = CB.getArgOperand(ArgNo); Type *FormalTy = CalleeType->getParamType(ArgNo); Type *ActualTy = Arg->getType(); if (FormalTy != ActualTy) { - auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", - CS.getInstruction()); - CS.setArgument(ArgNo, Cast); + auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB); + CB.setArgOperand(ArgNo, Cast); // Remove any incompatible attributes for the argument. AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo)); @@ -432,30 +510,89 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee, // Remove any incompatible return value attribute. AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { - createRetBitCast(CS, CallSiteRetTy, RetBitCast); + createRetBitCast(CB, CallSiteRetTy, RetBitCast); RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); AttributeChanged = true; } // Set the new callsite attribute. if (AttributeChanged) - CS.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(), + CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(), AttributeSet::get(Ctx, RAttrs), NewArgAttrs)); - return CS.getInstruction(); + return CB; } -Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee, - MDNode *BranchWeights) { +CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee, + MDNode *BranchWeights) { // Version the indirect call site. If the called value is equal to the given // callee, 'NewInst' will be executed, otherwise the original call site will // be executed. - Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights); + CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights); // Promote 'NewInst' so that it directly calls the desired function. - return promoteCall(CallSite(NewInst), Callee); + return promoteCall(NewInst, Callee); +} + +bool llvm::tryPromoteCall(CallBase &CB) { + assert(!CB.getCalledFunction()); + Module *M = CB.getCaller()->getParent(); + const DataLayout &DL = M->getDataLayout(); + Value *Callee = CB.getCalledOperand(); + + LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee); + if (!VTableEntryLoad) + return false; // Not a vtable entry load. + Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand(); + APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0); + Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets( + DL, VTableOffset, /* AllowNonInbounds */ true); + LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr); + if (!VTablePtrLoad) + return false; // Not a vtable load. + Value *Object = VTablePtrLoad->getPointerOperand(); + APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0); + Value *ObjectBase = Object->stripAndAccumulateConstantOffsets( + DL, ObjectOffset, /* AllowNonInbounds */ true); + if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0)) + // Not an Alloca or the offset isn't zero. + return false; + + // Look for the vtable pointer store into the object by the ctor. + BasicBlock::iterator BBI(VTablePtrLoad); + Value *VTablePtr = FindAvailableLoadedValue( + VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr); + if (!VTablePtr) + return false; // No vtable found. 
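+  // Illustrative IR shape this walk is matching (a sketch: value names and the
+  // vtable layout are invented, and real front-end output typically carries
+  // extra bitcasts/GEPs that stripAndAccumulateConstantOffsets looks through):
+  //   %obj    = alloca %class.A                   ; ObjectBase
+  //   store ... @vtable.for.A ..., ... %obj       ; vtable store by the ctor
+  //   %vtable = load ..., ... %obj                ; VTablePtrLoad
+  //   %slot   = getelementptr ..., %vtable, ...   ; VTableEntryPtr
+  //   %vfn    = load ..., ... %slot               ; VTableEntryLoad (the callee)
+  //   call ... %vfn(...)                          ; CB, promoted to the function
+  //                                               ; found in @vtable.for.A's
+  //                                               ; initializer at that offset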
+ APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0); + Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets( + DL, VTableOffsetGVBase, /* AllowNonInbounds */ true); + GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase); + if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer())) + // Not in the form of a global constant variable with an initializer. + return false; + + Constant *VTableGVInitializer = GV->getInitializer(); + APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset; + if (!(VTableGVOffset.getActiveBits() <= 64)) + return false; // Out of range. + Constant *Ptr = getPointerAtOffset(VTableGVInitializer, + VTableGVOffset.getZExtValue(), + *M); + if (!Ptr) + return false; // No constant (function) pointer found. + Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts()); + if (!DirectCallee) + return false; // No function pointer found. + + if (!isLegalToPromote(CB, DirectCallee)) + return false; + + // Success. + promoteCall(CB, DirectCallee); + return true; } #undef DEBUG_TYPE diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp new file mode 100644 index 0000000000000..1ae17c64b8f6d --- /dev/null +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -0,0 +1,250 @@ +//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass canonicalizes freeze instructions in a loop by pushing them out to +// the preheader. +// +// loop: +// i = phi init, i.next +// i.next = add nsw i, 1 +// i.next.fr = freeze i.next // push this out of this loop +// use(i.next.fr) +// br i1 (i.next <= N), loop, exit +// => +// init.fr = freeze init +// loop: +// i = phi init.fr, i.next +// i.next = add i, 1 // nsw is dropped here +// use(i.next) +// br i1 (i.next <= N), loop, exit +// +// Removing freezes from these chains help scalar evolution successfully analyze +// expressions. 
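+// (Usage note, assumed rather than taken from this commit: the pass is
+// registered under the name "canon-freeze" below, so it can presumably be
+// exercised in isolation with `opt -canon-freeze` under the legacy pass
+// manager or `opt -passes=canon-freeze` with the new pass manager.)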
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils.h" + +using namespace llvm; + +#define DEBUG_TYPE "canon-freeze" + +namespace { + +class CanonicalizeFreezeInLoops : public LoopPass { +public: + static char ID; + + CanonicalizeFreezeInLoops(); + +private: + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +class CanonicalizeFreezeInLoopsImpl { + Loop *L; + ScalarEvolution &SE; + DominatorTree &DT; + + struct FrozenIndPHIInfo { + // A freeze instruction that uses an induction phi + FreezeInst *FI = nullptr; + // The induction phi, step instruction, the operand idx of StepInst which is + // a step value + PHINode *PHI; + BinaryOperator *StepInst; + unsigned StepValIdx = 0; + + FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst) + : PHI(PHI), StepInst(StepInst) {} + }; + + // Can freeze instruction be pushed into operands of I? + // In order to do this, I should not create a poison after I's flags are + // stripped. + bool canHandleInst(const Instruction *I) { + auto Opc = I->getOpcode(); + // If add/sub/mul, drop nsw/nuw flags. + return Opc == Instruction::Add || Opc == Instruction::Sub || + Opc == Instruction::Mul; + } + + void InsertFreezeAndForgetFromSCEV(Use &U); + +public: + CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT) + : L(L), SE(SE), DT(DT) {} + bool run(); +}; + +} // anonymous namespace + +// Given U = (value, user), replace value with freeze(value), and let +// SCEV forget user. The inserted freeze is placed in the preheader. +void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) { + auto *PH = L->getLoopPreheader(); + + auto *UserI = cast<Instruction>(U.getUser()); + auto *ValueToFr = U.get(); + assert(L->contains(UserI->getParent()) && + "Should not process an instruction that isn't inside the loop"); + if (isGuaranteedNotToBeUndefOrPoison(ValueToFr, UserI, &DT)) + return; + + LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n"); + LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n"); + LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n"); + + U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen", + PH->getTerminator())); + + SE.forgetValue(UserI); +} + +bool CanonicalizeFreezeInLoopsImpl::run() { + // The loop should be in LoopSimplify form. + if (!L->isLoopSimplifyForm()) + return false; + + SmallVector<FrozenIndPHIInfo, 4> Candidates; + + for (auto &PHI : L->getHeader()->phis()) { + InductionDescriptor ID; + if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID)) + continue; + + LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n"); + FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp()); + if (!Info.StepInst || !canHandleInst(Info.StepInst)) { + // The stepping instruction has unknown form. + // Ignore this PHI. 
+ continue; + } + + Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI; + Value *StepV = Info.StepInst->getOperand(Info.StepValIdx); + if (auto *StepI = dyn_cast<Instruction>(StepV)) { + if (L->contains(StepI->getParent())) { + // The step value is inside the loop. Freezing step value will introduce + // another freeze into the loop, so skip this PHI. + continue; + } + } + + auto Visit = [&](User *U) { + if (auto *FI = dyn_cast<FreezeInst>(U)) { + LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n"); + Info.FI = FI; + Candidates.push_back(Info); + } + }; + for_each(PHI.users(), Visit); + for_each(Info.StepInst->users(), Visit); + } + + if (Candidates.empty()) + return false; + + SmallSet<PHINode *, 8> ProcessedPHIs; + for (const auto &Info : Candidates) { + PHINode *PHI = Info.PHI; + if (!ProcessedPHIs.insert(Info.PHI).second) + continue; + + BinaryOperator *StepI = Info.StepInst; + assert(StepI && "Step instruction should have been found"); + + // Drop flags from the step instruction. + if (!isGuaranteedNotToBeUndefOrPoison(StepI, StepI, &DT)) { + LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n"); + StepI->dropPoisonGeneratingFlags(); + SE.forgetValue(StepI); + } + + InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx)); + + unsigned OperandIdx = + PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI); + InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx)); + } + + // Finally, remove the old freeze instructions. + for (const auto &Item : Candidates) { + auto *FI = Item.FI; + LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n"); + SE.forgetValue(FI); + FI->replaceAllUsesWith(FI->getOperand(0)); + FI->eraseFromParent(); + } + + return true; +} + +CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) { + initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry()); +} + +void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreservedID(LoopSimplifyID); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); +} + +bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) { + if (skipLoop(L)) + return false; + + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run(); +} + +PreservedAnalyses +CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run()) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze", + "Canonicalize Freeze Instructions in Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze", + "Canonicalize Freeze Instructions in Loops", false, false) + +Pass *llvm::createCanonicalizeFreezeInLoopsPass() { + return new CanonicalizeFreezeInLoops(); +} + +char CanonicalizeFreezeInLoops::ID = 0; diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp 
b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 75e8963303c24..788983c156903 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -46,7 +46,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, if (BB->hasName()) NewBB->setName(BB->getName() + NameSuffix); - bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; + bool hasCalls = false, hasDynamicAllocas = false; Module *TheModule = F ? F->getParent() : nullptr; // Loop over all instructions, and copy them over. @@ -62,18 +62,15 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { - if (isa<ConstantInt>(AI->getArraySize())) - hasStaticAllocas = true; - else + if (!AI->isStaticAlloca()) { hasDynamicAllocas = true; + } } } if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; - CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && - BB != &BB->getParent()->getEntryBlock(); } return NewBB; } @@ -367,8 +364,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (CodeInfo) - if (auto CS = ImmutableCallSite(&*II)) - if (CS.hasOperandBundles()) + if (auto *CB = dyn_cast<CallBase>(&*II)) + if (CB->hasOperandBundles()) CodeInfo->OperandBundleCallSites.push_back(NewInst); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { @@ -424,8 +421,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, VMap[OldTI] = NewInst; // Add instruction map to value. if (CodeInfo) - if (auto CS = ImmutableCallSite(OldTI)) - if (CS.hasOperandBundles()) + if (auto *CB = dyn_cast<CallBase>(OldTI)) + if (CB->hasOperandBundles()) CodeInfo->OperandBundleCallSites.push_back(NewInst); // Recursively clone any reachable successor blocks. @@ -619,8 +616,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Skip over non-intrinsic callsites, we don't want to remove any nodes from // the CGSCC. - CallSite CS = CallSite(I); - if (CS && CS.getCalledFunction() && !CS.getCalledFunction()->isIntrinsic()) + CallBase *CB = dyn_cast<CallBase>(I); + if (CB && CB->getCalledFunction() && + !CB->getCalledFunction()->isIntrinsic()) continue; // See if this instruction simplifies. @@ -804,8 +802,6 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, // Update LoopInfo. NewLoop->addBasicBlockToLoop(NewBB, *LI); - if (BB == CurLoop->getHeader()) - NewLoop->moveToHeader(NewBB); // Add DominatorTree node. After seeing all blocks, update to correct // IDom. @@ -815,6 +811,11 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, } for (BasicBlock *BB : OrigLoop->getBlocks()) { + // Update loop headers. + Loop *CurLoop = LI->getLoopFor(BB); + if (BB == CurLoop->getHeader()) + LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB])); + // Update DominatorTree. 
BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]), diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 682af4a88d3e5..8cdbb9d356523 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -31,11 +31,14 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -448,18 +451,24 @@ CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, for (User *U : Addr->users()) { IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U); if (IntrInst) { + // We don't model addresses with multiple start/end markers, but the + // markers do not need to be in the region. if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) { - // Do not handle the case where Addr has multiple start markers. if (Info.LifeStart) return {}; Info.LifeStart = IntrInst; + continue; } if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) { if (Info.LifeEnd) return {}; Info.LifeEnd = IntrInst; + continue; } - continue; + // At this point, permit debug uses outside of the region. + // This is fixed in a later call to fixupDebugInfoPostExtraction(). + if (isa<DbgInfoIntrinsic>(IntrInst)) + continue; } // Find untracked uses of the address, bail. if (!definedInRegion(Blocks, U)) @@ -865,10 +874,13 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::NoAlias: case Attribute::NoBuiltin: case Attribute::NoCapture: + case Attribute::NoMerge: case Attribute::NoReturn: case Attribute::NoSync: + case Attribute::NoUndef: case Attribute::None: case Attribute::NonNull: + case Attribute::Preallocated: case Attribute::ReadNone: case Attribute::ReadOnly: case Attribute::Returned: @@ -884,6 +896,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::ZExt: case Attribute::ImmArg: case Attribute::EndAttrKinds: + case Attribute::EmptyKey: + case Attribute::TombstoneKey: continue; // Those attributes should be safe to propagate to the extracted function. 
case Attribute::AlwaysInline: @@ -898,6 +912,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::NonLazyBind: case Attribute::NoRedZone: case Attribute::NoUnwind: + case Attribute::NullPointerIsValid: case Attribute::OptForFuzzing: case Attribute::OptimizeNone: case Attribute::OptimizeForSize: @@ -1120,8 +1135,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, GetElementPtrInst *GEP = GetElementPtrInst::Create( StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); codeReplacer->getInstList().push_back(GEP); - StoreInst *SI = new StoreInst(StructValues[i], GEP); - codeReplacer->getInstList().push_back(SI); + new StoreInst(StructValues[i], GEP, codeReplacer); } } @@ -1164,9 +1178,9 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, Output = ReloadOutputs[i]; } LoadInst *load = new LoadInst(outputs[i]->getType(), Output, - outputs[i]->getName() + ".reload"); + outputs[i]->getName() + ".reload", + codeReplacer); Reloads.push_back(load); - codeReplacer->getInstList().push_back(load); std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end()); for (unsigned u = 0, e = Users.size(); u != e; ++u) { Instruction *inst = cast<Instruction>(Users[u]); @@ -1351,6 +1365,9 @@ void CodeExtractor::calculateNewCallTerminatorWeights( // Block Frequency distribution with dummy node. Distribution BranchDist; + SmallVector<BranchProbability, 4> EdgeProbabilities( + TI->getNumSuccessors(), BranchProbability::getUnknown()); + // Add each of the frequencies of the successors. for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) { BlockNode ExitNode(i); @@ -1358,12 +1375,14 @@ void CodeExtractor::calculateNewCallTerminatorWeights( if (ExitFreq != 0) BranchDist.addExit(ExitNode, ExitFreq); else - BPI->setEdgeProbability(CodeReplacer, i, BranchProbability::getZero()); + EdgeProbabilities[i] = BranchProbability::getZero(); } // Check for no total weight. - if (BranchDist.Total == 0) + if (BranchDist.Total == 0) { + BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities); return; + } // Normalize the distribution so that they can fit in unsigned. BranchDist.normalize(); @@ -1375,13 +1394,133 @@ void CodeExtractor::calculateNewCallTerminatorWeights( // Get the weight and update the current BFI. BranchWeights[Weight.TargetNode.Index] = Weight.Amount; BranchProbability BP(Weight.Amount, BranchDist.Total); - BPI->setEdgeProbability(CodeReplacer, Weight.TargetNode.Index, BP); + EdgeProbabilities[Weight.TargetNode.Index] = BP; } + BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities); TI->setMetadata( LLVMContext::MD_prof, MDBuilder(TI->getContext()).createBranchWeights(BranchWeights)); } +/// Erase debug info intrinsics which refer to values in \p F but aren't in +/// \p F. +static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) { + for (Instruction &I : instructions(F)) { + SmallVector<DbgVariableIntrinsic *, 4> DbgUsers; + findDbgUsers(DbgUsers, &I); + for (DbgVariableIntrinsic *DVI : DbgUsers) + if (DVI->getFunction() != &F) + DVI->eraseFromParent(); + } +} + +/// Fix up the debug info in the old and new functions by pointing line +/// locations and debug intrinsics to the new subprogram scope, and by deleting +/// intrinsics which point to values outside of the new function. 
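+// For illustration (a sketch; the metadata ids are invented): a
+//   call void @llvm.dbg.value(metadata i32 %x, metadata !10, metadata !DIExpression())
+// that moved into the extracted function gets !10, whose scope is the old
+// DISubprogram, replaced by a fresh DILocalVariable scoped to the new one;
+// instruction line locations are likewise re-parented, while debug intrinsics
+// whose location operand still lives in the old function are deleted.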
+static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, + CallInst &TheCall) { + DISubprogram *OldSP = OldFunc.getSubprogram(); + LLVMContext &Ctx = OldFunc.getContext(); + + if (!OldSP) { + // Erase any debug info the new function contains. + stripDebugInfo(NewFunc); + // Make sure the old function doesn't contain any non-local metadata refs. + eraseDebugIntrinsicsWithNonLocalRefs(NewFunc); + return; + } + + // Create a subprogram for the new function. Leave out a description of the + // function arguments, as the parameters don't correspond to anything at the + // source level. + assert(OldSP->getUnit() && "Missing compile unit for subprogram"); + DIBuilder DIB(*OldFunc.getParent(), /*AllowUnresolvedNodes=*/false, + OldSP->getUnit()); + auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None)); + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition | + DISubprogram::SPFlagOptimized | + DISubprogram::SPFlagLocalToUnit; + auto NewSP = DIB.createFunction( + OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(), + /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags); + NewFunc.setSubprogram(NewSP); + + // Debug intrinsics in the new function need to be updated in one of two + // ways: + // 1) They need to be deleted, because they describe a value in the old + // function. + // 2) They need to point to fresh metadata, e.g. because they currently + // point to a variable in the wrong scope. + SmallDenseMap<DINode *, DINode *> RemappedMetadata; + SmallVector<Instruction *, 4> DebugIntrinsicsToDelete; + for (Instruction &I : instructions(NewFunc)) { + auto *DII = dyn_cast<DbgInfoIntrinsic>(&I); + if (!DII) + continue; + + // Point the intrinsic to a fresh label within the new function. + if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) { + DILabel *OldLabel = DLI->getLabel(); + DINode *&NewLabel = RemappedMetadata[OldLabel]; + if (!NewLabel) + NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(), + OldLabel->getFile(), OldLabel->getLine()); + DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel)); + continue; + } + + // If the location isn't a constant or an instruction, delete the + // intrinsic. + auto *DVI = cast<DbgVariableIntrinsic>(DII); + Value *Location = DVI->getVariableLocation(); + if (!Location || + (!isa<Constant>(Location) && !isa<Instruction>(Location))) { + DebugIntrinsicsToDelete.push_back(DVI); + continue; + } + + // If the variable location is an instruction but isn't in the new + // function, delete the intrinsic. + Instruction *LocationInst = dyn_cast<Instruction>(Location); + if (LocationInst && LocationInst->getFunction() != &NewFunc) { + DebugIntrinsicsToDelete.push_back(DVI); + continue; + } + + // Point the intrinsic to a fresh variable within the new function. + DILocalVariable *OldVar = DVI->getVariable(); + DINode *&NewVar = RemappedMetadata[OldVar]; + if (!NewVar) + NewVar = DIB.createAutoVariable( + NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(), + OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero, + OldVar->getAlignInBits()); + DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar)); + } + for (auto *DII : DebugIntrinsicsToDelete) + DII->eraseFromParent(); + DIB.finalizeSubprogram(NewSP); + + // Fix up the scope information attached to the line locations in the new + // function. 
+ for (Instruction &I : instructions(NewFunc)) { + if (const DebugLoc &DL = I.getDebugLoc()) + I.setDebugLoc(DebugLoc::get(DL.getLine(), DL.getCol(), NewSP)); + + // Loop info metadata may contain line locations. Fix them up. + auto updateLoopInfoLoc = [&Ctx, + NewSP](const DILocation &Loc) -> DILocation * { + return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP, + nullptr); + }; + updateLoopMetadataDebugLocations(I, updateLoopInfoLoc); + } + if (!TheCall.getDebugLoc()) + TheCall.setDebugLoc(DebugLoc::get(0, 0, OldSP)); + + eraseDebugIntrinsicsWithNonLocalRefs(NewFunc); +} + Function * CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { if (!isEligible()) @@ -1405,13 +1544,19 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { } } - if (AC) { - // Remove @llvm.assume calls that were moved to the new function from the - // old function's assumption cache. - for (BasicBlock *Block : Blocks) - for (auto &I : *Block) - if (match(&I, m_Intrinsic<Intrinsic::assume>())) - AC->unregisterAssumption(cast<CallInst>(&I)); + // Remove @llvm.assume calls that will be moved to the new function from the + // old function's assumption cache. + for (BasicBlock *Block : Blocks) { + for (auto It = Block->begin(), End = Block->end(); It != End;) { + Instruction *I = &*It; + ++It; + + if (match(I, m_Intrinsic<Intrinsic::assume>())) { + if (AC) + AC->unregisterAssumption(cast<CallInst>(I)); + I->eraseFromParent(); + } + } } // If we have any return instructions in the region, split those blocks so @@ -1567,26 +1712,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { } } - // Erase debug info intrinsics. Variable updates within the new function are - // invisible to debuggers. This could be improved by defining a DISubprogram - // for the new function. - for (BasicBlock &BB : *newFunction) { - auto BlockIt = BB.begin(); - // Remove debug info intrinsics from the new function. - while (BlockIt != BB.end()) { - Instruction *Inst = &*BlockIt; - ++BlockIt; - if (isa<DbgInfoIntrinsic>(Inst)) - Inst->eraseFromParent(); - } - // Remove debug info intrinsics which refer to values in the new function - // from the old function. - SmallVector<DbgVariableIntrinsic *, 4> DbgUsers; - for (Instruction &I : BB) - findDbgUsers(DbgUsers, &I); - for (DbgVariableIntrinsic *DVI : DbgUsers) - DVI->eraseFromParent(); - } + fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall); // Mark the new function `noreturn` if applicable. Terminators which resume // exception propagation are treated as returning instructions. 
This is to @@ -1604,17 +1730,36 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { }); LLVM_DEBUG(if (verifyFunction(*oldFunction)) report_fatal_error("verification of oldFunction failed!")); - LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC)) - report_fatal_error("Stale Asumption cache for old Function!")); + LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC)) + report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; } -bool CodeExtractor::verifyAssumptionCache(const Function& F, +bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc, + const Function &NewFunc, AssumptionCache *AC) { for (auto AssumeVH : AC->assumptions()) { - CallInst *I = cast<CallInst>(AssumeVH); - if (I->getFunction() != &F) + CallInst *I = dyn_cast_or_null<CallInst>(AssumeVH); + if (!I) + continue; + + // There shouldn't be any llvm.assume intrinsics in the new function. + if (I->getFunction() != &OldFunc) return true; + + // There shouldn't be any stale affected values in the assumption cache + // that were previously in the old function, but that have now been moved + // to the new function. + for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) { + CallInst *AffectedCI = dyn_cast_or_null<CallInst>(AffectedValVH); + if (!AffectedCI) + continue; + if (AffectedCI->getFunction() != &OldFunc) + return true; + auto *AssumedInst = dyn_cast<Instruction>(AffectedCI->getOperand(0)); + if (AssumedInst->getFunction() != &OldFunc) + return true; + } } return false; } diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp index 93395ac761ab5..08047dc0f96ee 100644 --- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp +++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" @@ -30,6 +31,201 @@ STATISTIC(NotControlFlowEquivalent, STATISTIC(NotMovedPHINode, "Movement of PHINodes are not supported"); STATISTIC(NotMovedTerminator, "Movement of Terminator are not supported"); +namespace { +/// Represent a control condition. A control condition is a condition of a +/// terminator to decide which successors to execute. The pointer field +/// represents the address of the condition of the terminator. The integer field +/// is a bool, it is true when the basic block is executed when V is true. For +/// example, `br %cond, bb0, bb1` %cond is a control condition of bb0 with the +/// integer field equals to true, while %cond is a control condition of bb1 with +/// the integer field equals to false. +using ControlCondition = PointerIntPair<Value *, 1, bool>; +#ifndef NDEBUG +raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) { + OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false") + << "]"; + return OS; +} +#endif + +/// Represent a set of control conditions required to execute ToBB from FromBB. +class ControlConditions { + using ConditionVectorTy = SmallVector<ControlCondition, 6>; + + /// A SmallVector of control conditions. + ConditionVectorTy Conditions; + +public: + /// Return a ControlConditions which stores all conditions required to execute + /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the + /// number of conditions to collect. 
Return None if not all conditions are + /// collected successfully, or we hit the limit. + static const Optional<ControlConditions> + collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator, + const DominatorTree &DT, + const PostDominatorTree &PDT, + unsigned MaxLookup = 6); + + /// Return true if there exists no control conditions required to execute ToBB + /// from FromBB. + bool isUnconditional() const { return Conditions.empty(); } + + /// Return a constant reference of Conditions. + const ConditionVectorTy &getControlConditions() const { return Conditions; } + + /// Add \p V as one of the ControlCondition in Condition with IsTrueCondition + /// equals to \p True. Return true if inserted successfully. + bool addControlCondition(ControlCondition C); + + /// Return true if for all control conditions in Conditions, there exists an + /// equivalent control condition in \p Other.Conditions. + bool isEquivalent(const ControlConditions &Other) const; + + /// Return true if \p C1 and \p C2 are equivalent. + static bool isEquivalent(const ControlCondition &C1, + const ControlCondition &C2); + +private: + ControlConditions() = default; + + static bool isEquivalent(const Value &V1, const Value &V2); + static bool isInverse(const Value &V1, const Value &V2); +}; +} // namespace + +static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA, + const Instruction *InstB) { + // Use ordered basic block in case the 2 instructions are in the same + // block. + if (InstA->getParent() == InstB->getParent()) + return InstA->comesBefore(InstB); + + DomTreeNode *DA = DT->getNode(InstA->getParent()); + DomTreeNode *DB = DT->getNode(InstB->getParent()); + return DA->getLevel() < DB->getLevel(); +} + +const Optional<ControlConditions> ControlConditions::collectControlConditions( + const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT, + const PostDominatorTree &PDT, unsigned MaxLookup) { + assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB"); + + ControlConditions Conditions; + unsigned NumConditions = 0; + + // BB is executed unconditional from itself. + if (&Dominator == &BB) + return Conditions; + + const BasicBlock *CurBlock = &BB; + // Walk up the dominator tree from the associated DT node for BB to the + // associated DT node for Dominator. + do { + assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock"); + BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock(); + assert(DT.dominates(&Dominator, IDom) && + "Expecting Dominator to dominate IDom"); + + // Limitation: can only handle branch instruction currently. 
+ const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator()); + if (!BI) + return None; + + bool Inserted = false; + if (PDT.dominates(CurBlock, IDom)) { + LLVM_DEBUG(dbgs() << CurBlock->getName() + << " is executed unconditionally from " + << IDom->getName() << "\n"); + } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) { + LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \"" + << *BI->getCondition() << "\" is true from " + << IDom->getName() << "\n"); + Inserted = Conditions.addControlCondition( + ControlCondition(BI->getCondition(), true)); + } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) { + LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \"" + << *BI->getCondition() << "\" is false from " + << IDom->getName() << "\n"); + Inserted = Conditions.addControlCondition( + ControlCondition(BI->getCondition(), false)); + } else + return None; + + if (Inserted) + ++NumConditions; + + if (MaxLookup != 0 && NumConditions > MaxLookup) + return None; + + CurBlock = IDom; + } while (CurBlock != &Dominator); + + return Conditions; +} + +bool ControlConditions::addControlCondition(ControlCondition C) { + bool Inserted = false; + if (none_of(Conditions, [&](ControlCondition &Exists) { + return ControlConditions::isEquivalent(C, Exists); + })) { + Conditions.push_back(C); + Inserted = true; + } + + LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n"); + return Inserted; +} + +bool ControlConditions::isEquivalent(const ControlConditions &Other) const { + if (Conditions.empty() && Other.Conditions.empty()) + return true; + + if (Conditions.size() != Other.Conditions.size()) + return false; + + return all_of(Conditions, [&](const ControlCondition &C) { + return any_of(Other.Conditions, [&](const ControlCondition &OtherC) { + return ControlConditions::isEquivalent(C, OtherC); + }); + }); +} + +bool ControlConditions::isEquivalent(const ControlCondition &C1, + const ControlCondition &C2) { + if (C1.getInt() == C2.getInt()) { + if (isEquivalent(*C1.getPointer(), *C2.getPointer())) + return true; + } else if (isInverse(*C1.getPointer(), *C2.getPointer())) + return true; + + return false; +} + +// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between +// Values. +// Currently, isEquivalent rely on other passes to ensure equivalent conditions +// have the same value, e.g. GVN. 
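For reference, the isInverse helper defined just below pairs a condition with its logical negation in two forms: the same operands under the inverse predicate, or swapped operands under the swapped inverse predicate. A minimal standalone sketch of the predicate relationships it relies on (illustrative only, not part of this patch; the function name is made up):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// icmp slt a, b is the inverse of icmp sge a, b (same operand order) and of
// icmp sle b, a (operands swapped), which is exactly what isInverse checks.
bool inversePredicateRelationsHold() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  bool SameOrder = CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE;
  bool Swapped =
      CmpInst::getSwappedPredicate(CmpInst::getInversePredicate(P)) ==
      CmpInst::ICMP_SLE;
  return SameOrder && Swapped;
}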
+bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) { + return &V1 == &V2; +} + +bool ControlConditions::isInverse(const Value &V1, const Value &V2) { + if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1)) + if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) { + if (Cmp1->getPredicate() == Cmp2->getInversePredicate() && + Cmp1->getOperand(0) == Cmp2->getOperand(0) && + Cmp1->getOperand(1) == Cmp2->getOperand(1)) + return true; + + if (Cmp1->getPredicate() == + CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) && + Cmp1->getOperand(0) == Cmp2->getOperand(1) && + Cmp1->getOperand(1) == Cmp2->getOperand(0)) + return true; + } + return false; +} + bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1, const DominatorTree &DT, const PostDominatorTree &PDT) { @@ -42,8 +238,30 @@ bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1, if (&BB0 == &BB1) return true; - return ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) || - (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0))); + if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) || + (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0))) + return true; + + // If the set of conditions required to execute BB0 and BB1 from their common + // dominator are the same, then BB0 and BB1 are control flow equivalent. + const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1); + LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName() + << " and " << BB1.getName() << " is " + << CommonDominator->getName() << "\n"); + + const Optional<ControlConditions> BB0Conditions = + ControlConditions::collectControlConditions(BB0, *CommonDominator, DT, + PDT); + if (BB0Conditions == None) + return false; + + const Optional<ControlConditions> BB1Conditions = + ControlConditions::collectControlConditions(BB1, *CommonDominator, DT, + PDT); + if (BB1Conditions == None) + return false; + + return BB0Conditions->isEquivalent(*BB1Conditions); } static bool reportInvalidCandidate(const Instruction &I, @@ -90,9 +308,12 @@ collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst, } bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, - const DominatorTree &DT, - const PostDominatorTree &PDT, - DependenceInfo &DI) { + DominatorTree &DT, const PostDominatorTree *PDT, + DependenceInfo *DI) { + // Skip tests when we don't have PDT or DI + if (!PDT || !DI) + return false; + // Cannot move itself before itself. if (&I == &InsertPoint) return false; @@ -108,28 +329,22 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, return reportInvalidCandidate(I, NotMovedTerminator); // TODO remove this limitation. - if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT)) + if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT)) return reportInvalidCandidate(I, NotControlFlowEquivalent); - // As I and InsertPoint are control flow equivalent, if I dominates - // InsertPoint, then I comes before InsertPoint. - const bool MoveForward = DT.dominates(&I, &InsertPoint); - if (MoveForward) { - // When I is being moved forward, we need to make sure the InsertPoint - // dominates every users. Or else, a user may be using an undefined I. 
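The reworked isSafeToMoveBefore above now takes the post-dominator tree and dependence info by pointer and conservatively answers false when either is missing. A minimal caller sketch under that assumption (the wrapper name is hypothetical; the signature is the one introduced in this patch):

#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
using namespace llvm;

// Returns true if I can be moved to just before InsertPoint without breaking
// dominance or dependences; passing null PDT/DI would now simply yield false.
bool canMove(Instruction &I, Instruction &InsertPoint, DominatorTree &DT,
             PostDominatorTree &PDT, DependenceInfo &DI) {
  return isSafeToMoveBefore(I, InsertPoint, DT, &PDT, &DI);
}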
+ if (!DT.dominates(&InsertPoint, &I)) for (const Use &U : I.uses()) if (auto *UserInst = dyn_cast<Instruction>(U.getUser())) if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U)) return false; - } else { - // When I is being moved backward, we need to make sure all its opernads - // dominates the InsertPoint. Or else, an operand may be undefined for I. + if (!DT.dominates(&I, &InsertPoint)) for (const Value *Op : I.operands()) if (auto *OpInst = dyn_cast<Instruction>(Op)) if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint)) return false; - } + DT.updateDFSNumbers(); + const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint); Instruction &StartInst = (MoveForward ? I : InsertPoint); Instruction &EndInst = (MoveForward ? InsertPoint : I); SmallPtrSet<Instruction *, 10> InstsToCheck; @@ -162,7 +377,7 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, // StartInst to \p EndInst. if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), [&DI, &I](Instruction *CurInst) { - auto DepResult = DI.depends(&I, CurInst, true); + auto DepResult = DI->depends(&I, CurInst, true); if (DepResult && (DepResult->isOutput() || DepResult->isFlow() || DepResult->isAnti())) @@ -174,16 +389,40 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, return true; } -void llvm::moveInstsBottomUp(BasicBlock &FromBB, BasicBlock &ToBB, - const DominatorTree &DT, - const PostDominatorTree &PDT, DependenceInfo &DI) { +bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint, + DominatorTree &DT, const PostDominatorTree *PDT, + DependenceInfo *DI) { + return llvm::all_of(BB, [&](Instruction &I) { + if (BB.getTerminator() == &I) + return true; + + return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI); + }); +} + +void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB, + DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) { Instruction *MovePos = ToBB.getFirstNonPHIOrDbg(); Instruction &I = *It; // Increment the iterator before modifying FromBB. ++It; - if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI)) + if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI)) + I.moveBefore(MovePos); + } +} + +void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB, + DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { + Instruction *MovePos = ToBB.getTerminator(); + while (FromBB.size() > 1) { + Instruction &I = FromBB.front(); + if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI)) I.moveBefore(MovePos); } } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index b7b4bfa3734d0..8f98d81a3d797 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -30,6 +30,17 @@ namespace { cl::opt<bool> Quiet("debugify-quiet", cl::desc("Suppress verbose debugify output")); +enum class Level { + Locations, + LocationsAndVariables +}; +cl::opt<Level> DebugifyLevel( + "debugify-level", cl::desc("Kind of debug info to add"), + cl::values(clEnumValN(Level::Locations, "locations", "Locations only"), + clEnumValN(Level::LocationsAndVariables, "location+variables", + "Locations and Variables")), + cl::init(Level::LocationsAndVariables)); + raw_ostream &dbg() { return Quiet ? 
nulls() : errs(); } uint64_t getAllocSizeInBits(Module &M, Type *Ty) { @@ -51,10 +62,11 @@ Instruction *findTerminatingInstruction(BasicBlock &BB) { return I; return BB.getTerminator(); } +} // end anonymous namespace -bool applyDebugifyMetadata(Module &M, - iterator_range<Module::iterator> Functions, - StringRef Banner) { +bool llvm::applyDebugifyMetadata( + Module &M, iterator_range<Module::iterator> Functions, StringRef Banner, + std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) { // Skip modules with debug info. if (M.getNamedMetadata("llvm.dbg.cu")) { dbg() << Banner << "Skipping module with debug info\n"; @@ -63,6 +75,7 @@ bool applyDebugifyMetadata(Module &M, DIBuilder DIB(M); LLVMContext &Ctx = M.getContext(); + auto *Int32Ty = Type::getInt32Ty(Ctx); // Get a DIType which corresponds to Ty. DenseMap<uint64_t, DIType *> TypeCache; @@ -87,6 +100,7 @@ bool applyDebugifyMetadata(Module &M, if (isFunctionSkipped(F)) continue; + bool InsertedDbgVal = false; auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None)); DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized; @@ -95,11 +109,31 @@ bool applyDebugifyMetadata(Module &M, auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, SPType, NextLine, DINode::FlagZero, SPFlags); F.setSubprogram(SP); + + // Helper that inserts a dbg.value before \p InsertBefore, copying the + // location (and possibly the type, if it's non-void) from \p TemplateInst. + auto insertDbgVal = [&](Instruction &TemplateInst, + Instruction *InsertBefore) { + std::string Name = utostr(NextVar++); + Value *V = &TemplateInst; + if (TemplateInst.getType()->isVoidTy()) + V = ConstantInt::get(Int32Ty, 0); + const DILocation *Loc = TemplateInst.getDebugLoc().get(); + auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(), + getCachedDIType(V->getType()), + /*AlwaysPreserve=*/true); + DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc, + InsertBefore); + }; + for (BasicBlock &BB : F) { // Attach debug locations. for (Instruction &I : BB) I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP)); + if (DebugifyLevel < Level::LocationsAndVariables) + continue; + // Inserting debug values into EH pads can break IR invariants. if (BB.isEHPad()) continue; @@ -126,25 +160,30 @@ bool applyDebugifyMetadata(Module &M, if (!isa<PHINode>(I) && !I->isEHPad()) InsertBefore = I->getNextNode(); - std::string Name = utostr(NextVar++); - const DILocation *Loc = I->getDebugLoc().get(); - auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(), - getCachedDIType(I->getType()), - /*AlwaysPreserve=*/true); - DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc, - InsertBefore); + insertDbgVal(*I, InsertBefore); + InsertedDbgVal = true; } } + // Make sure we emit at least one dbg.value, otherwise MachineDebugify may + // not have anything to work with as it goes about inserting DBG_VALUEs. + // (It's common for MIR tests to be written containing skeletal IR with + // empty functions -- we're still interested in debugifying the MIR within + // those tests, and this helps with that.) + if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) { + auto *Term = findTerminatingInstruction(F.getEntryBlock()); + insertDbgVal(*Term, Term); + } + if (ApplyToMF) + ApplyToMF(DIB, F); DIB.finalizeSubprogram(SP); } DIB.finalize(); // Track the number of distinct lines and variables. 
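The debugify utility is now exported as llvm::applyDebugifyMetadata with an extra ApplyToMF customization hook (the surrounding changes mention MachineDebugify as a client). A minimal caller sketch, assuming the declaration is exposed from llvm/Transforms/Utils/Debugify.h and mirroring what the module pass further down does:

#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Debugify.h"
using namespace llvm;

// Attach synthetic !dbg locations (and, at the default -debugify-level,
// synthetic dbg.value variables) to every function in M. The in-tree passes
// pass a null ApplyToMF hook.
bool debugifyWholeModule(Module &M) {
  return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ",
                               /*ApplyToMF=*/nullptr);
}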
NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify"); - auto *IntTy = Type::getInt32Ty(Ctx); auto addDebugifyOperand = [&](unsigned N) { NMD->addOperand(MDNode::get( - Ctx, ValueAsMetadata::getConstant(ConstantInt::get(IntTy, N)))); + Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N)))); }; addDebugifyOperand(NextLine - 1); // Original number of lines. addDebugifyOperand(NextVar - 1); // Original number of variables. @@ -159,6 +198,54 @@ bool applyDebugifyMetadata(Module &M, return true; } +bool llvm::stripDebugifyMetadata(Module &M) { + bool Changed = false; + + // Remove the llvm.debugify module-level named metadata. + NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify"); + if (DebugifyMD) { + M.eraseNamedMetadata(DebugifyMD); + Changed = true; + } + + // Strip out all debug intrinsics and supporting metadata (subprograms, types, + // variables, etc). + Changed |= StripDebugInfo(M); + + // Strip out the dead dbg.value prototype. + Function *DbgValF = M.getFunction("llvm.dbg.value"); + if (DbgValF) { + assert(DbgValF->isDeclaration() && DbgValF->use_empty() && + "Not all debug info stripped?"); + DbgValF->eraseFromParent(); + Changed = true; + } + + // Strip out the module-level Debug Info Version metadata. + // FIXME: There must be an easier way to remove an operand from a NamedMDNode. + NamedMDNode *NMD = M.getModuleFlagsMetadata(); + if (!NMD) + return Changed; + SmallVector<MDNode *, 4> Flags; + for (MDNode *Flag : NMD->operands()) + Flags.push_back(Flag); + NMD->clearOperands(); + for (MDNode *Flag : Flags) { + MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1)); + if (Key->getString() == "Debug Info Version") { + Changed = true; + continue; + } + NMD->addOperand(Flag); + } + // If we left it empty we might as well remove it. + if (NMD->getNumOperands() == 0) + NMD->eraseFromParent(); + + return Changed; +} + +namespace { /// Return true if a mis-sized diagnostic is issued for \p DVI. bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { // The size of a dbg.value's value operand should match the size of the @@ -206,7 +293,7 @@ bool checkDebugifyMetadata(Module &M, // Skip modules without debugify metadata. NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify"); if (!NMD) { - dbg() << Banner << "Skipping module without debugify metadata\n"; + dbg() << Banner << ": Skipping module without debugify metadata\n"; return false; } @@ -233,7 +320,7 @@ bool checkDebugifyMetadata(Module &M, // Find missing lines. for (Instruction &I : instructions(F)) { - if (isa<DbgValueInst>(&I)) + if (isa<DbgValueInst>(&I) || isa<PHINode>(&I)) continue; auto DL = I.getDebugLoc(); @@ -243,11 +330,10 @@ bool checkDebugifyMetadata(Module &M, } if (!DL) { - dbg() << "ERROR: Instruction with empty DebugLoc in function "; + dbg() << "WARNING: Instruction with empty DebugLoc in function "; dbg() << F.getName() << " --"; I.print(dbg()); dbg() << "\n"; - HasErrors = true; } } @@ -287,12 +373,9 @@ bool checkDebugifyMetadata(Module &M, dbg() << " [" << NameOfWrappedPass << "]"; dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n'; - // Strip the Debugify Metadata if required. - if (Strip) { - StripDebugInfo(M); - M.eraseNamedMetadata(NMD); - return true; - } + // Strip debugify metadata if required. + if (Strip) + return stripDebugifyMetadata(M); return false; } @@ -301,7 +384,8 @@ bool checkDebugifyMetadata(Module &M, /// legacy module pass manager. 
struct DebugifyModulePass : public ModulePass { bool runOnModule(Module &M) override { - return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + return applyDebugifyMetadata(M, M.functions(), + "ModuleDebugify: ", /*ApplyToMF*/ nullptr); } DebugifyModulePass() : ModulePass(ID) {} @@ -320,7 +404,7 @@ struct DebugifyFunctionPass : public FunctionPass { Module &M = *F.getParent(); auto FuncIt = F.getIterator(); return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), - "FunctionDebugify: "); + "FunctionDebugify: ", /*ApplyToMF*/ nullptr); } DebugifyFunctionPass() : FunctionPass(ID) {} @@ -395,7 +479,8 @@ FunctionPass *createDebugifyFunctionPass() { } PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + applyDebugifyMetadata(M, M.functions(), + "ModuleDebugify: ", /*ApplyToMF*/ nullptr); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 651f776a4915b..f84ff9e5aad1d 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -11,6 +11,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp index 914babeb6829d..cae9d9ee6d709 100644 --- a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -12,10 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; static FunctionCallee getDefaultPersonalityFn(Module *M) { diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index ad36790b8c6a6..c5dfbf9d92d13 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -196,8 +195,7 @@ evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL, Constant *const IdxList[] = {IdxZero, IdxZero}; Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList); - if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) - Ptr = FoldedPtr; + Ptr = ConstantFoldConstant(Ptr, DL, TLI); } return Val; } @@ -266,33 +264,33 @@ static Function *getFunction(Constant *C) { } Function * -Evaluator::getCalleeWithFormalArgs(CallSite &CS, - SmallVector<Constant *, 8> &Formals) { - auto *V = CS.getCalledValue(); +Evaluator::getCalleeWithFormalArgs(CallBase &CB, + SmallVectorImpl<Constant *> &Formals) { + auto *V = CB.getCalledOperand(); if (auto *Fn = getFunction(getVal(V))) - return getFormalParams(CS, Fn, Formals) ? Fn : nullptr; + return getFormalParams(CB, Fn, Formals) ? 
Fn : nullptr; auto *CE = dyn_cast<ConstantExpr>(V); if (!CE || CE->getOpcode() != Instruction::BitCast || - !getFormalParams(CS, getFunction(CE->getOperand(0)), Formals)) + !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals)) return nullptr; return dyn_cast<Function>( ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL)); } -bool Evaluator::getFormalParams(CallSite &CS, Function *F, - SmallVector<Constant *, 8> &Formals) { +bool Evaluator::getFormalParams(CallBase &CB, Function *F, + SmallVectorImpl<Constant *> &Formals) { if (!F) return false; auto *FTy = F->getFunctionType(); - if (FTy->getNumParams() > CS.getNumArgOperands()) { + if (FTy->getNumParams() > CB.getNumArgOperands()) { LLVM_DEBUG(dbgs() << "Too few arguments for function.\n"); return false; } - auto ArgI = CS.arg_begin(); + auto ArgI = CB.arg_begin(); for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE; ++ParI) { auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL); @@ -339,7 +337,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // no volatile/atomic accesses. } Constant *Ptr = getVal(SI->getOperand(1)); - if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) { + Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI); + if (Ptr != FoldedPtr) { LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); Ptr = FoldedPtr; LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n"); @@ -448,7 +447,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, } Constant *Ptr = getVal(LI->getOperand(0)); - if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) { + Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI); + if (Ptr != FoldedPtr) { Ptr = FoldedPtr; LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant " "folding: " @@ -476,22 +476,22 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, InstResult = AllocaTmps.back().get(); LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { - CallSite CS(&*CurInst); + CallBase &CB = *cast<CallBase>(&*CurInst); // Debug info can safely be ignored here. - if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { + if (isa<DbgInfoIntrinsic>(CB)) { LLVM_DEBUG(dbgs() << "Ignoring debug info.\n"); ++CurInst; continue; } // Cannot handle inline asm. - if (isa<InlineAsm>(CS.getCalledValue())) { + if (CB.isInlineAsm()) { LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); return false; } - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) { if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) { if (MSI->isVolatile()) { LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset " @@ -559,7 +559,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // Resolve function pointers. SmallVector<Constant *, 8> Formals; - Function *Callee = getCalleeWithFormalArgs(CS, Formals); + Function *Callee = getCalleeWithFormalArgs(CB, Formals); if (!Callee || Callee->isInterposable()) { LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n"); return false; // Cannot resolve. @@ -567,9 +567,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. 
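These changes migrate the Evaluator internals from CallSite to CallBase without changing how callers drive it. A rough usage sketch, assuming the public interface in llvm/Transforms/Utils/Evaluator.h (an Evaluator built from DataLayout/TLI plus EvaluateFunction), which is how GlobalOpt evaluates static constructors; the helper name is made up:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Evaluator.h"
using namespace llvm;

// Returns true if F (e.g. a global constructor taking no arguments) can be
// fully constant-evaluated; the return value itself is ignored here.
bool canConstantEvaluate(Function &F, const DataLayout &DL,
                         const TargetLibraryInfo *TLI) {
  Evaluator Eval(DL, TLI);
  Constant *RetVal = nullptr;
  return Eval.EvaluateFunction(&F, RetVal, SmallVector<Constant *, 0>());
}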
- if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), - Callee, Formals, TLI)) { - InstResult = castCallResultIfNeeded(CS.getCalledValue(), C); + if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) { + InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C); if (!InstResult) return false; LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " @@ -592,7 +591,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; } ValueStack.pop_back(); - InstResult = castCallResultIfNeeded(CS.getCalledValue(), RetVal); + InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal); if (RetVal && !InstResult) return false; @@ -648,9 +647,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, } if (!CurInst->use_empty()) { - if (auto *FoldedInstResult = ConstantFoldConstant(InstResult, DL, TLI)) - InstResult = FoldedInstResult; - + InstResult = ConstantFoldConstant(InstResult, DL, TLI); setVal(&*CurInst, InstResult); } diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp new file mode 100644 index 0000000000000..460ba9e97fc6e --- /dev/null +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -0,0 +1,337 @@ +//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks +// with control-flow edges incident from outside the SCC. This pass converts a +// irreducible SCC into a natural loop by applying the following transformation: +// +// 1. Collect the set of headers H of the SCC. +// 2. Collect the set of predecessors P of these headers. These may be inside as +// well as outside the SCC. +// 3. Create block N and redirect every edge from set P to set H through N. +// +// This converts the SCC into a natural loop with N as the header: N is the only +// block with edges incident from outside the SCC, and all backedges in the SCC +// are incident on N, i.e., for every backedge, the head now dominates the tail. +// +// INPUT CFG: The blocks A and B form an irreducible loop with two headers. +// +// Entry +// / \ +// v v +// A ----> B +// ^ /| +// `----' | +// v +// Exit +// +// OUTPUT CFG: Edges incident on A and B are now redirected through a +// new block N, forming a natural loop consisting of N, A and B. +// +// Entry +// | +// v +// .---> N <---. +// / / \ \ +// | / \ | +// \ v v / +// `-- A B --' +// | +// v +// Exit +// +// The transformation is applied to every maximal SCC that is not already +// recognized as a loop. The pass operates on all maximal SCCs found in the +// function body outside of any loop, as well as those found inside each loop, +// including inside any newly created loops. This ensures that any SCC hidden +// inside a maximal SCC is also transformed. +// +// The actual transformation is handled by function CreateControlFlowHub, which +// takes a set of incoming blocks (the predecessors) and outgoing blocks (the +// headers). The function also moves every PHINode in an outgoing block to the +// hub. Since the hub dominates all the outgoing blocks, each such PHINode +// continues to dominate its uses. 
Since every header in an SCC has at least two +// predecessors, every value used in the header (or later) but defined in a +// predecessor (or earlier) is represented by a PHINode in a header. Hence the +// above handling of PHINodes is sufficient and no further processing is +// required to restore SSA. +// +// Limitation: The pass cannot handle switch statements and indirect +// branches. Both must be lowered to plain branches first. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "fix-irreducible" + +using namespace llvm; + +namespace { +struct FixIrreducible : public FunctionPass { + static char ID; + FixIrreducible() : FunctionPass(ID) { + initializeFixIrreduciblePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(LowerSwitchID); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreservedID(LowerSwitchID); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FixIrreducible::ID = 0; + +FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); } + +INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible", + "Convert irreducible control-flow into natural loops", + false /* Only looks at CFG */, false /* Analysis Pass */) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible", + "Convert irreducible control-flow into natural loops", + false /* Only looks at CFG */, false /* Analysis Pass */) + +// When a new loop is created, existing children of the parent loop may now be +// fully inside the new loop. Reconnect these as children of the new loop. +static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop, + SetVector<BasicBlock *> &Blocks, + SetVector<BasicBlock *> &Headers) { + auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector() + : LI.getTopLevelLoopsVector(); + // The new loop cannot be its own child, and any candidate is a + // child iff its header is owned by the new loop. Move all the + // children to a new vector. + auto FirstChild = std::partition( + CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) { + return L == NewLoop || Blocks.count(L->getHeader()) == 0; + }); + SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end()); + CandidateLoops.erase(FirstChild, CandidateLoops.end()); + + for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) { + auto Child = *II; + LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName() + << "\n"); + // TODO: A child loop whose header is also a header in the current + // SCC gets destroyed since its backedges are removed. That may + // not be necessary if we can retain such backedges. 
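A minimal sketch of how this pass might be scheduled with the legacy pass manager (pass creators taken from this file and llvm/Transforms/Utils.h; the driver function itself is hypothetical). LowerSwitch is added explicitly because, as noted above, the pass cannot handle switch terminators:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils.h"
using namespace llvm;

// Convert irreducible control flow in every function of M into natural loops.
bool fixIrreducibleControlFlow(Module &M) {
  legacy::PassManager PM;
  PM.add(createLowerSwitchPass()); // required: branches only, no switches
  PM.add(createFixIrreduciblePass());
  return PM.run(M);
}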
+ if (Headers.count(Child->getHeader())) { + for (auto BB : Child->blocks()) { + LI.changeLoopFor(BB, NewLoop); + LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName() + << "\n"); + } + LI.destroy(Child); + LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n"); + continue; + } + + Child->setParentLoop(nullptr); + NewLoop->addChildLoop(Child); + LLVM_DEBUG(dbgs() << "added child loop to new loop\n"); + } +} + +// Given a set of blocks and headers in an irreducible SCC, convert it into a +// natural loop. Also insert this new loop at its appropriate place in the +// hierarchy of loops. +static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT, + Loop *ParentLoop, + SetVector<BasicBlock *> &Blocks, + SetVector<BasicBlock *> &Headers) { +#ifndef NDEBUG + // All headers are part of the SCC + for (auto H : Headers) { + assert(Blocks.count(H)); + } +#endif + + SetVector<BasicBlock *> Predecessors; + for (auto H : Headers) { + for (auto P : predecessors(H)) { + Predecessors.insert(P); + } + } + + LLVM_DEBUG( + dbgs() << "Found predecessors:"; + for (auto P : Predecessors) { + dbgs() << " " << P->getName(); + } + dbgs() << "\n"); + + // Redirect all the backedges through a "hub" consisting of a series + // of guard blocks that manage the flow of control from the + // predecessors to the headers. + SmallVector<BasicBlock *, 8> GuardBlocks; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr"); +#if defined(EXPENSIVE_CHECKS) + assert(DT.verify(DominatorTree::VerificationLevel::Full)); +#else + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); +#endif + + // Create a new loop from the now-transformed cycle + auto NewLoop = LI.AllocateLoop(); + if (ParentLoop) { + ParentLoop->addChildLoop(NewLoop); + } else { + LI.addTopLevelLoop(NewLoop); + } + + // Add the guard blocks to the new loop. The first guard block is + // the head of all the backedges, and it is the first to be inserted + // in the loop. This ensures that it is recognized as the + // header. Since the new loop is already in LoopInfo, the new blocks + // are also propagated up the chain of parent loops. + for (auto G : GuardBlocks) { + LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n"); + NewLoop->addBasicBlockToLoop(G, LI); + } + + // Add the SCC blocks to the new loop. + for (auto BB : Blocks) { + NewLoop->addBlockEntry(BB); + if (LI.getLoopFor(BB) == ParentLoop) { + LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName() + << "\n"); + LI.changeLoopFor(BB, NewLoop); + } else { + LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n"); + } + } + LLVM_DEBUG(dbgs() << "header for new loop: " + << NewLoop->getHeader()->getName() << "\n"); + + reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers); + + NewLoop->verifyLoop(); + if (ParentLoop) { + ParentLoop->verifyLoop(); + } +#if defined(EXPENSIVE_CHECKS) + LI.verify(DT); +#endif // EXPENSIVE_CHECKS +} + +namespace llvm { +// Enable the graph traits required for traversing a Loop body. +template <> struct GraphTraits<Loop> : LoopBodyTraits {}; +} // namespace llvm + +// Overloaded wrappers to go with the function template below. 
+static BasicBlock *unwrapBlock(BasicBlock *B) { return B; } +static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; } + +static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F, + SetVector<BasicBlock *> &Blocks, + SetVector<BasicBlock *> &Headers) { + createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers); +} + +static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L, + SetVector<BasicBlock *> &Blocks, + SetVector<BasicBlock *> &Headers) { + createNaturalLoopInternal(LI, DT, &L, Blocks, Headers); +} + +// Convert irreducible SCCs; Graph G may be a Function* or a Loop&. +template <class Graph> +static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) { + bool Changed = false; + for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) { + if (Scc->size() < 2) + continue; + SetVector<BasicBlock *> Blocks; + LLVM_DEBUG(dbgs() << "Found SCC:"); + for (auto N : *Scc) { + auto BB = unwrapBlock(N); + LLVM_DEBUG(dbgs() << " " << BB->getName()); + Blocks.insert(BB); + } + LLVM_DEBUG(dbgs() << "\n"); + + // Minor optimization: The SCC blocks are usually discovered in an order + // that is the opposite of the order in which these blocks appear as branch + // targets. This results in a lot of condition inversions in the control + // flow out of the new ControlFlowHub, which can be mitigated if the orders + // match. So we discover the headers using the reverse of the block order. + SetVector<BasicBlock *> Headers; + LLVM_DEBUG(dbgs() << "Found headers:"); + for (auto BB : reverse(Blocks)) { + for (const auto P : predecessors(BB)) { + // Skip unreachable predecessors. + if (!DT.isReachableFromEntry(P)) + continue; + if (!Blocks.count(P)) { + LLVM_DEBUG(dbgs() << " " << BB->getName()); + Headers.insert(BB); + break; + } + } + } + LLVM_DEBUG(dbgs() << "\n"); + + if (Headers.size() == 1) { + assert(LI.isLoopHeader(Headers.front())); + LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n"); + continue; + } + createNaturalLoop(LI, DT, G, Blocks, Headers); + Changed = true; + } + return Changed; +} + +bool FixIrreducible::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: " + << F.getName() << "\n"); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + bool Changed = false; + SmallVector<Loop *, 8> WorkList; + + LLVM_DEBUG(dbgs() << "visiting top-level\n"); + Changed |= makeReducible(LI, DT, &F); + + // Any SCCs reduced are now already in the list of top-level loops, so simply + // add them all to the worklist. + for (auto L : LI) { + WorkList.push_back(L); + } + + while (!WorkList.empty()) { + auto L = WorkList.back(); + WorkList.pop_back(); + LLVM_DEBUG(dbgs() << "visiting loop with header " + << L->getHeader()->getName() << "\n"); + Changed |= makeReducible(LI, DT, *L); + // Any SCCs reduced are now already in the list of child loops, so simply + // add them all to the worklist. 
+ WorkList.append(L->begin(), L->end()); + } + + return Changed; +} diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 893f23eb60482..0098dcaeb07a0 100644 --- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -45,12 +45,12 @@ class FlattenCFGOpt { bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder); /// Compare a pair of blocks: \p Block1 and \p Block2, which - /// are from two if-regions whose entry blocks are \p Head1 and \p - /// Head2. \returns true if \p Block1 and \p Block2 contain identical + /// are from two if-regions, where \p Head2 is the entry block of the 2nd + /// if-region. \returns true if \p Block1 and \p Block2 contain identical /// instructions, and have no memory reference alias with \p Head2. /// This is used as a legality check for merging if-regions. - bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, - BasicBlock *Block1, BasicBlock *Block2); + bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2, + BasicBlock *Head2); public: FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {} @@ -97,7 +97,7 @@ public: /// br label %if.end; /// /// Current implementation handles two cases. -/// Case 1: \param BB is on the else-path. +/// Case 1: BB is on the else-path. /// /// BB1 /// / | @@ -105,7 +105,7 @@ public: /// / \ | /// BB3 \ | where, BB1, BB2 contain conditional branches. /// \ | / BB3 contains unconditional branch. -/// \ | / BB4 corresponds to \param BB which is also the merge. +/// \ | / BB4 corresponds to BB which is also the merge. /// BB => BB4 /// /// @@ -114,14 +114,14 @@ public: /// if (a == b && c == d) /// statement; // BB3 /// -/// Case 2: \param BB BB is on the then-path. +/// Case 2: BB is on the then-path. /// /// BB1 /// / | /// | BB2 /// \ / | where BB1, BB2 contain conditional branches. /// BB => BB3 | BB3 contains unconditiona branch and corresponds -/// \ / to \param BB. BB4 is the merge. +/// \ / to BB. BB4 is the merge. /// BB4 /// /// Corresponding source code: @@ -129,9 +129,9 @@ public: /// if (a == b || c == d) /// statement; // BB3 /// -/// In both cases, \param BB is the common successor of conditional branches. -/// In Case 1, \param BB (BB4) has an unconditional branch (BB3) as -/// its predecessor. In Case 2, \param BB (BB3) only has conditional branches +/// In both cases, BB is the common successor of conditional branches. +/// In Case 1, BB (BB4) has an unconditional branch (BB3) as +/// its predecessor. In Case 2, BB (BB3) only has conditional branches /// as its predecessors. bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { PHINode *PHI = dyn_cast<PHINode>(BB->begin()); @@ -315,25 +315,16 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { return true; } -/// Compare blocks from two if-regions, where \param Head1 is the entry of the -/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param -/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block -// in the 2nd if-region to compare. \returns true if \param Block1 and \param -/// Block2 have identical instructions and do not have memory reference alias -/// with \param Head2. -bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, - BasicBlock *Block1, - BasicBlock *Block2) { +/// Compare blocks from two if-regions, where \param Head2 is the entry of the +/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare. 
+/// \param Block2 is a block in the 2nd if-region to compare. \returns true if +/// Block1 and Block2 have identical instructions and do not have +/// memory reference alias with Head2. +bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2, + BasicBlock *Head2) { Instruction *PTI2 = Head2->getTerminator(); Instruction *PBI2 = &Head2->front(); - bool eq1 = (Block1 == Head1); - bool eq2 = (Block2 == Head2); - if (eq1 || eq2) { - // An empty then-path or else-path. - return (eq1 == eq2); - } - // Check whether instructions in Block1 and Block2 are identical // and do not alias with instructions in Head2. BasicBlock::iterator iter1 = Block1->begin(); @@ -395,6 +386,29 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, /// To: /// if (a || b) /// statement; +/// +/// +/// And from: +/// if (a) +/// ; +/// else +/// statement; +/// if (b) +/// ; +/// else +/// statement; +/// +/// To: +/// if (a && b) +/// ; +/// else +/// statement; +/// +/// We always take the form of the first if-region. This means that if the +/// statement in the first if-region, is in the "then-path", while in the second +/// if-region it is in the "else-path", then we convert the second to the first +/// form, by inverting the condition and the branch successors. The same +/// approach goes for the opposite case. bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { BasicBlock *IfTrue2, *IfFalse2; Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2); @@ -415,22 +429,42 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { BasicBlock *FirstEntryBlock = CInst1->getParent(); // Either then-path or else-path should be empty. - if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock)) - return false; - if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock)) - return false; + bool InvertCond2 = false; + BinaryOperator::BinaryOps CombineOp; + if (IfFalse1 == FirstEntryBlock) { + // The else-path is empty, so we must use "or" operation to combine the + // conditions. + CombineOp = BinaryOperator::Or; + if (IfFalse2 != SecondEntryBlock) { + if (IfTrue2 != SecondEntryBlock) + return false; - Instruction *PTI2 = SecondEntryBlock->getTerminator(); - Instruction *PBI2 = &SecondEntryBlock->front(); + InvertCond2 = true; + std::swap(IfTrue2, IfFalse2); + } - if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, - IfTrue2)) - return false; + if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock)) + return false; + } else if (IfTrue1 == FirstEntryBlock) { + // The then-path is empty, so we must use "and" operation to combine the + // conditions. + CombineOp = BinaryOperator::And; + if (IfTrue2 != SecondEntryBlock) { + if (IfFalse2 != SecondEntryBlock) + return false; + + InvertCond2 = true; + std::swap(IfTrue2, IfFalse2); + } - if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1, - IfFalse2)) + if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock)) + return false; + } else return false; + Instruction *PTI2 = SecondEntryBlock->getTerminator(); + Instruction *PBI2 = &SecondEntryBlock->front(); + // Check whether \param SecondEntryBlock has side-effect and is safe to // speculate. 
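For context, FlattenCFG keeps its simple entry point; the additional if-region shapes described above are recognized internally. A sketch of the usual fixed-point driver around it, in the style of the FlattenCFG pass (the wrapper name is made up; the utility is declared in llvm/Transforms/Utils/Local.h):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Repeatedly try to flatten every block until no block changes any more.
bool iterativelyFlatten(Function &F, AAResults *AA) {
  bool EverChanged = false;
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    // Increment the iterator before calling FlattenCFG, which may erase BB.
    for (Function::iterator BBIt = F.begin(); BBIt != F.end();)
      if (FlattenCFG(&*BBIt++, AA))
        LocalChange = true;
    EverChanged |= LocalChange;
  }
  return EverChanged;
}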
for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { @@ -445,12 +479,22 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { FirstEntryBlock->getInstList() .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList()); BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator()); - Value *CC = PBI->getCondition(); + assert(PBI->getCondition() == IfCond2); BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); Builder.SetInsertPoint(PBI); - Value *NC = Builder.CreateOr(CInst1, CC); - PBI->replaceUsesOfWith(CC, NC); + if (InvertCond2) { + // If this is a "cmp" instruction, only used for branching (and nowhere + // else), then we can simply invert the predicate. + auto Cmp2 = dyn_cast<CmpInst>(CInst2); + if (Cmp2 && Cmp2->hasOneUse()) + Cmp2->setPredicate(Cmp2->getInversePredicate()); + else + CInst2 = cast<Instruction>(Builder.CreateNot(CInst2)); + PBI->swapSuccessors(); + } + Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2); + PBI->replaceUsesOfWith(IfCond2, NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); // Handle PHI node to replace its predecessors to FirstEntryBlock. @@ -496,6 +540,6 @@ bool FlattenCFGOpt::run(BasicBlock *BB) { /// FlattenCFG - This function is used to flatten a CFG. For /// example, it uses parallel-and and parallel-or mode to collapse /// if-conditions and merge if-regions with identical statements. -bool llvm::FlattenCFG(BasicBlock *BB, AliasAnalysis *AA) { +bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) { return FlattenCFGOpt(AA).run(BB); } diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index a9b28754c8e9c..101cb232d8aed 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -20,7 +20,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -52,22 +51,28 @@ using namespace llvm; #define DEBUG_TYPE "functioncomparator" int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { - if (L < R) return -1; - if (L > R) return 1; + if (L < R) + return -1; + if (L > R) + return 1; return 0; } int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const { - if ((int)L < (int)R) return -1; - if ((int)L > (int)R) return 1; + if ((int)L < (int)R) + return -1; + if ((int)L > (int)R) + return 1; return 0; } int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const { if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth())) return Res; - if (L.ugt(R)) return 1; - if (R.ugt(L)) return -1; + if (L.ugt(R)) + return 1; + if (R.ugt(L)) + return -1; return 0; } @@ -166,21 +171,17 @@ int FunctionComparator::cmpRangeMetadata(const MDNode *L, return 0; } -int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L, - const Instruction *R) const { - ImmutableCallSite LCS(L); - ImmutableCallSite RCS(R); - - assert(LCS && RCS && "Must be calls or invokes!"); - assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!"); +int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS, + const CallBase &RCS) const { + assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!"); if (int Res = cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles())) return Res; - for (unsigned i = 0, e 
= LCS.getNumOperandBundles(); i != e; ++i) { - auto OBL = LCS.getOperandBundleAt(i); - auto OBR = RCS.getOperandBundleAt(i); + for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) { + auto OBL = LCS.getOperandBundleAt(I); + auto OBR = RCS.getOperandBundleAt(I); if (int Res = OBL.getTagName().compare(OBR.getTagName())) return Res; @@ -227,9 +228,9 @@ int FunctionComparator::cmpConstants(const Constant *L, unsigned TyRWidth = 0; if (auto *VecTyL = dyn_cast<VectorType>(TyL)) - TyLWidth = VecTyL->getBitWidth(); + TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize(); if (auto *VecTyR = dyn_cast<VectorType>(TyR)) - TyRWidth = VecTyR->getBitWidth(); + TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize(); if (TyLWidth != TyRWidth) return cmpNumbers(TyLWidth, TyRWidth); @@ -328,8 +329,8 @@ int FunctionComparator::cmpConstants(const Constant *L, case Value::ConstantVectorVal: { const ConstantVector *LV = cast<ConstantVector>(L); const ConstantVector *RV = cast<ConstantVector>(R); - unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements(); - unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements(); + unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements(); + unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements(); if (int Res = cmpNumbers(NumElementsL, NumElementsR)) return Res; for (uint64_t i = 0; i < NumElementsL; ++i) { @@ -361,12 +362,12 @@ int FunctionComparator::cmpConstants(const Constant *L, if (LBA->getFunction() == RBA->getFunction()) { // They are BBs in the same function. Order by which comes first in the // BB order of the function. This order is deterministic. - Function* F = LBA->getFunction(); + Function *F = LBA->getFunction(); BasicBlock *LBB = LBA->getBasicBlock(); BasicBlock *RBB = RBA->getBasicBlock(); if (LBB == RBB) return 0; - for(BasicBlock &BB : F->getBasicBlockList()) { + for (BasicBlock &BB : F->getBasicBlockList()) { if (&BB == LBB) { assert(&BB != RBB); return -1; @@ -476,14 +477,25 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { return 0; } - case Type::ArrayTyID: - case Type::VectorTyID: { - auto *STyL = cast<SequentialType>(TyL); - auto *STyR = cast<SequentialType>(TyR); + case Type::ArrayTyID: { + auto *STyL = cast<ArrayType>(TyL); + auto *STyR = cast<ArrayType>(TyR); if (STyL->getNumElements() != STyR->getNumElements()) return cmpNumbers(STyL->getNumElements(), STyR->getNumElements()); return cmpTypes(STyL->getElementType(), STyR->getElementType()); } + case Type::FixedVectorTyID: + case Type::ScalableVectorTyID: { + auto *STyL = cast<VectorType>(TyL); + auto *STyR = cast<VectorType>(TyR); + if (STyL->getElementCount().Scalable != STyR->getElementCount().Scalable) + return cmpNumbers(STyL->getElementCount().Scalable, + STyR->getElementCount().Scalable); + if (STyL->getElementCount().Min != STyR->getElementCount().Min) + return cmpNumbers(STyL->getElementCount().Min, + STyR->getElementCount().Min); + return cmpTypes(STyL->getElementType(), STyR->getElementType()); + } } } @@ -551,7 +563,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(LI->getSyncScopeID(), cast<LoadInst>(R)->getSyncScopeID())) return Res; - return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range), + return cmpRangeMetadata( + LI->getMetadata(LLVMContext::MD_range), cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const StoreInst *SI = dyn_cast<StoreInst>(L)) { @@ -569,13 +582,13 @@ int FunctionComparator::cmpOperations(const Instruction *L, } if (const 
CmpInst *CI = dyn_cast<CmpInst>(L)) return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate()); - if (auto CSL = CallSite(const_cast<Instruction *>(L))) { - auto CSR = CallSite(const_cast<Instruction *>(R)); - if (int Res = cmpNumbers(CSL.getCallingConv(), CSR.getCallingConv())) + if (auto *CBL = dyn_cast<CallBase>(L)) { + auto *CBR = cast<CallBase>(R); + if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv())) return Res; - if (int Res = cmpAttrs(CSL.getAttributes(), CSR.getAttributes())) + if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes())) return Res; - if (int Res = cmpOperandBundlesSchema(L, R)) + if (int Res = cmpOperandBundlesSchema(*CBL, *CBR)) return Res; if (const CallInst *CI = dyn_cast<CallInst>(L)) if (int Res = cmpNumbers(CI->getTailCallKind(), @@ -616,8 +629,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(CXI->isVolatile(), cast<AtomicCmpXchgInst>(R)->isVolatile())) return Res; - if (int Res = cmpNumbers(CXI->isWeak(), - cast<AtomicCmpXchgInst>(R)->isWeak())) + if (int Res = + cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak())) return Res; if (int Res = cmpOrderings(CXI->getSuccessOrdering(), @@ -638,11 +651,21 @@ int FunctionComparator::cmpOperations(const Instruction *L, cast<AtomicRMWInst>(R)->isVolatile())) return Res; if (int Res = cmpOrderings(RMWI->getOrdering(), - cast<AtomicRMWInst>(R)->getOrdering())) + cast<AtomicRMWInst>(R)->getOrdering())) return Res; return cmpNumbers(RMWI->getSyncScopeID(), cast<AtomicRMWInst>(R)->getSyncScopeID()); } + if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) { + ArrayRef<int> LMask = SVI->getShuffleMask(); + ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask(); + if (int Res = cmpNumbers(LMask.size(), RMask.size())) + return Res; + for (size_t i = 0, e = LMask.size(); i != e; ++i) { + if (int Res = cmpNumbers(LMask[i], RMask[i])) + return Res; + } + } if (const PHINode *PNL = dyn_cast<PHINode>(L)) { const PHINode *PNR = cast<PHINode>(R); // Ensure that in addition to the incoming values being identical @@ -675,8 +698,8 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL, if (GEPL->accumulateConstantOffset(DL, OffsetL) && GEPR->accumulateConstantOffset(DL, OffsetR)) return cmpAPInts(OffsetL, OffsetR); - if (int Res = cmpTypes(GEPL->getSourceElementType(), - GEPR->getSourceElementType())) + if (int Res = + cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType())) return Res; if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) @@ -829,8 +852,8 @@ int FunctionComparator::compareSignature() const { // Visit the arguments so that they get enumerated in the order they're // passed in. for (Function::const_arg_iterator ArgLI = FnL->arg_begin(), - ArgRI = FnR->arg_begin(), - ArgLE = FnL->arg_end(); + ArgRI = FnR->arg_begin(), + ArgLE = FnL->arg_end(); ArgLI != ArgLE; ++ArgLI, ++ArgRI) { if (cmpValues(&*ArgLI, &*ArgRI) != 0) llvm_unreachable("Arguments repeat!"); @@ -897,9 +920,7 @@ public: // Initialize to random constant, so the state isn't zero. HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; } - void add(uint64_t V) { - Hash = hashing::detail::hash_16_bytes(Hash, V); - } + void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); } // No finishing is required, because the entire hash value is used. 
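These comparison routines are normally driven through the FunctionComparator object itself, for example by MergeFunctions. A minimal sketch under that assumption (the wrapper is hypothetical; GlobalNumberState comes from the same header):

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/FunctionComparator.h"
using namespace llvm;

// Deterministic total order over function bodies: 0 means equivalent,
// negative/positive give a stable ordering usable as a sort key.
int orderFunctions(const Function *L, const Function *R,
                   GlobalNumberState &GlobalNumbers) {
  return FunctionComparator(L, R, &GlobalNumbers).compare();
}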
uint64_t getHash() { return Hash; } diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 26d48ee0d23fa..8df7ae9563d8a 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -212,13 +212,6 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } } } - // Check the summaries to see if the symbol gets resolved to a known local - // definition. - if (VI && VI.isDSOLocal()) { - GV.setDSOLocal(true); - if (GV.hasDLLImportStorageClass()) - GV.setDLLStorageClass(GlobalValue::DefaultStorageClass); - } } // We should always have a ValueInfo (i.e. GV in index) for definitions when @@ -280,6 +273,20 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } else GV.setLinkage(getLinkage(&GV, /* DoPromote */ false)); + // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is + // converted to a declaration, to disable direct access. Don't do this if GV + // is implicitly dso_local due to a non-default visibility. + if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() && + !GV.isImplicitDSOLocal()) { + GV.setDSOLocal(false); + } else if (VI && VI.isDSOLocal()) { + // If all summaries are dso_local, symbol gets resolved to a known local + // definition. + GV.setDSOLocal(true); + if (GV.hasDLLImportStorageClass()) + GV.setDLLStorageClass(GlobalValue::DefaultStorageClass); + } + // Remove functions imported as available externally defs from comdats, // as this is a declaration for the linker, and will be dropped eventually. // It is illegal for comdats to contain declarations. @@ -319,7 +326,9 @@ bool FunctionImportGlobalProcessing::run() { } bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index, + bool ClearDSOLocalOnDeclarations, SetVector<GlobalValue *> *GlobalsToImport) { - FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport); + FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport, + ClearDSOLocalOnDeclarations); return ThinLTOProcessing.run(); } diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index a2942869130d5..fe58f0e0fe400 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -9,7 +9,6 @@ #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" @@ -164,8 +163,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, if (MSI->isVolatile()) return true; GS.StoredType = GlobalStatus::Stored; - } else if (auto C = ImmutableCallSite(I)) { - if (!C.isCallee(&U)) + } else if (const auto *CB = dyn_cast<CallBase>(I)) { + if (!CB->isCallee(&U)) return true; GS.IsLoaded = true; } else { diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp index 9192e74b9ace9..9d8f59d62d6d0 100644 --- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp +++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -13,8 +13,12 @@ #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/VectorUtils.h" #include 
"llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -30,40 +34,6 @@ STATISTIC(NumVFDeclAdded, STATISTIC(NumCompUsedAdded, "Number of `@llvm.compiler.used` operands that have been added."); -/// Helper function to map the TLI name to a strings that holds -/// scalar-to-vector mapping. -/// -/// _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>) -/// -/// where: -/// -/// <isa> = "_LLVM_" -/// <mask> = "N". Note: TLI does not support masked interfaces. -/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor` -/// field of the `VecDesc` struct. -/// <vparams> = "v", as many as are the number of parameters of CI. -/// <scalarname> = the name of the scalar function called by CI. -/// <vectorname> = the name of the vector function mapped by the TLI. -static std::string mangleTLIName(StringRef VectorName, const CallInst &CI, - unsigned VF) { - SmallString<256> Buffer; - llvm::raw_svector_ostream Out(Buffer); - Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF; - for (unsigned I = 0; I < CI.getNumArgOperands(); ++I) - Out << "v"; - Out << "_" << CI.getCalledFunction()->getName() << "(" << VectorName << ")"; - return Out.str(); -} - -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -static Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { - if (Scalar->isVoidTy() || VF == 1) - return Scalar; - return VectorType::get(Scalar, {VF, isScalable}); -} - /// A helper function that adds the vector function declaration that /// vectorizes the CallInst CI with a vectorization factor of VF /// lanes. The TLI assumes that all parameters and the return type of @@ -107,7 +77,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { if (CI.isNoBuiltin() || !CI.getCalledFunction()) return; - const std::string ScalarName = CI.getCalledFunction()->getName(); + const std::string ScalarName = std::string(CI.getCalledFunction()->getName()); // Nothing to be done if the TLI thinks the function is not // vectorizable. if (!TLI.isFunctionVectorizable(ScalarName)) @@ -120,9 +90,11 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { // All VFs in the TLI are powers of 2. 
for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF; VF *= 2) { - const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF); + const std::string TLIName = + std::string(TLI.getVectorizedFunction(ScalarName, VF)); if (!TLIName.empty()) { - std::string MangledName = mangleTLIName(TLIName, CI, VF); + std::string MangledName = VFABI::mangleTLIVectorName( + TLIName, ScalarName, CI.getNumArgOperands(), VF); if (!OriginalSetOfMappings.count(MangledName)) { Mappings.push_back(MangledName); ++NumCallInjected; @@ -168,6 +140,12 @@ void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<LoopAccessLegacyAnalysis>(); + AU.addPreserved<DemandedBitsWrapperPass>(); + AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 6da612eb4e658..b0b7ca4847980 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -34,7 +34,6 @@ #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" @@ -60,6 +59,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> @@ -79,16 +79,23 @@ EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true), cl::Hidden, cl::desc("Convert noalias attributes to metadata during inlining.")); +// Disabled by default, because the added alignment assumptions may increase +// compile-time and block optimizations. This option is not suitable for use +// with frontends that emit comprehensive parameter alignment annotations. static cl::opt<bool> PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", - cl::init(true), cl::Hidden, + cl::init(false), cl::Hidden, cl::desc("Convert align attributes to assumptions during inlining.")); -llvm::InlineResult llvm::InlineFunction(CallBase *CB, InlineFunctionInfo &IFI, - AAResults *CalleeAAR, - bool InsertLifetime) { - return InlineFunction(CallSite(CB), IFI, CalleeAAR, InsertLifetime); -} +static cl::opt<bool> UpdateReturnAttributes( + "update-return-attrs", cl::init(true), cl::Hidden, + cl::desc("Update return attributes on calls within inlined body")); + +static cl::opt<unsigned> InlinerAttributeWindow( + "max-inst-checked-for-throw-during-inlining", cl::Hidden, + cl::desc("the maximum number of instructions analyzed for may throw during " + "attribute inference in inlined body"), + cl::init(4)); namespace { @@ -530,7 +537,7 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( // instructions require no special handling. 
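The InjectTLIMappings hunk above drops the file-local mangler in favour of VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.getNumArgOperands(), VF); the deleted comment spells out the name scheme that call produces. A minimal standalone sketch of that scheme, assuming the "_LLVM_" ISA token and an all-"v" (vector) parameter list as in the removed helper; the function name and the example vector name below are illustrative, not part of the tree:

#include <sstream>
#include <string>

// Builds "_ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)":
// ISA token "_LLVM_", "N" for an unmasked interface, VF concurrent lanes,
// and one "v" per scalar parameter.
static std::string buildVFABIName(const std::string &VectorName,
                                  const std::string &ScalarName,
                                  unsigned NumArgs, unsigned VF) {
  std::ostringstream Out;
  Out << "_ZGV" << "_LLVM_" << "N" << VF;
  for (unsigned I = 0; I < NumArgs; ++I)
    Out << "v";
  Out << "_" << ScalarName << "(" << VectorName << ")";
  return Out.str();
}

// buildVFABIName("my_sinf_v4", "sinf", 1, 4) yields
// "_ZGV_LLVM_N4v_sinf(my_sinf_v4)", the kind of string recorded in the
// "vector-function-abi-variant" attribute.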
CallInst *CI = dyn_cast<CallInst>(I); - if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) + if (!CI || CI->doesNotThrow() || CI->isInlineAsm()) continue; // We do not need to (and in fact, cannot) convert possibly throwing calls @@ -767,12 +774,10 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, /// When inlining a call site that has !llvm.mem.parallel_loop_access or /// llvm.access.group metadata, that metadata should be propagated to all /// memory-accessing cloned instructions. -static void PropagateParallelLoopAccessMetadata(CallSite CS, +static void PropagateParallelLoopAccessMetadata(CallBase &CB, ValueToValueMapTy &VMap) { - MDNode *M = - CS.getInstruction()->getMetadata(LLVMContext::MD_mem_parallel_loop_access); - MDNode *CallAccessGroup = - CS.getInstruction()->getMetadata(LLVMContext::MD_access_group); + MDNode *M = CB.getMetadata(LLVMContext::MD_mem_parallel_loop_access); + MDNode *CallAccessGroup = CB.getMetadata(LLVMContext::MD_access_group); if (!M && !CallAccessGroup) return; @@ -810,8 +815,8 @@ static void PropagateParallelLoopAccessMetadata(CallSite CS, /// not be differentiated (and this would lead to miscompiles because the /// non-aliasing property communicated by the metadata could have /// call-site-specific control dependencies). -static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { - const Function *CalledFunc = CS.getCalledFunction(); +static void CloneAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap) { + const Function *CalledFunc = CB.getCalledFunction(); SetVector<const MDNode *> MD; // Note: We could only clone the metadata if it is already used in the @@ -886,13 +891,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // If the call site also had alias scope metadata (a list of scopes to // which instructions inside it might belong), propagate those scopes to // the inlined instructions. - if (MDNode *CSM = - CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope)) + if (MDNode *CSM = CB.getMetadata(LLVMContext::MD_alias_scope)) NewMD = MDNode::concatenate(NewMD, CSM); NI->setMetadata(LLVMContext::MD_alias_scope, NewMD); } else if (NI->mayReadOrWriteMemory()) { - if (MDNode *M = - CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope)) + if (MDNode *M = CB.getMetadata(LLVMContext::MD_alias_scope)) NI->setMetadata(LLVMContext::MD_alias_scope, M); } @@ -901,12 +904,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // If the call site also had noalias metadata (a list of scopes with // which instructions inside it don't alias), propagate those scopes to // the inlined instructions. - if (MDNode *CSM = - CS.getInstruction()->getMetadata(LLVMContext::MD_noalias)) + if (MDNode *CSM = CB.getMetadata(LLVMContext::MD_noalias)) NewMD = MDNode::concatenate(NewMD, CSM); NI->setMetadata(LLVMContext::MD_noalias, NewMD); } else if (NI->mayReadOrWriteMemory()) { - if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias)) + if (MDNode *M = CB.getMetadata(LLVMContext::MD_noalias)) NI->setMetadata(LLVMContext::MD_noalias, M); } } @@ -916,16 +918,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { /// then add new alias scopes for each noalias argument, tag the mapped noalias /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. 
-static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, +static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, const DataLayout &DL, AAResults *CalleeAAR) { if (!EnableNoAliasConversion) return; - const Function *CalledFunc = CS.getCalledFunction(); + const Function *CalledFunc = CB.getCalledFunction(); SmallVector<const Argument *, 4> NoAliasArgs; for (const Argument &Arg : CalledFunc->args()) - if (Arg.hasNoAliasAttr() && !Arg.use_empty()) + if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty()) NoAliasArgs.push_back(&Arg); if (NoAliasArgs.empty()) @@ -951,7 +953,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) { const Argument *A = NoAliasArgs[i]; - std::string Name = CalledFunc->getName(); + std::string Name = std::string(CalledFunc->getName()); if (A->hasName()) { Name += ": %"; Name += A->getName(); @@ -1002,8 +1004,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, IsFuncCall = true; if (CalleeAAR) { FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call); - if (MRB == FMRB_OnlyAccessesArgumentPointees || - MRB == FMRB_OnlyReadsArgumentPointees) + if (AAResults::onlyAccessesArgPointees(MRB)) IsArgMemOnlyCall = true; } @@ -1059,7 +1060,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, // completely describe the aliasing properties using alias.scope // metadata (and, thus, won't add any). if (const Argument *A = dyn_cast<Argument>(V)) { - if (!A->hasNoAliasAttr()) + if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias)) UsesAliasingPtr = true; } else { UsesAliasingPtr = true; @@ -1136,37 +1137,128 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, } } +static bool MayContainThrowingOrExitingCall(Instruction *Begin, + Instruction *End) { + + assert(Begin->getParent() == End->getParent() && + "Expected to be in same basic block!"); + unsigned NumInstChecked = 0; + // Check that all instructions in the range [Begin, End) are guaranteed to + // transfer execution to successor. + for (auto &I : make_range(Begin->getIterator(), End->getIterator())) + if (NumInstChecked++ > InlinerAttributeWindow || + !isGuaranteedToTransferExecutionToSuccessor(&I)) + return true; + return false; +} + +static AttrBuilder IdentifyValidAttributes(CallBase &CB) { + + AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex); + if (AB.empty()) + return AB; + AttrBuilder Valid; + // Only allow these white listed attributes to be propagated back to the + // callee. This is because other attributes may only be valid on the call + // itself, i.e. attributes such as signext and zeroext. 
+ if (auto DerefBytes = AB.getDereferenceableBytes()) + Valid.addDereferenceableAttr(DerefBytes); + if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes()) + Valid.addDereferenceableOrNullAttr(DerefOrNullBytes); + if (AB.contains(Attribute::NoAlias)) + Valid.addAttribute(Attribute::NoAlias); + if (AB.contains(Attribute::NonNull)) + Valid.addAttribute(Attribute::NonNull); + return Valid; +} + +static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { + if (!UpdateReturnAttributes) + return; + + AttrBuilder Valid = IdentifyValidAttributes(CB); + if (Valid.empty()) + return; + auto *CalledFunction = CB.getCalledFunction(); + auto &Context = CalledFunction->getContext(); + + for (auto &BB : *CalledFunction) { + auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()); + if (!RI || !isa<CallBase>(RI->getOperand(0))) + continue; + auto *RetVal = cast<CallBase>(RI->getOperand(0)); + // Sanity check that the cloned RetVal exists and is a call, otherwise we + // cannot add the attributes on the cloned RetVal. + // Simplification during inlining could have transformed the cloned + // instruction. + auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal)); + if (!NewRetVal) + continue; + // Backward propagation of attributes to the returned value may be incorrect + // if it is control flow dependent. + // Consider: + // @callee { + // %rv = call @foo() + // %rv2 = call @bar() + // if (%rv2 != null) + // return %rv2 + // if (%rv == null) + // exit() + // return %rv + // } + // caller() { + // %val = call nonnull @callee() + // } + // Here we cannot add the nonnull attribute on either foo or bar. So, we + // limit the check to both RetVal and RI are in the same basic block and + // there are no throwing/exiting instructions between these instructions. + if (RI->getParent() != RetVal->getParent() || + MayContainThrowingOrExitingCall(RetVal, RI)) + continue; + // Add to the existing attributes of NewRetVal, i.e. the cloned call + // instruction. + // NB! When we have the same attribute already existing on NewRetVal, but + // with a differing value, the AttributeList's merge API honours the already + // existing attribute value (i.e. attributes such as dereferenceable, + // dereferenceable_or_null etc). See AttrBuilder::merge for more details. + AttributeList AL = NewRetVal->getAttributes(); + AttributeList NewAL = + AL.addAttributes(Context, AttributeList::ReturnIndex, Valid); + NewRetVal->setAttributes(NewAL); + } +} + /// If the inlined function has non-byval align arguments, then /// add @llvm.assume-based alignment assumptions to preserve this information. -static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { +static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) { if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache) return; - AssumptionCache *AC = &(*IFI.GetAssumptionCache)(*CS.getCaller()); - auto &DL = CS.getCaller()->getParent()->getDataLayout(); + AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller()); + auto &DL = CB.getCaller()->getParent()->getDataLayout(); // To avoid inserting redundant assumptions, we should check for assumptions // already in the caller. To do this, we might need a DT of the caller. DominatorTree DT; bool DTCalculated = false; - Function *CalledFunc = CS.getCalledFunction(); + Function *CalledFunc = CB.getCalledFunction(); for (Argument &Arg : CalledFunc->args()) { unsigned Align = Arg.getType()->isPointerTy() ? 
Arg.getParamAlignment() : 0; - if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) { + if (Align && !Arg.hasPassPointeeByValueAttr() && !Arg.hasNUses(0)) { if (!DTCalculated) { - DT.recalculate(*CS.getCaller()); + DT.recalculate(*CB.getCaller()); DTCalculated = true; } // If we can already prove the asserted alignment in the context of the // caller, then don't bother inserting the assumption. - Value *ArgVal = CS.getArgument(Arg.getArgNo()); - if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align) + Value *ArgVal = CB.getArgOperand(Arg.getArgNo()); + if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align) continue; - CallInst *NewAsmp = IRBuilder<>(CS.getInstruction()) - .CreateAlignmentAssumption(DL, ArgVal, Align); + CallInst *NewAsmp = + IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align); AC->registerAssumption(NewAsmp); } } @@ -1176,13 +1268,13 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { /// update the specified callgraph to reflect the changes we made. /// Note that it's possible that not all code was copied over, so only /// some edges of the callgraph may remain. -static void UpdateCallGraphAfterInlining(CallSite CS, +static void UpdateCallGraphAfterInlining(CallBase &CB, Function::iterator FirstNewBlock, ValueToValueMapTy &VMap, InlineFunctionInfo &IFI) { CallGraph &CG = *IFI.CG; - const Function *Caller = CS.getCaller(); - const Function *Callee = CS.getCalledFunction(); + const Function *Caller = CB.getCaller(); + const Function *Callee = CB.getCalledFunction(); CallGraphNode *CalleeNode = CG[Callee]; CallGraphNode *CallerNode = CG[Caller]; @@ -1199,7 +1291,11 @@ static void UpdateCallGraphAfterInlining(CallSite CS, } for (; I != E; ++I) { - const Value *OrigCall = I->first; + // Skip 'refererence' call records. + if (!I->first) + continue; + + const Value *OrigCall = *I->first; ValueToValueMapTy::iterator VMI = VMap.find(OrigCall); // Only copy the edge if the call was inlined! @@ -1240,7 +1336,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // Update the call graph by deleting the edge from Callee to Caller. We must // do this after the loop above in case Caller and Callee are the same. - CallerNode->removeCallEdgeFor(*cast<CallBase>(CS.getInstruction())); + CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB)); } static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, @@ -1254,8 +1350,8 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer // better alignment. - Builder.CreateMemCpy(Dst, /*DstAlign*/ Align::None(), Src, - /*SrcAlign*/ Align::None(), Size); + Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src, + /*SrcAlign*/ Align(1), Size); } /// When inlining a call site that has a byval argument, @@ -1281,12 +1377,12 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, return Arg; AssumptionCache *AC = - IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr; + IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr; // If the pointer is already known to be sufficiently aligned, or if we can // round it up to a larger alignment, then we don't need a temporary. 
- if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >= - ByValAlignment) + if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall, + AC) >= ByValAlignment) return Arg; // Otherwise, we have to make a memcpy to get a safe alignment. This is bad @@ -1356,34 +1452,6 @@ static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt, IA); } -/// Returns the LoopID for a loop which has has been cloned from another -/// function for inlining with the new inlined-at start and end locs. -static MDNode *inlineLoopID(const MDNode *OrigLoopId, DILocation *InlinedAt, - LLVMContext &Ctx, - DenseMap<const MDNode *, MDNode *> &IANodes) { - assert(OrigLoopId && OrigLoopId->getNumOperands() > 0 && - "Loop ID needs at least one operand"); - assert(OrigLoopId && OrigLoopId->getOperand(0).get() == OrigLoopId && - "Loop ID should refer to itself"); - - // Save space for the self-referential LoopID. - SmallVector<Metadata *, 4> MDs = {nullptr}; - - for (unsigned i = 1; i < OrigLoopId->getNumOperands(); ++i) { - Metadata *MD = OrigLoopId->getOperand(i); - // Update the DILocations to encode the inlined-at metadata. - if (DILocation *DL = dyn_cast<DILocation>(MD)) - MDs.push_back(inlineDebugLoc(DL, InlinedAt, Ctx, IANodes)); - else - MDs.push_back(MD); - } - - MDNode *NewLoopID = MDNode::getDistinct(Ctx, MDs); - // Insert the self-referential LoopID. - NewLoopID->replaceOperandWith(0, NewLoopID); - return NewLoopID; -} - /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, @@ -1415,11 +1483,11 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, BI != BE; ++BI) { // Loop metadata needs to be updated so that the start and end locs // reference inlined-at locations. - if (MDNode *LoopID = BI->getMetadata(LLVMContext::MD_loop)) { - MDNode *NewLoopID = - inlineLoopID(LoopID, InlinedAtNode, BI->getContext(), IANodes); - BI->setMetadata(LLVMContext::MD_loop, NewLoopID); - } + auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes]( + const DILocation &Loc) -> DILocation * { + return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get(); + }; + updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc); if (!NoInlineLineTables) if (DebugLoc DL = BI->getDebugLoc()) { @@ -1498,8 +1566,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, /// Update the branch metadata for cloned call instructions. static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const ProfileCount &CalleeEntryCount, - const Instruction *TheCall, - ProfileSummaryInfo *PSI, + const CallBase &TheCall, ProfileSummaryInfo *PSI, BlockFrequencyInfo *CallerBFI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() || CalleeEntryCount.getCount() < 1) @@ -1557,31 +1624,29 @@ void llvm::updateProfileCallee( /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now /// exists in the instruction stream. Similarly this will inline a recursive /// function by one level. 
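The InlineFunction hunk that follows replaces the CallSite-based entry point with one taking a CallBase reference and returning InlineResult::success() or InlineResult::failure(reason). A hedged sketch of driving the new interface, assuming the LLVM 11-era declaration in llvm/Transforms/Utils/Cloning.h with its defaulted trailing parameters; the helper below and its name are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;

// Inline every direct call in F whose callee has a body. Call sites are
// collected first because a successful InlineFunction erases the call and
// splices new blocks into the caller.
static bool inlineDirectCalls(Function &F) {
  SmallVector<CallBase *, 8> Work;
  for (Instruction &I : instructions(F))
    if (auto *CB = dyn_cast<CallBase>(&I))
      if (Function *Callee = CB->getCalledFunction())
        if (!Callee->isDeclaration())
          Work.push_back(CB);

  bool Changed = false;
  for (CallBase *CB : Work) {
    InlineFunctionInfo IFI;
    InlineResult IR = InlineFunction(*CB, IFI);
    Changed |= IR.isSuccess();
  }
  return Changed;
}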
-llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, +llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AAResults *CalleeAAR, bool InsertLifetime, Function *ForwardVarArgsTo) { - Instruction *TheCall = CS.getInstruction(); - assert(TheCall->getParent() && TheCall->getFunction() - && "Instruction not in function!"); + assert(CB.getParent() && CB.getFunction() && "Instruction not in function!"); // FIXME: we don't inline callbr yet. - if (isa<CallBrInst>(TheCall)) - return false; + if (isa<CallBrInst>(CB)) + return InlineResult::failure("We don't inline callbr yet."); // If IFI has any state in it, zap it before we fill it in. IFI.reset(); - Function *CalledFunc = CS.getCalledFunction(); + Function *CalledFunc = CB.getCalledFunction(); if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration()) // call! - return "external or indirect"; + return InlineResult::failure("external or indirect"); // The inliner does not know how to inline through calls with operand bundles // in general ... - if (CS.hasOperandBundles()) { - for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { - uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); + if (CB.hasOperandBundles()) { + for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) { + uint32_t Tag = CB.getOperandBundleAt(i).getTagID(); // ... but it knows how to inline through "deopt" operand bundles ... if (Tag == LLVMContext::OB_deopt) continue; @@ -1589,15 +1654,15 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (Tag == LLVMContext::OB_funclet) continue; - return "unsupported operand bundle"; + return InlineResult::failure("unsupported operand bundle"); } } // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. - bool MarkNoUnwind = CS.doesNotThrow(); + bool MarkNoUnwind = CB.doesNotThrow(); - BasicBlock *OrigBB = TheCall->getParent(); + BasicBlock *OrigBB = CB.getParent(); Function *Caller = OrigBB->getParent(); // GC poses two hazards to inlining, which only occur when the callee has GC: @@ -1608,7 +1673,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (!Caller->hasGC()) Caller->setGC(CalledFunc->getGC()); else if (CalledFunc->getGC() != Caller->getGC()) - return "incompatible GC"; + return InlineResult::failure("incompatible GC"); } // Get the personality function from the callee if it contains a landing pad. @@ -1632,7 +1697,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // TODO: This isn't 100% true. Some personality functions are proper // supersets of others and can be used in place of the other. else if (CalledPersonality != CallerPersonality) - return "incompatible personality"; + return InlineResult::failure("incompatible personality"); } // We need to figure out which funclet the callsite was in so that we may @@ -1642,7 +1707,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, EHPersonality Personality = classifyEHPersonality(CallerPersonality); if (isScopedEHPersonality(Personality)) { Optional<OperandBundleUse> ParentFunclet = - CS.getOperandBundle(LLVMContext::OB_funclet); + CB.getOperandBundle(LLVMContext::OB_funclet); if (ParentFunclet) CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); @@ -1657,7 +1722,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // for catchpads. 
for (const BasicBlock &CalledBB : *CalledFunc) { if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI())) - return "catch in cleanup funclet"; + return InlineResult::failure("catch in cleanup funclet"); } } } else if (isAsynchronousEHPersonality(Personality)) { @@ -1665,7 +1730,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // funclet in the callee. for (const BasicBlock &CalledBB : *CalledFunc) { if (CalledBB.isEHPad()) - return "SEH in cleanup funclet"; + return InlineResult::failure("SEH in cleanup funclet"); } } } @@ -1675,7 +1740,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Determine if we are dealing with a call in an EHPad which does not unwind // to caller. bool EHPadForCallUnwindsLocally = false; - if (CallSiteEHPad && CS.isCall()) { + if (CallSiteEHPad && isa<CallInst>(CB)) { UnwindDestMemoTy FuncletUnwindMap; Value *CallSiteUnwindDestToken = getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap); @@ -1704,7 +1769,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Calculate the vector of arguments to pass into the function cloner, which // matches up the formal to the actual argument values. - CallSite::arg_iterator AI = CS.arg_begin(); + auto AI = CB.arg_begin(); unsigned ArgNo = 0; for (Function::arg_iterator I = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) { @@ -1714,8 +1779,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // by them explicit. However, we don't do this if the callee is readonly // or readnone, because the copy would be unneeded: the callee doesn't // modify the struct. - if (CS.isByValArgument(ArgNo)) { - ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, + if (CB.isByValArgument(ArgNo)) { + ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo)); if (ActualArg != *AI) ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); @@ -1724,10 +1789,17 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, VMap[&*I] = ActualArg; } + // TODO: Remove this when users have been updated to the assume bundles. // Add alignment assumptions if necessary. We do this before the inlined // instructions are actually cloned into the caller so that we can easily // check what will be known at the start of the inlined code. - AddAlignmentAssumptions(CS, IFI); + AddAlignmentAssumptions(CB, IFI); + + AssumptionCache *AC = + IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr; + + /// Preserve all attributes on of the call and its parameters. + salvageKnowledge(&CB, AC); // We want the inliner to prune the code as it copies. We would LOVE to // have no dead or constant instructions leftover after inlining occurs @@ -1735,7 +1807,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", - &InlinedFunctionInfo, TheCall); + &InlinedFunctionInfo, &CB); // Remember the first block that is newly cloned over. 
FirstNewBlock = LastBlock; ++FirstNewBlock; @@ -1744,7 +1816,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI, CalledFunc->front()); - updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, + updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB, IFI.PSI, IFI.CallerBFI); // Inject byval arguments initialization. @@ -1753,21 +1825,22 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, &*FirstNewBlock, IFI); Optional<OperandBundleUse> ParentDeopt = - CS.getOperandBundle(LLVMContext::OB_deopt); + CB.getOperandBundle(LLVMContext::OB_deopt); if (ParentDeopt) { SmallVector<OperandBundleDef, 2> OpDefs; for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { - Instruction *I = dyn_cast_or_null<Instruction>(VH); - if (!I) continue; // instruction was DCE'd or RAUW'ed to undef + CallBase *ICS = dyn_cast_or_null<CallBase>(VH); + if (!ICS) + continue; // instruction was DCE'd or RAUW'ed to undef OpDefs.clear(); - CallSite ICS(I); - OpDefs.reserve(ICS.getNumOperandBundles()); + OpDefs.reserve(ICS->getNumOperandBundles()); - for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { - auto ChildOB = ICS.getOperandBundleAt(i); + for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe; + ++COBi) { + auto ChildOB = ICS->getOperandBundleAt(COBi); if (ChildOB.getTagID() != LLVMContext::OB_deopt) { // If the inlined call has other operand bundles, let them be OpDefs.emplace_back(ChildOB); @@ -1791,51 +1864,48 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); } - Instruction *NewI = nullptr; - if (isa<CallInst>(I)) - NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I); - else if (isa<CallBrInst>(I)) - NewI = CallBrInst::Create(cast<CallBrInst>(I), OpDefs, I); - else - NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I); + Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS); // Note: the RAUW does the appropriate fixup in VMap, so we need to do // this even if the call returns void. - I->replaceAllUsesWith(NewI); + ICS->replaceAllUsesWith(NewI); VH = nullptr; - I->eraseFromParent(); + ICS->eraseFromParent(); } } // Update the callgraph if requested. if (IFI.CG) - UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI); + UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI); // For 'nodebug' functions, the associated DISubprogram is always null. // Conservatively avoid propagating the callsite debug location to // instructions inlined from a function whose DISubprogram is not null. - fixupLineNumbers(Caller, FirstNewBlock, TheCall, + fixupLineNumbers(Caller, FirstNewBlock, &CB, CalledFunc->getSubprogram() != nullptr); // Clone existing noalias metadata if necessary. - CloneAliasScopeMetadata(CS, VMap); + CloneAliasScopeMetadata(CB, VMap); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); + AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR); + + // Clone return attributes on the callsite into the calls within the inlined + // function which feed into its return value. + AddReturnAttributes(CB, VMap); // Propagate llvm.mem.parallel_loop_access if necessary. - PropagateParallelLoopAccessMetadata(CS, VMap); + PropagateParallelLoopAccessMetadata(CB, VMap); // Register any cloned assumptions. 
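Both bundle rewrites in this function, the deopt merge above and the funclet tagging further down, now go through the single CallBase::Create(CB, Bundles, InsertPt) factory instead of dispatching over CallInst, InvokeInst and CallBrInst. A short sketch of that pattern for attaching one extra bundle to an existing call site; the helper name and the bundle argument are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Clone CB with one additional operand bundle and swap it in. CallBase::Create
// picks the matching concrete instruction kind and copies the callee,
// arguments and attributes from CB.
static Instruction *addBundle(CallBase *CB, const OperandBundleDef &NewBundle) {
  SmallVector<OperandBundleDef, 2> Bundles;
  CB->getOperandBundlesAsDefs(Bundles);
  Bundles.push_back(NewBundle);
  Instruction *NewCB = CallBase::Create(CB, Bundles, /*InsertPt=*/CB);
  NewCB->takeName(CB);
  CB->replaceAllUsesWith(NewCB);
  CB->eraseFromParent();
  return NewCB;
}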
if (IFI.GetAssumptionCache) for (BasicBlock &NewBlock : make_range(FirstNewBlock->getIterator(), Caller->end())) - for (Instruction &I : NewBlock) { + for (Instruction &I : NewBlock) if (auto *II = dyn_cast<IntrinsicInst>(&I)) if (II->getIntrinsicID() == Intrinsic::assume) - (*IFI.GetAssumptionCache)(*Caller).registerAssumption(II); - } + IFI.GetAssumptionCache(*Caller).registerAssumption(II); } // If there are any alloca instructions in the block that used to be the entry @@ -1877,24 +1947,20 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, Caller->getEntryBlock().getInstList().splice( InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } - // Move any dbg.declares describing the allocas into the entry basic block. - DIBuilder DIB(*Caller->getParent()); - for (auto &AI : IFI.StaticAllocas) - replaceDbgDeclareForAlloca(AI, AI, DIB, DIExpression::ApplyOffset, 0); } SmallVector<Value*,4> VarArgsToForward; SmallVector<AttributeSet, 4> VarArgsAttrs; for (unsigned i = CalledFunc->getFunctionType()->getNumParams(); - i < CS.getNumArgOperands(); i++) { - VarArgsToForward.push_back(CS.getArgOperand(i)); - VarArgsAttrs.push_back(CS.getAttributes().getParamAttributes(i)); + i < CB.getNumArgOperands(); i++) { + VarArgsToForward.push_back(CB.getArgOperand(i)); + VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i)); } bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false; if (InlinedFunctionInfo.ContainsCalls) { CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None; - if (CallInst *CI = dyn_cast<CallInst>(TheCall)) + if (CallInst *CI = dyn_cast<CallInst>(&CB)) CallSiteTailKind = CI->getTailCallKind(); // For inlining purposes, the "notail" marker is the same as no marker. @@ -2056,7 +2122,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // any call instructions into invoke instructions. This is sensitive to which // funclet pads were top-level in the inlinee, so must be done before // rewriting the "parent pad" links. - if (auto *II = dyn_cast<InvokeInst>(TheCall)) { + if (auto *II = dyn_cast<InvokeInst>(&CB)) { BasicBlock *UnwindDest = II->getUnwindDest(); Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); if (isa<LandingPadInst>(FirstNonPHI)) { @@ -2077,31 +2143,24 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Add bundle operands to any top-level call sites. SmallVector<OperandBundleDef, 1> OpBundles; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { - Instruction *I = &*BBI++; - CallSite CS(I); - if (!CS) + CallBase *I = dyn_cast<CallBase>(&*BBI++); + if (!I) continue; // Skip call sites which are nounwind intrinsics. auto *CalledFn = - dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); - if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) + dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow()) continue; // Skip call sites which already have a "funclet" bundle. 
- if (CS.getOperandBundle(LLVMContext::OB_funclet)) + if (I->getOperandBundle(LLVMContext::OB_funclet)) continue; - CS.getOperandBundlesAsDefs(OpBundles); + I->getOperandBundlesAsDefs(OpBundles); OpBundles.emplace_back("funclet", CallSiteEHPad); - Instruction *NewInst; - if (CS.isCall()) - NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); - else if (CS.isCallBr()) - NewInst = CallBrInst::Create(cast<CallBrInst>(I), OpBundles, I); - else - NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); + Instruction *NewInst = CallBase::Create(I, OpBundles, I); NewInst->takeName(I); I->replaceAllUsesWith(NewInst); I->eraseFromParent(); @@ -2138,7 +2197,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // caller (but terminate it instead). If the caller's return type does not // match the callee's return type, we also need to change the return type of // the intrinsic. - if (Caller->getReturnType() == TheCall->getType()) { + if (Caller->getReturnType() == CB.getType()) { auto NewEnd = llvm::remove_if(Returns, [](ReturnInst *RI) { return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr; }); @@ -2197,7 +2256,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (InlinedMustTailCalls) { // Check if we need to bitcast the result of any musttail calls. Type *NewRetTy = Caller->getReturnType(); - bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy; + bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy; // Handle the returns preceded by musttail calls separately. SmallVector<ReturnInst *, 8> NormalReturns; @@ -2237,8 +2296,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, for (BasicBlock &NewBB : make_range(FirstNewBlock->getIterator(), Caller->end())) for (Instruction &I : NewBB) - if (auto CS = CallSite(&I)) - IFI.InlinedCallSites.push_back(CS); + if (auto *CB = dyn_cast<CallBase>(&I)) + IFI.InlinedCallSites.push_back(CB); } // If we cloned in _exactly one_ basic block, and if that block ends in a @@ -2246,36 +2305,35 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the calling basic block. if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) { // Move all of the instructions right before the call. - OrigBB->getInstList().splice(TheCall->getIterator(), - FirstNewBlock->getInstList(), + OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(), FirstNewBlock->begin(), FirstNewBlock->end()); // Remove the cloned basic block. Caller->getBasicBlockList().pop_back(); // If the call site was an invoke instruction, add a branch to the normal // destination. - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { - BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall); + if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) { + BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB); NewBr->setDebugLoc(Returns[0]->getDebugLoc()); } // If the return instruction returned a value, replace uses of the call with // uses of the returned value. 
- if (!TheCall->use_empty()) { + if (!CB.use_empty()) { ReturnInst *R = Returns[0]; - if (TheCall == R->getReturnValue()) - TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); + if (&CB == R->getReturnValue()) + CB.replaceAllUsesWith(UndefValue::get(CB.getType())); else - TheCall->replaceAllUsesWith(R->getReturnValue()); + CB.replaceAllUsesWith(R->getReturnValue()); } // Since we are now done with the Call/Invoke, we can delete it. - TheCall->eraseFromParent(); + CB.eraseFromParent(); // Since we are now done with the return instruction, delete it also. Returns[0]->eraseFromParent(); // We are now done with the inlining. - return true; + return InlineResult::success(); } // Otherwise, we have the normal case, of more than one block to inline or @@ -2286,10 +2344,10 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; BranchInst *CreatedBranchToNormalDest = nullptr; - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { + if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) { // Add an unconditional branch to make this look like the CallInst case... - CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall); + CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB); // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more @@ -2298,11 +2356,11 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), CalledFunc->getName() + ".exit"); - } else { // It's a call + } else { // It's a call // If this is a call instruction, we need to split the basic block that // the call lives in. // - AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), + AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(), CalledFunc->getName() + ".exit"); } @@ -2335,12 +2393,12 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (Returns.size() > 1) { // The PHI node should go at the front of the new basic block to merge all // possible incoming values. - if (!TheCall->use_empty()) { - PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), + if (!CB.use_empty()) { + PHI = PHINode::Create(RTy, Returns.size(), CB.getName(), &AfterCallBB->front()); // Anything that used the result of the function call should now use the // PHI node as their operand. - TheCall->replaceAllUsesWith(PHI); + CB.replaceAllUsesWith(PHI); } // Loop over all of the return instructions adding entries to the PHI node @@ -2372,11 +2430,11 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } else if (!Returns.empty()) { // Otherwise, if there is exactly one return value, just replace anything // using the return value of the call with the computed value. - if (!TheCall->use_empty()) { - if (TheCall == Returns[0]->getReturnValue()) - TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); + if (!CB.use_empty()) { + if (&CB == Returns[0]->getReturnValue()) + CB.replaceAllUsesWith(UndefValue::get(CB.getType())); else - TheCall->replaceAllUsesWith(Returns[0]->getReturnValue()); + CB.replaceAllUsesWith(Returns[0]->getReturnValue()); } // Update PHI nodes that use the ReturnBB to use the AfterCallBB. 
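When the inlined body has more than one return, the code above funnels the returned values into the .exit block through a single PHI and redirects every former use of the call at it. A compact sketch of that shape for a non-void callee; the block and helper names are illustrative:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Merge the values of several inlined ReturnInsts into one PHI placed at the
// front of AfterCallBB, then forward all uses of the original call to it.
static void mergeInlinedReturns(CallBase &CB, ArrayRef<ReturnInst *> Returns,
                                BasicBlock *AfterCallBB) {
  PHINode *PHI = PHINode::Create(CB.getType(), Returns.size(), CB.getName(),
                                 &AfterCallBB->front());
  CB.replaceAllUsesWith(PHI);
  for (ReturnInst *RI : Returns) {
    // The block that held each return becomes a predecessor of AfterCallBB
    // and contributes its value to the merge point.
    PHI->addIncoming(RI->getReturnValue(), RI->getParent());
    // Rewrite the return into a branch to the .exit block so the CFG stays
    // consistent, mirroring what the real code does afterwards.
    BranchInst::Create(AfterCallBB, RI);
    RI->eraseFromParent();
  }
}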
@@ -2394,14 +2452,14 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Delete the return instruction now and empty ReturnBB now. Returns[0]->eraseFromParent(); ReturnBB->eraseFromParent(); - } else if (!TheCall->use_empty()) { + } else if (!CB.use_empty()) { // No returns, but something is using the return value of the call. Just // nuke the result. - TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType())); + CB.replaceAllUsesWith(UndefValue::get(CB.getType())); } // Since we are now done with the Call/Invoke, we can delete it. - TheCall->eraseFromParent(); + CB.eraseFromParent(); // If we inlined any musttail calls and the original return is now // unreachable, delete it. It can only contain a bitcast and ret. @@ -2429,7 +2487,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // block other optimizations. if (PHI) { AssumptionCache *AC = - IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr; + IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr; auto &DL = Caller->getParent()->getDataLayout(); if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) { PHI->replaceAllUsesWith(V); @@ -2437,5 +2495,5 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } - return true; + return InlineResult::success(); } diff --git a/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/llvm/lib/Transforms/Utils/InstructionNamer.cpp index aac0b55801c46..8e339fe46d457 100644 --- a/llvm/lib/Transforms/Utils/InstructionNamer.cpp +++ b/llvm/lib/Transforms/Utils/InstructionNamer.cpp @@ -42,7 +42,7 @@ namespace { for (Instruction &I : BB) if (!I.hasName() && !I.getType()->isVoidTy()) - I.setName("tmp"); + I.setName("i"); } return true; } diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index 5746d69260d50..b1a1c564d2171 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -76,7 +76,7 @@ static bool isExitBlock(BasicBlock *BB, /// that are outside the current loop. If so, insert LCSSA PHI nodes and /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, - DominatorTree &DT, LoopInfo &LI, + const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE) { SmallVector<Use *, 16> UsesToRewrite; SmallSetVector<PHINode *, 16> PHIsToRemove; @@ -128,7 +128,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, if (auto *Inv = dyn_cast<InvokeInst>(I)) DomBB = Inv->getNormalDest(); - DomTreeNode *DomNode = DT.getNode(DomBB); + const DomTreeNode *DomNode = DT.getNode(DomBB); SmallVector<PHINode *, 16> AddedPHIs; SmallVector<PHINode *, 8> PostProcessPHIs; @@ -274,7 +274,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // Compute the set of BasicBlocks in the loop `L` dominating at least one exit. 
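The LCSSA helpers in this hunk now accept the dominator tree and loop info as const, so callers that only hold const analysis results can rebuild LCSSA form. A tiny usage sketch, assuming the declarations sit in llvm/Transforms/Utils/LoopUtils.h as in this era of the tree; the wrapper name is illustrative:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Re-establish LCSSA for a whole loop nest after a transform, using only
// const views of the analyses.
static bool restoreLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo &LI,
                         ScalarEvolution *SE) {
  // Walks the nest inner-most out and inserts the PHIs required in each
  // exit block.
  return formLCSSARecursively(L, DT, &LI, SE);
}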
static void computeBlocksDominatingExits( - Loop &L, DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks, + Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks, SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) { SmallVector<BasicBlock *, 8> BBWorklist; @@ -318,7 +318,7 @@ static void computeBlocksDominatingExits( } } -bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, +bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE) { bool Changed = false; @@ -383,8 +383,8 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, } /// Process a loop nest depth first. -bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE) { +bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT, + const LoopInfo *LI, ScalarEvolution *SE) { bool Changed = false; // Recurse depth-first through inner loops. @@ -396,7 +396,7 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, } /// Process all loops in the function, inner-most out. -static bool formLCSSAOnAllLoops(LoopInfo *LI, DominatorTree &DT, +static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT, ScalarEvolution *SE) { bool Changed = false; for (auto &L : *LI) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index b2d511c7c9a97..da40c342af3ac 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/EHPersonalities.h" @@ -40,7 +41,6 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" @@ -75,6 +75,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <cassert> @@ -402,15 +403,29 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, II->getIntrinsicID() == Intrinsic::launder_invariant_group) return true; - // Lifetime intrinsics are dead when their right-hand is undef. - if (II->isLifetimeStartOrEnd()) - return isa<UndefValue>(II->getArgOperand(1)); + if (II->isLifetimeStartOrEnd()) { + auto *Arg = II->getArgOperand(1); + // Lifetime intrinsics are dead when their right-hand is undef. + if (isa<UndefValue>(Arg)) + return true; + // If the right-hand is an alloc, global, or argument and the only uses + // are lifetime intrinsics then the intrinsics are dead. + if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg)) + return llvm::all_of(Arg->uses(), [](Use &Use) { + if (IntrinsicInst *IntrinsicUse = + dyn_cast<IntrinsicInst>(Use.getUser())) + return IntrinsicUse->isLifetimeStartOrEnd(); + return false; + }); + return false; + } // Assumptions are dead if their condition is trivially true. Guards on // true are operationally no-ops. In the future we can consider more // sophisticated tradeoffs for guards considering potential for check // widening, but for now we keep things simple. 
- if (II->getIntrinsicID() == Intrinsic::assume || + if ((II->getIntrinsicID() == Intrinsic::assume && + isAssumeWithEmptyBundle(*II)) || II->getIntrinsicID() == Intrinsic::experimental_guard) { if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0))) return !Cond->isZero(); @@ -443,29 +458,49 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructions( if (!I || !isInstructionTriviallyDead(I, TLI)) return false; - SmallVector<Instruction*, 16> DeadInsts; + SmallVector<WeakTrackingVH, 16> DeadInsts; DeadInsts.push_back(I); RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU); return true; } +bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive( + SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI, + MemorySSAUpdater *MSSAU) { + unsigned S = 0, E = DeadInsts.size(), Alive = 0; + for (; S != E; ++S) { + auto *I = cast<Instruction>(DeadInsts[S]); + if (!isInstructionTriviallyDead(I)) { + DeadInsts[S] = nullptr; + ++Alive; + } + } + if (Alive == E) + return false; + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU); + return true; +} + void llvm::RecursivelyDeleteTriviallyDeadInstructions( - SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI, + SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAU) { // Process the dead instruction list until empty. while (!DeadInsts.empty()) { - Instruction &I = *DeadInsts.pop_back_val(); - assert(I.use_empty() && "Instructions with uses are not dead."); - assert(isInstructionTriviallyDead(&I, TLI) && + Value *V = DeadInsts.pop_back_val(); + Instruction *I = cast_or_null<Instruction>(V); + if (!I) + continue; + assert(isInstructionTriviallyDead(I, TLI) && "Live instruction found in dead worklist!"); + assert(I->use_empty() && "Instructions with uses are not dead."); // Don't lose the debug info while deleting the instructions. - salvageDebugInfo(I); + salvageDebugInfo(*I); // Null out all of the instruction's operands to see if any operand becomes // dead as we go. - for (Use &OpU : I.operands()) { + for (Use &OpU : I->operands()) { Value *OpV = OpU.get(); OpU.set(nullptr); @@ -480,9 +515,9 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions( DeadInsts.push_back(OpI); } if (MSSAU) - MSSAU->removeMemoryAccess(&I); + MSSAU->removeMemoryAccess(I); - I.eraseFromParent(); + I->eraseFromParent(); } } @@ -521,19 +556,20 @@ static bool areAllUsesEqual(Instruction *I) { /// delete it. If that makes any of its operands trivially dead, delete them /// too, recursively. Return true if a change was made. bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + llvm::MemorySSAUpdater *MSSAU) { SmallPtrSet<Instruction*, 4> Visited; for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects(); I = cast<Instruction>(*I->user_begin())) { if (I->use_empty()) - return RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU); // If we find an instruction more than once, we're on a cycle that // won't prove fruitful. if (!Visited.insert(I).second) { // Break the cycle and delete the instruction and its operands. 
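The dead-instruction worklist now holds WeakTrackingVH instead of raw Instruction pointers, and the new RecursivelyDeleteTriviallyDeadInstructionsPermissive variant tolerates entries that were already erased (their handles go null) or that turn out not to be trivially dead. A short usage sketch against the LLVM 11-era declarations in llvm/Transforms/Utils/Local.h; the collection loop and function name are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Queue every candidate and let the utility delete them recursively. The
// WeakTrackingVH entries are nulled out if an instruction is erased as a
// side effect of deleting another entry.
static bool dropDeadCandidates(Function &F, const TargetLibraryInfo *TLI) {
  SmallVector<WeakTrackingVH, 16> DeadInsts;
  for (Instruction &I : instructions(F))
    if (isInstructionTriviallyDead(&I, TLI))
      DeadInsts.push_back(&I);
  // The permissive variant skips entries that are null or no longer
  // trivially dead instead of asserting on them.
  return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI);
}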
I->replaceAllUsesWith(UndefValue::get(I->getType())); - (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU); return true; } } @@ -1132,9 +1168,8 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// often possible though. If alignment is important, a more reliable approach /// is to simply align all global variables and allocation instructions to /// their preferred alignment from the beginning. -static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, - unsigned PrefAlign, - const DataLayout &DL) { +static Align enforceKnownAlignment(Value *V, Align Alignment, Align PrefAlign, + const DataLayout &DL) { assert(PrefAlign > Alignment); V = V->stripPointerCasts(); @@ -1146,21 +1181,21 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, // stripPointerCasts recurses through infinite layers of bitcasts, // while computeKnownBits is not allowed to traverse more than 6 // levels. - Alignment = std::max(AI->getAlignment(), Alignment); + Alignment = std::max(AI->getAlign(), Alignment); if (PrefAlign <= Alignment) return Alignment; // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. - if (DL.exceedsNaturalStackAlignment(Align(PrefAlign))) + if (DL.exceedsNaturalStackAlignment(PrefAlign)) return Alignment; - AI->setAlignment(MaybeAlign(PrefAlign)); + AI->setAlignment(PrefAlign); return PrefAlign; } if (auto *GO = dyn_cast<GlobalObject>(V)) { // TODO: as above, this shouldn't be necessary. - Alignment = std::max(GO->getAlignment(), Alignment); + Alignment = max(GO->getAlign(), Alignment); if (PrefAlign <= Alignment) return Alignment; @@ -1171,18 +1206,18 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, if (!GO->canIncreaseAlignment()) return Alignment; - GO->setAlignment(MaybeAlign(PrefAlign)); + GO->setAlignment(PrefAlign); return PrefAlign; } return Alignment; } -unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, - const DataLayout &DL, - const Instruction *CxtI, - AssumptionCache *AC, - const DominatorTree *DT) { +Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, + const DataLayout &DL, + const Instruction *CxtI, + AssumptionCache *AC, + const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); @@ -1191,42 +1226,22 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. - TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); - - unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ); + // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent). + TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent); - // LLVM doesn't support alignments larger than this currently. - Align = std::min(Align, +Value::MaximumAlignment); + Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); - if (PrefAlign > Align) - Align = enforceKnownAlignment(V, Align, PrefAlign, DL); + if (PrefAlign && *PrefAlign > Alignment) + Alignment = enforceKnownAlignment(V, Alignment, *PrefAlign, DL); // We don't need to make any adjustment. 
- return Align; + return Alignment; } ///===---------------------------------------------------------------------===// /// Dbg Intrinsic utilities /// -/// See if there is a dbg.value intrinsic for DIVar before I. -static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, - Instruction *I) { - // Since we can't guarantee that the original dbg.declare instrinsic - // is removed by LowerDbgDeclare(), we need to make sure that we are - // not inserting the same dbg.value intrinsic over and over. - BasicBlock::InstListType::iterator PrevI(I); - if (PrevI != I->getParent()->getInstList().begin()) { - --PrevI; - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI)) - if (DVI->getValue() == I->getOperand(0) && - DVI->getVariable() == DIVar && - DVI->getExpression() == DIExpr) - return true; - } - return false; -} - /// See if there is a dbg.value intrinsic for DIVar for the PHI node. static bool PhiHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, @@ -1303,13 +1318,11 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, // know which part) we insert an dbg.value instrinsic to indicate that we // know nothing about the variable's content. DV = UndefValue::get(DV->getType()); - if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); + Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); return; } - if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); + Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); } /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value @@ -1320,9 +1333,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, auto *DIExpr = DII->getExpression(); assert(DIVar && "Missing variable"); - if (LdStHasDebugValue(DIVar, DIExpr, LI)) - return; - if (!valueCoversEntireFragment(LI->getType(), DII)) { // FIXME: If only referring to a part of the variable described by the // dbg.declare, then we want to insert a dbg.value for the corresponding @@ -1389,6 +1399,7 @@ static bool isStructure(AllocaInst *AI) { /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool llvm::LowerDbgDeclare(Function &F) { + bool Changed = false; DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<DbgDeclareInst *, 4> Dbgs; for (auto &FI : F) @@ -1397,7 +1408,7 @@ bool llvm::LowerDbgDeclare(Function &F) { Dbgs.push_back(DDI); if (Dbgs.empty()) - return false; + return Changed; for (auto &I : Dbgs) { DbgDeclareInst *DDI = I; @@ -1450,8 +1461,14 @@ bool llvm::LowerDbgDeclare(Function &F) { } } DDI->eraseFromParent(); + Changed = true; } - return true; + + if (Changed) + for (BasicBlock &BB : F) + RemoveRedundantDbgInstrs(&BB); + + return Changed; } /// Propagate dbg.value intrinsics through the newly inserted PHIs. @@ -1521,6 +1538,14 @@ TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) { return Declares; } +TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) { + TinyPtrVector<DbgDeclareInst *> DDIs; + for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V)) + if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI)) + DDIs.push_back(DDI); + return DDIs; +} + void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) { // This function is hot. Check whether the value has any metadata to avoid a // DenseMap lookup. 
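A small usage sketch for the FindDbgDeclareUses helper added above; the wrapper function is hypothetical, and the declaration is assumed to sit next to FindDbgAddrUses in Transforms/Utils/Local.h. The helper narrows the address-use query to just the llvm.dbg.declare users, leaving any llvm.dbg.addr users untouched.

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Erase only the dbg.declare intrinsics that describe Address.
static void eraseDbgDeclaresFor(Value *Address) {
  for (DbgDeclareInst *DDI : FindDbgDeclareUses(Address))
    DDI->eraseFromParent();
}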
@@ -1547,8 +1572,8 @@ void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, } bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, - Instruction *InsertBefore, DIBuilder &Builder, - uint8_t DIExprFlags, int Offset) { + DIBuilder &Builder, uint8_t DIExprFlags, + int Offset) { auto DbgAddrs = FindDbgAddrUses(Address); for (DbgVariableIntrinsic *DII : DbgAddrs) { DebugLoc Loc = DII->getDebugLoc(); @@ -1556,23 +1581,14 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, auto *DIExpr = DII->getExpression(); assert(DIVar && "Missing variable"); DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset); - // Insert llvm.dbg.declare immediately before InsertBefore, and remove old + // Insert llvm.dbg.declare immediately before DII, and remove old // llvm.dbg.declare. - Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); - if (DII == InsertBefore) - InsertBefore = InsertBefore->getNextNode(); + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII); DII->eraseFromParent(); } return !DbgAddrs.empty(); } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, uint8_t DIExprFlags, - int Offset) { - return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, - DIExprFlags, Offset); -} - static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress, DIBuilder &Builder, int Offset) { DebugLoc Loc = DVI->getDebugLoc(); @@ -1612,23 +1628,18 @@ static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) { return MetadataAsValue::get(C, ValueAsMetadata::get(V)); } -bool llvm::salvageDebugInfo(Instruction &I) { +/// Where possible to salvage debug information for \p I do so +/// and return True. If not possible mark undef and return False. +void llvm::salvageDebugInfo(Instruction &I) { SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; findDbgUsers(DbgUsers, &I); - if (DbgUsers.empty()) - return false; - - return salvageDebugInfoForDbgValues(I, DbgUsers); -} - -void llvm::salvageDebugInfoOrMarkUndef(Instruction &I) { - if (!salvageDebugInfo(I)) - replaceDbgUsesWithUndef(&I); + salvageDebugInfoForDbgValues(I, DbgUsers); } -bool llvm::salvageDebugInfoForDbgValues( +void llvm::salvageDebugInfoForDbgValues( Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) { auto &Ctx = I.getContext(); + bool Salvaged = false; auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); }; for (auto *DII : DbgUsers) { @@ -1643,14 +1654,22 @@ bool llvm::salvageDebugInfoForDbgValues( // salvageDebugInfoImpl should fail on examining the first element of // DbgUsers, or none of them. if (!DIExpr) - return false; + break; DII->setOperand(0, wrapMD(I.getOperand(0))); DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr)); LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n'); + Salvaged = true; } - return true; + if (Salvaged) + return; + + for (auto *DII : DbgUsers) { + Value *Undef = UndefValue::get(I.getType()); + DII->setOperand(0, MetadataAsValue::get(DII->getContext(), + ValueAsMetadata::get(Undef))); + } } DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, @@ -1682,13 +1701,14 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, }; if (auto *CI = dyn_cast<CastInst>(&I)) { - // No-op casts and zexts are irrelevant for debug info. - if (CI->isNoopCast(DL) || isa<ZExtInst>(&I)) + // No-op casts are irrelevant for debug info. 
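Because salvageDebugInfo now folds in the old salvageDebugInfoOrMarkUndef behaviour, a pass that is about to delete an instruction needs only the one call: debug users that cannot be salvaged are rewritten to undef. A minimal sketch, assuming the caller has already decided the instruction is dead; the helper name is illustrative.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Salvage (or undef) the debug users first, then drop the instruction itself.
static void eraseAndPreserveDebugInfo(Instruction *I) {
  salvageDebugInfo(*I);
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}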
+ if (CI->isNoopCast(DL)) return SrcDIExpr; Type *Type = CI->getType(); - // Casts other than Trunc or SExt to scalar types cannot be salvaged. - if (Type->isVectorTy() || (!isa<TruncInst>(&I) && !isa<SExtInst>(&I))) + // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged. + if (Type->isVectorTy() || + !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I))) return nullptr; Value *FromValue = CI->getOperand(0); @@ -1805,7 +1825,7 @@ static bool rewriteDebugUsers( if (!UndefOrSalvage.empty()) { // Try to salvage the remaining debug users. - salvageDebugInfoOrMarkUndef(From); + salvageDebugInfo(From); Changed = true; } @@ -1960,11 +1980,23 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { SmallVector<OperandBundleDef, 1> OpBundles; II->getOperandBundlesAsDefs(OpBundles); CallInst *NewCall = CallInst::Create(II->getFunctionType(), - II->getCalledValue(), Args, OpBundles); + II->getCalledOperand(), Args, OpBundles); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); NewCall->copyMetadata(*II); + + // If the invoke had profile metadata, try converting them for CallInst. + uint64_t TotalWeight; + if (NewCall->extractProfTotalWeight(TotalWeight)) { + // Set the total weight if it fits into i32, otherwise reset. + MDBuilder MDB(NewCall->getContext()); + auto NewWeights = uint32_t(TotalWeight) != TotalWeight + ? nullptr + : MDB.createBranchWeights({uint32_t(TotalWeight)}); + NewCall->setMetadata(LLVMContext::MD_prof, NewWeights); + } + return NewCall; } @@ -2011,7 +2043,7 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, // as of this time. InvokeInst *II = - InvokeInst::Create(CI->getFunctionType(), CI->getCalledValue(), Split, + InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split, UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB); II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); @@ -2042,7 +2074,7 @@ static bool markAliveBlocks(Function &F, // canonicalizes unreachable insts into stores to null or undef. for (Instruction &I : *BB) { if (auto *CI = dyn_cast<CallInst>(&I)) { - Value *Callee = CI->getCalledValue(); + Value *Callee = CI->getCalledOperand(); // Handle intrinsic calls. if (Function *F = dyn_cast<Function>(Callee)) { auto IntrinsicID = F->getIntrinsicID(); @@ -2117,7 +2149,7 @@ static bool markAliveBlocks(Function &F, Instruction *Terminator = BB->getTerminator(); if (auto *II = dyn_cast<InvokeInst>(Terminator)) { // Turn invokes that call 'nounwind' functions into ordinary calls. - Value *Callee = II->getCalledValue(); + Value *Callee = II->getCalledOperand(); if ((isa<ConstantPointerNull>(Callee) && !NullPointerIsDefined(BB->getParent())) || isa<UndefValue>(Callee)) { @@ -2243,7 +2275,7 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, SmallSetVector<BasicBlock *, 8> DeadBlockSet; for (BasicBlock &BB : F) { // Skip reachable basic blocks - if (Reachable.find(&BB) != Reachable.end()) + if (Reachable.count(&BB)) continue; DeadBlockSet.insert(&BB); } @@ -2548,7 +2580,7 @@ bool llvm::callsGCLeafFunction(const CallBase *Call, // marked as 'gc-leaf-function.' All available Libcalls are // GC-leaf. 
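With ImmutableCallSite on its way out, library-function queries go through the CallBase overload of TargetLibraryInfo::getLibFunc, as in the callsGCLeafFunction change immediately below. A short sketch; the wrapper function is illustrative.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// True when Call targets a library function that TLI both recognizes and
// considers available on this target.
static bool callsAvailableLibFunc(const CallBase &Call,
                                  const TargetLibraryInfo &TLI) {
  LibFunc LF;
  return TLI.getLibFunc(Call, LF) && TLI.has(LF);
}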
LibFunc LF; - if (TLI.getLibFunc(ImmutableCallSite(Call), LF)) { + if (TLI.getLibFunc(*Call, LF)) { return TLI.has(LF); } @@ -2928,21 +2960,40 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { default: return true; case Instruction::Call: - case Instruction::Invoke: + case Instruction::Invoke: { + const auto &CB = cast<CallBase>(*I); + // Can't handle inline asm. Skip it. - if (isa<InlineAsm>(ImmutableCallSite(I).getCalledValue())) - return false; - // Many arithmetic intrinsics have no issue taking a - // variable, however it's hard to distingish these from - // specials such as @llvm.frameaddress that require a constant. - if (isa<IntrinsicInst>(I)) + if (CB.isInlineAsm()) return false; // Constant bundle operands may need to retain their constant-ness for // correctness. - if (ImmutableCallSite(I).isBundleOperand(OpIdx)) + if (CB.isBundleOperand(OpIdx)) return false; - return true; + + if (OpIdx < CB.getNumArgOperands()) { + // Some variadic intrinsics require constants in the variadic arguments, + // which currently aren't markable as immarg. + if (isa<IntrinsicInst>(CB) && + OpIdx >= CB.getFunctionType()->getNumParams()) { + // This is known to be OK for stackmap. + return CB.getIntrinsicID() == Intrinsic::experimental_stackmap; + } + + // gcroot is a special case, since it requires a constant argument which + // isn't also required to be a simple ConstantInt. + if (CB.getIntrinsicID() == Intrinsic::gcroot) + return false; + + // Some intrinsic operands are required to be immediates. + return !CB.paramHasAttr(OpIdx, Attribute::ImmArg); + } + + // It is never allowed to replace the call argument to an intrinsic, but it + // may be possible for a call. + return !isa<IntrinsicInst>(CB); + } case Instruction::ShuffleVector: // Shufflevector masks are constant. 
return OpIdx != 2; @@ -3006,3 +3057,37 @@ AllocaInst *llvm::findAllocaForValue(Value *V, AllocaForValue[V] = Res; return Res; } + +Value *llvm::invertCondition(Value *Condition) { + // First: Check if it's a constant + if (Constant *C = dyn_cast<Constant>(Condition)) + return ConstantExpr::getNot(C); + + // Second: If the condition is already inverted, return the original value + Value *NotCondition; + if (match(Condition, m_Not(m_Value(NotCondition)))) + return NotCondition; + + BasicBlock *Parent = nullptr; + Instruction *Inst = dyn_cast<Instruction>(Condition); + if (Inst) + Parent = Inst->getParent(); + else if (Argument *Arg = dyn_cast<Argument>(Condition)) + Parent = &Arg->getParent()->getEntryBlock(); + assert(Parent && "Unsupported condition to invert"); + + // Third: Check all the users for an invert + for (User *U : Condition->users()) + if (Instruction *I = dyn_cast<Instruction>(U)) + if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) + return I; + + // Last option: Create a new instruction + auto *Inverted = + BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv"); + if (Inst && !isa<PHINode>(Inst)) + Inverted->insertAfter(Inst); + else + Inverted->insertBefore(&*Parent->getFirstInsertionPt()); + return Inverted; +} diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index c065e0269c64a..8804bba975b6a 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -46,6 +46,11 @@ using namespace llvm; STATISTIC(NumRotated, "Number of loops rotated"); +static cl::opt<bool> + MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, + cl::desc("Allow loop rotation multiple times in order to reach " + "a better latch exit")); + namespace { /// A simple loop rotation transformation. class LoopRotate { @@ -177,14 +182,16 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -// Look for a phi which is only used outside the loop (via a LCSSA phi) -// in the exit from the header. This means that rotating the loop can -// remove the phi. -static bool shouldRotateLoopExitingLatch(Loop *L) { +// Assuming both header and latch are exiting, look for a phi which is only +// used outside the loop (via a LCSSA phi) in the exit from the header. +// This means that rotating the loop can remove the phi. +static bool profitableToRotateLoopExitingLatch(Loop *L) { BasicBlock *Header = L->getHeader(); - BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0); + BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator()); + assert(BI && BI->isConditional() && "need header with conditional exit"); + BasicBlock *HeaderExit = BI->getSuccessor(0); if (L->contains(HeaderExit)) - HeaderExit = Header->getTerminator()->getSuccessor(1); + HeaderExit = BI->getSuccessor(1); for (auto &Phi : Header->phis()) { // Look for uses of this phi in the loop/via exits other than the header. @@ -194,7 +201,50 @@ static bool shouldRotateLoopExitingLatch(Loop *L) { continue; return true; } + return false; +} + +// Check that latch exit is deoptimizing (which means - very unlikely to happen) +// and there is another exit from the loop which is non-deoptimizing. +// If we rotate latch to that exit our loop has a better chance of being fully +// canonical. +// +// It can give false positives in some rare cases. 
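The check that follows leans on BasicBlock::getPostdominatingDeoptimizeCall(), a conservative query for whether every path out of a block ends in an @llvm.experimental.deoptimize call. A minimal illustration of that query; the wrapper name is an assumption.

#include "llvm/IR/BasicBlock.h"

using namespace llvm;

// Treat an exit block as "deoptimizing" when a deoptimize call post-dominates
// it; a null result means the block may also reach ordinary code, so the query
// can only err toward reporting "not deoptimizing".
static bool isDeoptimizingExit(const BasicBlock *Exit) {
  return Exit->getPostdominatingDeoptimizeCall() != nullptr;
}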
+static bool canRotateDeoptimizingLatchExit(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "need latch"); + BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); + // Need normal exiting latch. + if (!BI || !BI->isConditional()) + return false; + + BasicBlock *Exit = BI->getSuccessor(1); + if (L->contains(Exit)) + Exit = BI->getSuccessor(0); + // Latch exit is non-deoptimizing, no need to rotate. + if (!Exit->getPostdominatingDeoptimizeCall()) + return false; + + SmallVector<BasicBlock *, 4> Exits; + L->getUniqueExitBlocks(Exits); + if (!Exits.empty()) { + // There is at least one non-deoptimizing exit. + // + // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, + // as it can conservatively return false for deoptimizing exits with + // complex enough control flow down to deoptimize call. + // + // That means here we can report success for a case where + // all exits are deoptimizing but one of them has complex enough + // control flow (e.g. with loops). + // + // That should be a very rare case and false positives for this function + // have compile-time effect only. + return any_of(Exits, [](const BasicBlock *BB) { + return !BB->getPostdominatingDeoptimizeCall(); + }); + } return false; } @@ -208,319 +258,342 @@ static bool shouldRotateLoopExitingLatch(Loop *L) { /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. +/// +/// If -loop-rotate-multi is enabled we can do multiple rotations in one go +/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return false; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) - return false; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) - return false; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !shouldRotateLoopExitingLatch(L)) - return false; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. 
- { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return false; - } - if (Metrics.convergent) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return false; + bool Rotated = false; + do { + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L) && + !canRotateDeoptimizingLatchExit(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); + return Rotated; + } + if (Metrics.convergent) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); + return Rotated; + } + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + return Rotated; + } } - if (Metrics.NumInsts > MaxHeaderSize) - return false; - } - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) + SE->forgetTopmostLoop(L); + + LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap, ValueMapMSSA; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; + auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) - return false; + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !Inst->isTerminator() && + !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) - SE->forgetTopmostLoop(L); + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. 
Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - if (L->contains(Exit)) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. - using DbgIntrinsicHash = - std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; - auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash { - return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; - }; - SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; - for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); - I != E; ++I) { - if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I)) - DbgIntrinsics.insert(makeHash(DII)); - else - break; - } + // Avoid inserting the same intrinsic twice. + if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C)) + if (DbgIntrinsics.count(makeHash(DII))) { + C->deleteValue(); + continue; + } - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) { - Inst->moveBefore(LoopEntryBranch); - continue; + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->deleteValue(); + C = nullptr; + } + } else { + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! 
+ C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + + if (auto *II = dyn_cast<IntrinsicInst>(C)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } } - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + for (BasicBlock *SuccBB : successors(OrigHeader)) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + + // Update MemorySSA before the rewrite call below changes the 1:1 + // instruction:cloned_instruction_or_value mapping. + if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - // Avoid inserting the same intrinsic twice. - if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C)) - if (DbgIntrinsics.count(makeHash(DII))) { - C->deleteValue(); - continue; + SmallVector<PHINode*, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, + &InsertedPHIs); + + // Attach dbg.value intrinsics to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates; + Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); + Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); + Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); + DT->applyUpdates(Updates); + + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); } + } - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - Value *V = SimplifyInstruction(C, SQ); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. 
- InsertNewValueIntoMap(ValueMap, Inst, V); - if (!C->mayHaveSideEffects()) { - C->deleteValue(); - C = nullptr; + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa<ConstantInt>(PHBI->getCondition()) || + PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != + NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. + Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + ExitPred->getTerminator()->isIndirectTerminator()) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); } else { - InsertNewValueIntoMap(ValueMap, Inst, C); - } - if (C) { - // Otherwise, stick the new instruction into the new block! - C->setName(Inst->getName()); - C->insertBefore(LoopEntryBranch); - - if (auto *II = dyn_cast<IntrinsicInst>(C)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); - // MemorySSA cares whether the cloned instruction was inserted or not, and - // not whether it can be remapped to a simplified value. + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. if (MSSAU) - InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + MSSAU->removeEdge(OrigPreheader, Exit); } - } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. 
- for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, - &InsertedPHIs); - - // Attach dbg.value intrinsics to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates; - Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); - Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); - Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); - DT->applyUpdates(Updates); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } - } + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. 
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - if (!isa<ConstantInt>(PHBI->getCondition()) || - PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != - NewHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. - Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit) || - ExitPred->getTerminator()->isIndirectTerminator()) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); - } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); - - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); - - // Update MSSA too, if available. - if (MSSAU) - MSSAU->removeEdge(OrigPreheader, Exit); - } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + ++NumRotated; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + Rotated = true; + SimplifiedLatch = false; - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // Check that new latch is a deoptimizing exit and then repeat rotation if possible. + // Deoptimizing latch exit is not a generally typical case, so we just loop over. + // TODO: if it becomes a performance bottleneck extend rotation algorithm + // to handle multiple rotations in one go. 
+ } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); - ++NumRotated; return true; } diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 28f88f39a712d..a8445e94e55a0 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -230,6 +230,27 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (!Preheader) return nullptr; + // Treat the presence of convergent functions conservatively. The + // transformation is invalid if calls to certain convergent + // functions (like an AMDGPU barrier) get included in the resulting + // inner loop. But blocks meant for the inner loop will be + // identified later at a point where it's too late to abort the + // transformation. Also, the convergent attribute is not really + // sufficient to express the semantics of functions that are + // affected by this transformation. So we choose to back off if such + // a function call is present until a better alternative becomes + // available. This is similar to the conservative treatment of + // convergent function calls in GVNHoist and JumpThreading. + for (auto BB : L->blocks()) { + for (auto &II : *BB) { + if (auto CI = dyn_cast<CallBase>(&II)) { + if (CI->isConvergent()) { + return nullptr; + } + } + } + } + // The header is not a landing pad; preheader insertion should ensure this. BasicBlock *Header = L->getHeader(); assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); @@ -598,6 +619,7 @@ ReprocessLoop: if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) { PN->replaceAllUsesWith(V); PN->eraseFromParent(); + Changed = true; } } @@ -674,10 +696,8 @@ ReprocessLoop: LI->removeBlock(ExitingBlock); DomTreeNode *Node = DT->getNode(ExitingBlock); - const std::vector<DomTreeNodeBase<BasicBlock> *> &Children = - Node->getChildren(); - while (!Children.empty()) { - DomTreeNode *Child = Children.front(); + while (!Node->isLeaf()) { + DomTreeNode *Child = Node->back(); DT->changeImmediateDominator(Child, Node->getIDom()); } DT->eraseNode(ExitingBlock); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4b94b371e70a9..3875c631f839b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -15,21 +15,46 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include 
"llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GenericDomTree.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -38,6 +63,17 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <algorithm> +#include <assert.h> +#include <type_traits> +#include <vector> + +namespace llvm { +class DataLayout; +class Value; +} // namespace llvm + using namespace llvm; #define DEBUG_TYPE "loop-unroll" @@ -45,8 +81,8 @@ using namespace llvm; // TODO: Should these be here or in LoopUnroll? STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); -STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a " - "conditional latch (completely or otherwise)"); +STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional " + "latch (completely or otherwise)"); static cl::opt<bool> UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden, @@ -63,39 +99,6 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden, #endif ); -/// Convert the instruction operands from referencing the current values into -/// those specified by VMap. -void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) { - for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { - Value *Op = I->getOperand(op); - - // Unwrap arguments of dbg.value intrinsics. - bool Wrapped = false; - if (auto *V = dyn_cast<MetadataAsValue>(Op)) - if (auto *Unwrapped = dyn_cast<ValueAsMetadata>(V->getMetadata())) { - Op = Unwrapped->getValue(); - Wrapped = true; - } - - auto wrap = [&](Value *V) { - auto &C = I->getContext(); - return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V; - }; - - ValueToValueMapTy::iterator It = VMap.find(Op); - if (It != VMap.end()) - I->setOperand(op, wrap(It->second)); - } - - if (PHINode *PN = dyn_cast<PHINode>(I)) { - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i)); - if (It != VMap.end()) - PN->setIncomingBlock(i, cast<BasicBlock>(It->second)); - } - } -} - /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. @@ -199,18 +202,20 @@ static bool isEpilogProfitable(Loop *L) { /// simplify/dce pass of the instructions. void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC) { + AssumptionCache *AC, + const TargetTransformInfo *TTI) { // Simplify any new induction variables in the partially unrolled loop. if (SE && SimplifyIVs) { SmallVector<WeakTrackingVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts); // Aggressively clean up dead instructions that simplifyLoopIVs already // identified. 
Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + while (!DeadInsts.empty()) { + Value *V = DeadInsts.pop_back_val(); + if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) RecursivelyDeleteTriviallyDeadInstructions(Inst); + } } // At this point, the code is well formed. We now do a quick sweep over the @@ -277,6 +282,7 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop) { @@ -298,48 +304,35 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, return LoopUnrollResult::Unmodified; } - // The current loop unroll pass can unroll loops with a single latch or header - // that's a conditional branch exiting the loop. + // The current loop unroll pass can unroll loops that have + // (1) single latch; and + // (2a) latch is unconditional; or + // (2b) latch is conditional and is an exiting block // FIXME: The implementation can be extended to work with more complicated // cases, e.g. loops with multiple latches. BasicBlock *Header = L->getHeader(); - BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator()); - BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); - - // FIXME: Support loops without conditional latch and multiple exiting blocks. - if (!BI || - (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() || - L->getExitingBlock() != Header))) { - LLVM_DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional " - "branch in the latch or header.\n"); - return LoopUnrollResult::Unmodified; - } - - auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) { - return BI->isConditional() && BI->getSuccessor(S1) == Header && - !L->contains(BI->getSuccessor(S2)); - }; - - // If we have a conditional latch, it must exit the loop. - if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) && - !CheckLatchSuccessors(1, 0)) { + BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); + + // A conditional branch which exits the loop, which can be optimized to an + // unconditional branch in the unrolled loop in some cases. + BranchInst *ExitingBI = nullptr; + bool LatchIsExiting = L->isLoopExiting(LatchBlock); + if (LatchIsExiting) + ExitingBI = LatchBI; + else if (BasicBlock *ExitingBlock = L->getExitingBlock()) + ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); + if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) { LLVM_DEBUG( dbgs() << "Can't unroll; a conditional latch must exit the loop"); return LoopUnrollResult::Unmodified; } - - auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) { - return HeaderBI && HeaderBI->isConditional() && - L->contains(HeaderBI->getSuccessor(S1)) && - !L->contains(HeaderBI->getSuccessor(S2)); - }; - - // If we do not have a conditional latch, the header must exit the loop. 
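Clients of the unroller now thread TargetTransformInfo through to UnrollLoop (and on to simplifyLoopAfterUnroll / simplifyLoopIVs). A hedged sketch of the updated call shape; the unroll factor, the helper name, and the few UnrollLoopOptions fields set here are placeholders, not values taken from this change.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

// Unroll L by a factor of four, forwarding TTI so post-unroll induction
// variable simplification can consult target information.
static LoopUnrollResult unrollByFour(Loop *L, LoopInfo *LI, ScalarEvolution *SE,
                                     DominatorTree *DT, AssumptionCache *AC,
                                     const TargetTransformInfo *TTI,
                                     OptimizationRemarkEmitter *ORE,
                                     bool PreserveLCSSA) {
  UnrollLoopOptions ULO = {};   // remaining options left at their zero defaults
  ULO.Count = 4;                // unroll factor (placeholder)
  ULO.TripMultiple = 1;         // no known trip multiple
  ULO.AllowRuntime = false;     // no runtime remainder loop in this sketch
  return UnrollLoop(L, ULO, LI, SE, DT, AC, TTI, ORE, PreserveLCSSA,
                    /*RemainderLoop=*/nullptr);
}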
- if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() && - !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) { - LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop"); - return LoopUnrollResult::Unmodified; - } + LLVM_DEBUG({ + if (ExitingBI) + dbgs() << " Exiting Block = " << ExitingBI->getParent()->getName() + << "\n"; + else + dbgs() << " No single exiting block\n"; + }); if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. @@ -421,8 +414,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, bool HasConvergent = false; for (auto &BB : L->blocks()) for (auto &I : *BB) - if (auto CS = CallSite(&I)) - HasConvergent |= CS.isConvergent(); + if (auto *CB = dyn_cast<CallBase>(&I)) + HasConvergent |= CB->isConvergent(); assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) && "Unroll count must divide trip multiple if loop contains a " "convergent operation."); @@ -435,7 +428,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 && !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, + ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, PreserveLCSSA, RemainderLoop)) { if (ULO.Force) RuntimeTripCount = false; @@ -528,16 +521,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, SE->forgetTopmostLoop(L); } - bool ContinueOnTrue; - bool LatchIsExiting = BI->isConditional(); + if (!LatchIsExiting) + ++NumUnrolledNotLatch; + Optional<bool> ContinueOnTrue = None; BasicBlock *LoopExit = nullptr; - if (LatchIsExiting) { - ContinueOnTrue = L->contains(BI->getSuccessor(0)); - LoopExit = BI->getSuccessor(ContinueOnTrue); - } else { - NumUnrolledWithHeader++; - ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0)); - LoopExit = HeaderBI->getSuccessor(ContinueOnTrue); + if (ExitingBI) { + ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0)); + LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue); } // For the first iteration of the loop, we should use the precloned values for @@ -549,20 +539,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } std::vector<BasicBlock *> Headers; - std::vector<BasicBlock *> HeaderSucc; + std::vector<BasicBlock *> ExitingBlocks; + std::vector<BasicBlock *> ExitingSucc; std::vector<BasicBlock *> Latches; Headers.push_back(Header); Latches.push_back(LatchBlock); - - if (!LatchIsExiting) { - auto *Term = cast<BranchInst>(Header->getTerminator()); - if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) { - assert(L->contains(Term->getSuccessor(0))); - HeaderSucc.push_back(Term->getSuccessor(0)); - } else { - assert(L->contains(Term->getSuccessor(1))); - HeaderSucc.push_back(Term->getSuccessor(1)); - } + if (ExitingBI) { + ExitingBlocks.push_back(ExitingBI->getParent()); + ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue))); } // The current on-the-fly SSA update requires blocks to be processed in @@ -600,7 +584,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } for (unsigned It = 1; It != ULO.Count; ++It) { - std::vector<BasicBlock*> NewBlocks; + SmallVector<BasicBlock *, 8> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; NewLoops[L] = L; @@ -654,12 +638,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, 
LoopInfo *LI, if (*BB == LatchBlock) Latches.push_back(New); - // Keep track of the successor of the new header in the current iteration. - for (auto *Pred : predecessors(*BB)) - if (Pred == Header) { - HeaderSucc.push_back(New); - break; - } + // Keep track of the exiting block and its successor block contained in + // the loop for the current iteration. + if (ExitingBI) { + if (*BB == ExitingBlocks[0]) + ExitingBlocks.push_back(New); + if (*BB == ExitingSucc[0]) + ExitingSucc.push_back(New); + } NewBlocks.push_back(New); UnrolledLoopBlocks.push_back(New); @@ -682,9 +668,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } // Remap all instructions in the most recent iteration + remapInstructionsInBlocks(NewBlocks, LastValueMap); for (BasicBlock *NewBlock : NewBlocks) { for (Instruction &I : *NewBlock) { - ::remapInstruction(&I, LastValueMap); if (auto *II = dyn_cast<IntrinsicInst>(&I)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); @@ -710,18 +696,19 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } - auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest, - ArrayRef<BasicBlock *> NextBlocks, - BasicBlock *BlockInLoop, - bool NeedConditional) { + auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop, + bool NeedConditional, Optional<bool> ContinueOnTrue, + bool IsDestLoopExit) { auto *Term = cast<BranchInst>(Src->getTerminator()); if (NeedConditional) { // Update the conditional branch's successor for the following // iteration. - Term->setSuccessor(!ContinueOnTrue, Dest); + assert(ContinueOnTrue.hasValue() && + "Expecting valid ContinueOnTrue when NeedConditional is true"); + Term->setSuccessor(!(*ContinueOnTrue), Dest); } else { // Remove phi operands at this loop exit - if (Dest != LoopExit) { + if (!IsDestLoopExit) { BasicBlock *BB = Src; for (BasicBlock *Succ : successors(BB)) { // Preserve the incoming value from BB if we are jumping to the block @@ -738,29 +725,27 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } }; - // Now that all the basic blocks for the unrolled iterations are in place, - // set up the branches to connect them. - if (LatchIsExiting) { - // Set up latches to branch to the new header in the unrolled iterations or - // the loop exit for the last latch in a fully unrolled loop. - for (unsigned i = 0, e = Latches.size(); i != e; ++i) { - // The branch destination. - unsigned j = (i + 1) % e; - BasicBlock *Dest = Headers[j]; - bool NeedConditional = true; + // Connect latches of the unrolled iterations to the headers of the next + // iteration. If the latch is also the exiting block, the conditional branch + // may have to be preserved. + for (unsigned i = 0, e = Latches.size(); i != e; ++i) { + // The branch destination. + unsigned j = (i + 1) % e; + BasicBlock *Dest = Headers[j]; + bool NeedConditional = LatchIsExiting; - if (RuntimeTripCount && j != 0) { + if (LatchIsExiting) { + if (RuntimeTripCount && j != 0) NeedConditional = false; - } // For a complete unroll, make the last iteration end with a branch // to the exit block. if (CompletelyUnroll) { if (j == 0) Dest = LoopExit; - // If using trip count upper bound to completely unroll, we need to keep - // the conditional branch except the last one because the loop may exit - // after any iteration. 
+ // If using trip count upper bound to completely unroll, we need to + // keep the conditional branch except the last one because the loop + // may exit after any iteration. assert(NeedConditional && "NeedCondition cannot be modified by both complete " "unrolling and runtime unrolling"); @@ -772,16 +757,18 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // unconditional branch for some iterations. NeedConditional = false; } - - setDest(Latches[i], Dest, Headers, Headers[i], NeedConditional); } - } else { - // Setup headers to branch to their new successors in the unrolled - // iterations. - for (unsigned i = 0, e = Headers.size(); i != e; ++i) { + + setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue, + Dest == LoopExit); + } + + if (!LatchIsExiting) { + // If the latch is not exiting, we may be able to simplify the conditional + // branches in the unrolled exiting blocks. + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { // The branch destination. unsigned j = (i + 1) % e; - BasicBlock *Dest = HeaderSucc[i]; bool NeedConditional = true; if (RuntimeTripCount && j != 0) @@ -797,27 +784,19 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // unconditional branch for some iterations. NeedConditional = false; - setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional); + // Conditional branches from non-latch exiting block have successors + // either in the same loop iteration or outside the loop. The branches are + // already correct. + if (NeedConditional) + continue; + setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional, + None, false); } - // Set up latches to branch to the new header in the unrolled iterations or - // the loop exit for the last latch in a fully unrolled loop. - - for (unsigned i = 0, e = Latches.size(); i != e; ++i) { - // The original branch was replicated in each unrolled iteration. - BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); - - // The branch destination. - unsigned j = (i + 1) % e; - BasicBlock *Dest = Headers[j]; - - // When completely unrolling, the last latch becomes unreachable. - if (CompletelyUnroll && j == 0) - new UnreachableInst(Term->getContext(), Term); - else - // Replace the conditional branch with an unconditional one. - BranchInst::Create(Dest, Term); - + // When completely unrolling, the last latch becomes unreachable. + if (CompletelyUnroll) { + BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator()); + new UnreachableInst(Term->getContext(), Term); Term->eraseFromParent(); } } @@ -830,15 +809,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (auto *BB : OriginalLoopBlocks) { auto *BBDomNode = DT->getNode(BB); SmallVector<BasicBlock *, 16> ChildrenToUpdate; - for (auto *ChildDomNode : BBDomNode->getChildren()) { + for (auto *ChildDomNode : BBDomNode->children()) { auto *ChildBB = ChildDomNode->getBlock(); if (!L->contains(ChildBB)) ChildrenToUpdate.push_back(ChildBB); } BasicBlock *NewIDom; - BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header; - auto &TermBlocks = LatchIsExiting ? Latches : Headers; - if (BB == TermBlock) { + if (ExitingBI && BB == ExitingBlocks[0]) { // The latch is special because we emit unconditional branches in // some cases where the original loop contained a conditional branch. 
// Since the latch is always at the bottom of the loop, if the latch @@ -846,13 +823,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // must also be a latch. Specifically, the dominator is the first // latch which ends in a conditional branch, or the last latch if // there is no such latch. - // For loops exiting from the header, we limit the supported loops - // to have a single exiting block. - NewIDom = TermBlocks.back(); - for (BasicBlock *Iter : TermBlocks) { - Instruction *Term = Iter->getTerminator(); + // For loops exiting from non latch exiting block, we limit the + // branch simplification to single exiting block loops. + NewIDom = ExitingBlocks.back(); + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { + Instruction *Term = ExitingBlocks[i]->getTerminator(); if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) { - NewIDom = Iter; + NewIDom = + DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]); break; } } @@ -897,7 +875,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI, - SE, DT, AC); + SE, DT, AC, TTI); NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index f1965934b2d71..dd628f3e7e0ca 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -11,31 +11,54 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopSimplify.h" #include 
"llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <assert.h> +#include <memory> +#include <type_traits> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-unroll-and-jam" @@ -47,17 +70,14 @@ typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; // Partition blocks in an outer/inner loop pair into blocks before and after // the loop -static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, - BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, - DominatorTree *DT) { +static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks, + BasicBlockSet &AftBlocks, DominatorTree &DT) { + Loop *SubLoop = L.getSubLoops()[0]; BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); - for (BasicBlock *BB : L->blocks()) { + for (BasicBlock *BB : L.blocks()) { if (!SubLoop->contains(BB)) { - if (DT->dominates(SubLoopLatch, BB)) + if (DT.dominates(SubLoopLatch, BB)) AftBlocks.insert(BB); else ForeBlocks.insert(BB); @@ -71,14 +91,44 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, if (BB == SubLoopPreHeader) continue; Instruction *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!ForeBlocks.count(TI->getSuccessor(i))) + for (BasicBlock *Succ : successors(TI)) + if (!ForeBlocks.count(Succ)) return false; } return true; } +/// Partition blocks in a loop nest into blocks before and after each inner +/// loop. +static bool partitionOuterLoopBlocks( + Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, + DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, + DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) { + JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); + + for (Loop *L : Root.getLoopsInPreorder()) { + if (L == &JamLoop) + break; + + if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) + return false; + } + + return true; +} + +// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more +// than 2 levels loop. +static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, + BasicBlockSet &ForeBlocks, + BasicBlockSet &SubLoopBlocks, + BasicBlockSet &AftBlocks, + DominatorTree *DT) { + SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); + return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); +} + // Looks at the phi nodes in Header for values coming from Latch. For these // instructions and all their operands calls Visit on them, keeping going for // all the operands in AftBlocks. Returns false if Visit returns false, @@ -169,10 +219,12 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, If EpilogueLoop is non-null, it receives the epilogue loop (if it was necessary to create one and not fully unrolled). 
*/ -LoopUnrollResult llvm::UnrollAndJamLoop( - Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, - bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { +LoopUnrollResult +llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, + unsigned TripMultiple, bool UnrollRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, const TargetTransformInfo *TTI, + OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); @@ -198,7 +250,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, /*UseEpilogRemainder*/ true, UnrollRemainder, /*ForgetAllSCEV*/ false, - LI, SE, DT, AC, true, EpilogueLoop)) { + LI, SE, DT, AC, TTI, true, EpilogueLoop)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " "generated when assuming runtime trip count\n"); return LoopUnrollResult::Unmodified; @@ -284,8 +336,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Move any instructions from fore phi operands from AftBlocks into Fore. moveHeaderPhiOperandsToForeBlocks( - Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(), - AftBlocks); + Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks); // The current on-the-fly SSA update requires blocks to be processed in // reverse postorder so that LastValueMap contains the correct value at each @@ -312,32 +363,32 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Copy all blocks for (unsigned It = 1; It != Count; ++It) { - std::vector<BasicBlock *> NewBlocks; + SmallVector<BasicBlock *, 8> NewBlocks; // Maps Blocks[It] -> Blocks[It-1] DenseMap<Value *, Value *> PrevItValueMap; + SmallDenseMap<const Loop *, Loop *, 4> NewLoops; + NewLoops[L] = L; + NewLoops[SubLoop] = SubLoop; for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); - if (ForeBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); + // Tell LI about New. 
+ addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); + if (ForeBlocks.count(*BB)) { if (*BB == ForeBlocksFirst[0]) ForeBlocksFirst.push_back(New); if (*BB == ForeBlocksLast[0]) ForeBlocksLast.push_back(New); } else if (SubLoopBlocks.count(*BB)) { - SubLoop->addBasicBlockToLoop(New, *LI); - if (*BB == SubLoopBlocksFirst[0]) SubLoopBlocksFirst.push_back(New); if (*BB == SubLoopBlocksLast[0]) SubLoopBlocksLast.push_back(New); } else if (AftBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - if (*BB == AftBlocksFirst[0]) AftBlocksFirst.push_back(New); if (*BB == AftBlocksLast[0]) @@ -379,9 +430,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } // Remap all instructions in the most recent iteration + remapInstructionsInBlocks(NewBlocks, LastValueMap); for (BasicBlock *NewBlock : NewBlocks) { for (Instruction &I : *NewBlock) { - ::remapInstruction(&I, LastValueMap); if (auto *II = dyn_cast<IntrinsicInst>(&I)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); @@ -447,8 +498,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Update ForeBlocks successors and phi nodes BranchInst *ForeTerm = cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); - BasicBlock *Dest = SubLoopBlocksFirst[0]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]); if (CompletelyUnroll) { while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { @@ -465,8 +516,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Remap ForeBlock successors from previous iteration to this BranchInst *ForeTerm = cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); - BasicBlock *Dest = ForeBlocksFirst[It]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, ForeBlocksFirst[It]); } // Subloop successors and phis @@ -495,12 +546,14 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } // Aft blocks successors and phis - BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); + BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); if (CompletelyUnroll) { - BranchInst::Create(LoopExit, Term); - Term->eraseFromParent(); + BranchInst::Create(LoopExit, AftTerm); + AftTerm->eraseFromParent(); } else { - Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit && + "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit"); } updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0], SubLoopBlocksLast.back()); @@ -540,55 +593,48 @@ LoopUnrollResult llvm::UnrollAndJamLoop( MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); - while (!MergeBlocks.empty()) { - BasicBlock *BB = *MergeBlocks.begin(); - BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); - if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) { - BasicBlock *Dest = Term->getSuccessor(0); - BasicBlock *Fold = Dest->getUniquePredecessor(); - if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) { - // Don't remove BB and add Fold as they are the same BB - assert(Fold == BB); - (void)Fold; - MergeBlocks.erase(Dest); - } else - MergeBlocks.erase(BB); - } else - MergeBlocks.erase(BB); - } + + 
MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI); + // Apply updates to the DomTree. DT = &DTU.getDomTree(); // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. - simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC); - simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC); + simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI); + simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC, + TTI); NumCompletelyUnrolledAndJammed += CompletelyUnroll; ++NumUnrolledAndJammed; + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->erase(L); + #ifndef NDEBUG // We shouldn't have done anything to break loop simplify form or LCSSA. - Loop *OuterL = L->getParentLoop(); - Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop); + Loop *OutestLoop = SubLoop->getParentLoop() + ? SubLoop->getParentLoop()->getParentLoop() + ? SubLoop->getParentLoop()->getParentLoop() + : SubLoop->getParentLoop() + : SubLoop; + assert(DT->verify()); + LI->verify(*DT); assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); if (!CompletelyUnroll) assert(L->isLoopSimplifyForm()); assert(SubLoop->isLoopSimplifyForm()); - assert(DT->verify()); + SE->verify(); #endif - // Update LoopInfo if the loop is completely removed. - if (CompletelyUnroll) - LI->erase(L); - return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled : LoopUnrollResult::PartiallyUnrolled; } static bool getLoadsAndStores(BasicBlockSet &Blocks, - SmallVector<Value *, 4> &MemInstr) { + SmallVector<Instruction *, 4> &MemInstr) { // Scan the BBs and collect legal loads and stores. // Returns false if non-simple loads/stores are found. for (BasicBlock *BB : Blocks) { @@ -609,97 +655,235 @@ static bool getLoadsAndStores(BasicBlockSet &Blocks, return true; } -static bool checkDependencies(SmallVector<Value *, 4> &Earlier, - SmallVector<Value *, 4> &Later, - unsigned LoopDepth, bool InnerLoop, - DependenceInfo &DI) { - // Use DA to check for dependencies between loads and stores that make unroll - // and jam invalid - for (Value *I : Earlier) { - for (Value *J : Later) { - Instruction *Src = cast<Instruction>(I); - Instruction *Dst = cast<Instruction>(J); - if (Src == Dst) - continue; - // Ignore Input dependencies. - if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) - continue; - - // Track dependencies, and if we find them take a conservative approach - // by allowing only = or < (not >), altough some > would be safe - // (depending upon unroll width). - // For the inner loop, we need to disallow any (> <) dependencies - // FIXME: Allow > so long as distance is less than unroll width - if (auto D = DI.depends(Src, Dst, true)) { - assert(D->isOrdered() && "Expected an output, flow or anti dep."); - - if (D->isConfused()) { - LLVM_DEBUG(dbgs() << " Confused dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); +static bool preservesForwardDependence(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, Dependence *D) { + // UnrollLevel might carry the dependency Src --> Dst + // Does a different loop after unrolling? 
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; + ++CurLoopDepth) { + auto JammedDir = D->getDirection(CurLoopDepth); + if (JammedDir == Dependence::DVEntry::LT) + return true; + + if (JammedDir & Dependence::DVEntry::GT) + return false; + } + + return true; +} + +static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, Dependence *D) { + // UnrollLevel might carry the dependency Dst --> Src + for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; + ++CurLoopDepth) { + auto JammedDir = D->getDirection(CurLoopDepth); + if (JammedDir == Dependence::DVEntry::GT) + return true; + + if (JammedDir & Dependence::DVEntry::LT) + return false; + } + + // Backward dependencies are only preserved if not interleaved. + return Sequentialized; +} + +// Check whether it is semantically safe Src and Dst considering any potential +// dependency between them. +// +// @param UnrollLevel The level of the loop being unrolled +// @param JamLevel The level of the loop being jammed; if Src and Dst are on +// different levels, the outermost common loop counts as jammed level +// +// @return true if is safe and false if there is a dependency violation. +static bool checkDependency(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, DependenceInfo &DI) { + assert(UnrollLevel <= JamLevel && + "Expecting JamLevel to be at least UnrollLevel"); + + if (Src == Dst) + return true; + // Ignore Input dependencies. + if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) + return true; + + // Check whether unroll-and-jam may violate a dependency. + // By construction, every dependency will be lexicographically non-negative + // (if it was, it would violate the current execution order), such as + // (0,0,>,*,*) + // Unroll-and-jam changes the GT execution of two executions to the same + // iteration of the chosen unroll level. That is, a GT dependence becomes a GE + // dependence (or EQ, if we fully unrolled the loop) at the loop's position: + // (0,0,>=,*,*) + // Now, the dependency is not necessarily non-negative anymore, i.e. + // unroll-and-jam may violate correctness. + std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true); + if (!D) + return true; + assert(D->isOrdered() && "Expected an output, flow or anti dep."); + + if (D->isConfused()) { + LLVM_DEBUG(dbgs() << " Confused dependency between:\n" + << " " << *Src << "\n" + << " " << *Dst << "\n"); + return false; + } + + // If outer levels (levels enclosing the loop being unroll-and-jammed) have a + // non-equal direction, then the locations accessed in the inner levels cannot + // overlap in memory. We assumes the indexes never overlap into neighboring + // dimensions. + for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth) + if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ)) + return true; + + auto UnrollDirection = D->getDirection(UnrollLevel); + + // If the distance carried by the unrolled loop is 0, then after unrolling + // that distance will become non-zero resulting in non-overlapping accesses in + // the inner loops. 
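// A concrete, made-up example of a pair these checks reject:
//   for (int i = 1; i < N; ++i)
//     for (int j = 0; j + 1 < M; ++j)
//       A[i][j] = A[i - 1][j + 1];
// The store to A[i][j] feeds the load of A[i - 1][j + 1] one i-iteration
// later, giving distance (1, -1), i.e. directions (<, >) for the unrolled and
// jammed levels. Jamming two i-iterations would let the i + 1 copy read
// A[i][j + 1] before the i copy has stored it, so the checks below return
// false for such a pair.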
+ if (UnrollDirection == Dependence::DVEntry::EQ) + return true; + + if (UnrollDirection & Dependence::DVEntry::LT && + !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel, + Sequentialized, D.get())) + return false; + + if (UnrollDirection & Dependence::DVEntry::GT && + !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel, + Sequentialized, D.get())) + return false; + + return true; +} + +static bool +checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks, + const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, + const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, + DependenceInfo &DI, LoopInfo &LI) { + SmallVector<BasicBlockSet, 8> AllBlocks; + for (Loop *L : Root.getLoopsInPreorder()) + if (ForeBlocksMap.find(L) != ForeBlocksMap.end()) + AllBlocks.push_back(ForeBlocksMap.lookup(L)); + AllBlocks.push_back(SubLoopBlocks); + for (Loop *L : Root.getLoopsInPreorder()) + if (AftBlocksMap.find(L) != AftBlocksMap.end()) + AllBlocks.push_back(AftBlocksMap.lookup(L)); + + unsigned LoopDepth = Root.getLoopDepth(); + SmallVector<Instruction *, 4> EarlierLoadsAndStores; + SmallVector<Instruction *, 4> CurrentLoadsAndStores; + for (BasicBlockSet &Blocks : AllBlocks) { + CurrentLoadsAndStores.clear(); + if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores)) + return false; + + Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent()); + unsigned CurLoopDepth = CurLoop->getLoopDepth(); + + for (auto *Earlier : EarlierLoadsAndStores) { + Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent()); + unsigned EarlierDepth = EarlierLoop->getLoopDepth(); + unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth); + for (auto *Later : CurrentLoadsAndStores) { + if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false, + DI)) return false; - } - if (!InnerLoop) { - if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) { - LLVM_DEBUG(dbgs() << " > dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); - return false; - } - } else { - assert(LoopDepth + 1 <= D->getLevels()); - if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT && - D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) { - LLVM_DEBUG(dbgs() << " < > dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); - return false; - } - } } } + + size_t NumInsts = CurrentLoadsAndStores.size(); + for (size_t I = 0; I < NumInsts; ++I) { + for (size_t J = I; J < NumInsts; ++J) { + if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J], + LoopDepth, CurLoopDepth, true, DI)) + return false; + } + } + + EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(), + CurrentLoadsAndStores.end()); } return true; } -static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, DependenceInfo &DI) { - // Get all loads/store pairs for each blocks - SmallVector<Value *, 4> ForeMemInstr; - SmallVector<Value *, 4> SubLoopMemInstr; - SmallVector<Value *, 4> AftMemInstr; - if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) || - !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) || - !getLoadsAndStores(AftBlocks, AftMemInstr)) +static bool isEligibleLoopForm(const Loop &Root) { + // Root must have a child. 
+ if (Root.getSubLoops().size() != 1) return false; - // Check for dependencies between any blocks that may change order - unsigned LoopDepth = L->getLoopDepth(); - return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, - DI) && - checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) && - checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, - DI) && - checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true, - DI); + const Loop *L = &Root; + do { + // All loops in Root need to be in simplify and rotated form. + if (!L->isLoopSimplifyForm()) + return false; + + if (!L->isRotatedForm()) + return false; + + if (L->getHeader()->hasAddressTaken()) { + LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); + return false; + } + + unsigned SubLoopsSize = L->getSubLoops().size(); + if (SubLoopsSize == 0) + return true; + + // Only one child is allowed. + if (SubLoopsSize != 1) + return false; + + L = L->getSubLoops()[0]; + } while (L); + + return true; +} + +static Loop *getInnerMostLoop(Loop *L) { + while (!L->getSubLoops().empty()) + L = L->getSubLoops()[0]; + return L; } bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, - DependenceInfo &DI) { + DependenceInfo &DI, LoopInfo &LI) { + if (!isEligibleLoopForm(*L)) { + LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n"); + return false; + } + /* We currently handle outer loops like this: | - ForeFirst <----\ } - Blocks | } ForeBlocks - ForeLast | } - | | - SubLoopFirst <\ | } - Blocks | | } SubLoopBlocks - SubLoopLast -/ | } - | | - AftFirst | } - Blocks | } AftBlocks - AftLast ------/ } + ForeFirst <------\ } + Blocks | } ForeBlocks of L + ForeLast | } + | | + ... | + | | + ForeFirst <----\ | } + Blocks | | } ForeBlocks of a inner loop of L + ForeLast | | } + | | | + JamLoopFirst <\ | | } + Blocks | | | } JamLoopBlocks of the innermost loop + JamLoopLast -/ | | } + | | | + AftFirst | | } + Blocks | | } AftBlocks of a inner loop of L + AftLast ------/ | } + | | + ... | + | | + AftFirst | } + Blocks | } AftBlocks of L + AftLast --------/ } | There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks @@ -709,14 +893,16 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, things further in the profitablility checks of the unroll and jam pass. Because of the way we rearrange basic blocks, we also require that - the Fore blocks on all unrolled iterations are safe to move before the - SubLoop blocks of all iterations. So we require that the phi node looping - operands of ForeHeader can be moved to at least the end of ForeEnd, so that - we can arrange cloned Fore Blocks before the subloop and match up Phi's - correctly. + the Fore blocks of L on all unrolled iterations are safe to move before the + blocks of the direct child of L of all iterations. So we require that the + phi node looping operands of ForeHeader can be moved to at least the end of + ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and + match up Phi's correctly. - i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2. - It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2. + i.e. The old order of blocks used to be + (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2. + It needs to be safe to transform this to + (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2. 
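As a source-level sketch of that reordering (A, B, N and M are made-up names, an even trip count is assumed so no remainder loop is needed, and the single fore/aft level here corresponds to F1/A1 in the notation above), a depth-2 nest such as

void original(int **A, int *B, int N, int M) {
  for (int i = 0; i < N; ++i) {   // Fore(i)
    int s = 0;
    for (int j = 0; j < M; ++j)   // JamLoop
      s += A[i][j];
    B[i] = s;                     // Aft(i)
  }
}

becomes, after unroll-and-jam by two,

void unrollAndJammedByTwo(int **A, int *B, int N, int M) {
  for (int i = 0; i < N; i += 2) {
    int s0 = 0, s1 = 0;           // Fore(i), Fore(i+1)
    for (int j = 0; j < M; ++j) { // jammed inner loop
      s0 += A[i][j];
      s1 += A[i + 1][j];
    }
    B[i] = s0;                    // Aft(i)
    B[i + 1] = s1;                // Aft(i+1)
  }
}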
There are then a number of checks along the lines of no calls, no exceptions, inner loop IV is consistent, etc. Note that for loops requiring @@ -724,35 +910,13 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, UnrollAndJamLoop if the trip count cannot be easily calculated. */ - if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) - return false; - Loop *SubLoop = L->getSubLoops()[0]; - if (!SubLoop->isLoopSimplifyForm()) - return false; - - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Exit = L->getExitingBlock(); - BasicBlock *SubLoopHeader = SubLoop->getHeader(); - BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); - - if (Latch != Exit) - return false; - if (SubLoopLatch != SubLoopExit) - return false; - - if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) { - LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); - return false; - } - // Split blocks into Fore/SubLoop/Aft based on dominators + Loop *JamLoop = getInnerMostLoop(L); BasicBlockSet SubLoopBlocks; - BasicBlockSet ForeBlocks; - BasicBlockSet AftBlocks; - if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, - AftBlocks, &DT)) { + DenseMap<Loop *, BasicBlockSet> ForeBlocksMap; + DenseMap<Loop *, BasicBlockSet> AftBlocksMap; + if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap, + AftBlocksMap, DT)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); return false; } @@ -760,7 +924,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Aft blocks may need to move instructions to fore blocks, which becomes more // difficult if there are multiple (potentially conditionally executed) // blocks. For now we just exclude loops with multiple aft blocks. - if (AftBlocks.size() != 1) { + if (AftBlocksMap[L].size() != 1) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " "multiple blocks after the loop\n"); return false; @@ -768,7 +932,9 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Check inner loop backedge count is consistent on all iterations of the // outer loop - if (!hasIterationCountInvariantInParent(SubLoop, SE)) { + if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) { + return !hasIterationCountInvariantInParent(SubLoop, SE); + })) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " "not consistent on each iteration\n"); return false; @@ -789,6 +955,10 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // ForeBlock phi operands before the subloop // Make sure we can move all instructions we need to before the subloop + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BasicBlockSet AftBlocks = AftBlocksMap[L]; + Loop *SubLoop = L->getSubLoops()[0]; if (!processHeaderPhiOperands( Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { if (SubLoop->contains(I->getParent())) @@ -814,7 +984,8 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Check for memory dependencies which prohibit the unrolling we are doing. // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. 
- if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) { + if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI, + LI)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); return false; } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp index 7a168ff6f32b0..c653aacbee6cc 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -262,10 +262,9 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, // iteration. See if that makes !Pred become unknown again. if (ICmpInst::isEquality(Pred) && !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal, - RightSCEV)) { - assert(!SE.isKnownPredicate(Pred, IterVal, RightSCEV) && - SE.isKnownPredicate(Pred, NextIterVal, RightSCEV) && - "Expected Pred to go from known to unknown."); + RightSCEV) && + !SE.isKnownPredicate(Pred, IterVal, RightSCEV) && + SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) { if (!CanPeelOneMoreIteration()) continue; // Need to peel one more iteration, but can't. Give up. PeelOneMoreIteration(); // Great! @@ -280,17 +279,20 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, + TargetTransformInfo::PeelingPreferences &PP, unsigned &TripCount, ScalarEvolution &SE) { assert(LoopSize > 0 && "Zero loop size is not allowed!"); - // Save the UP.PeelCount value set by the target in - // TTI.getUnrollingPreferences or by the flag -unroll-peel-count. - unsigned TargetPeelCount = UP.PeelCount; - UP.PeelCount = 0; + // Save the PP.PeelCount value set by the target in + // TTI.getPeelingPreferences or by the flag -unroll-peel-count. + unsigned TargetPeelCount = PP.PeelCount; + PP.PeelCount = 0; if (!canPeel(L)) return; - // Only try to peel innermost loops. - if (!L->empty()) + // Only try to peel innermost loops by default. + // The constraint can be relaxed by the target in TTI.getUnrollingPreferences + // or by the flag -unroll-allow-loop-nests-peeling. + if (!PP.AllowLoopNestsPeeling && !L->empty()) return; // If the user provided a peel count, use that. @@ -298,13 +300,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (UserPeelCount) { LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount << " iterations.\n"); - UP.PeelCount = UnrollForcePeelCount; - UP.PeelProfiledIterations = true; + PP.PeelCount = UnrollForcePeelCount; + PP.PeelProfiledIterations = true; return; } // Skip peeling if it's disabled. - if (!UP.AllowPeeling) + if (!PP.AllowPeeling) return; unsigned AlreadyPeeled = 0; @@ -353,8 +355,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn" << " some Phis into invariants.\n"); - UP.PeelCount = DesiredPeelCount; - UP.PeelProfiledIterations = false; + PP.PeelCount = DesiredPeelCount; + PP.PeelProfiledIterations = false; return; } } @@ -366,7 +368,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, return; // Do not apply profile base peeling if it is disabled. 
- if (!UP.PeelProfiledIterations) + if (!PP.PeelProfiledIterations) return; // If we don't know the trip count, but have reason to believe the average // trip count is low, peeling should be beneficial, since we will usually @@ -386,7 +388,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n"); - UP.PeelCount = *PeelCount; + PP.PeelCount = *PeelCount; return; } LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); @@ -508,7 +510,10 @@ static void cloneLoopBlocks( BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F); NewBlocks.push_back(NewBB); - if (ParentLoop) + // If an original block is an immediate child of the loop L, its copy + // is a child of a ParentLoop after peeling. If a block is a child of + // a nested loop, it is handled in the cloneLoop() call below. + if (ParentLoop && LI->getLoopFor(*BB) == L) ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; @@ -525,6 +530,12 @@ static void cloneLoopBlocks( } } + // Recursively create the new Loop objects for nested loops, if any, + // to preserve LoopInfo. + for (Loop *ChildLoop : *L) { + cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr); + } + // Hook-up the control flow for the newly inserted blocks. // The new header is hooked up directly to the "top", which is either // the original loop preheader (for the first iteration) or the previous diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index ddb7479924bdc..2515b1676cb99 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" @@ -37,6 +36,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <algorithm> @@ -543,13 +543,11 @@ static bool canProfitablyUnrollMultiExitLoop( /// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2. /// EpilExit: -bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, - bool AllowExpensiveTripCount, - bool UseEpilogRemainder, - bool UnrollRemainder, bool ForgetAllSCEV, - LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, - bool PreserveLCSSA, Loop **ResultLoop) { +bool llvm::UnrollRuntimeLoopRemainder( + Loop *L, unsigned Count, bool AllowExpensiveTripCount, + bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? 
dbgs() << "Using epilog remainder.\n" @@ -637,7 +635,8 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); if (!AllowExpensiveTripCount && - Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) { + Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget, + TTI, PreHeaderBR)) { LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n"); return false; } @@ -849,7 +848,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // dominator of the exit blocks. for (auto *BB : L->blocks()) { auto *DomNodeBB = DT->getNode(BB); - for (auto *DomChild : DomNodeBB->getChildren()) { + for (auto *DomChild : DomNodeBB->children()) { auto *DomChildBB = DomChild->getBlock(); if (!L->contains(LI->getLoopFor(DomChildBB))) ChildrenToUpdate.push_back(DomChildBB); @@ -949,7 +948,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1, /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV}, - LI, SE, DT, AC, /*ORE*/ nullptr, PreserveLCSSA); + LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index c4c40189fda46..43363736684ee 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -11,12 +11,19 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" @@ -31,7 +38,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" @@ -39,10 +48,17 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; using namespace llvm::PatternMatch; +static cl::opt<bool> ForceReductionIntrinsic( + "force-reduction-intrinsics", cl::Hidden, + cl::desc("Force creating reduction intrinsics for testing."), + cl::init(false)); + #define DEBUG_TYPE "loop-utils" static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; @@ -496,20 +512,24 @@ llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) { AddRegionToWorklist(N); - for (size_t I = 0; I < Worklist.size(); I++) - for (DomTreeNode *Child : Worklist[I]->getChildren()) + for (size_t I = 0; I < Worklist.size(); I++) { + for (DomTreeNode *Child : Worklist[I]->children()) 
AddRegionToWorklist(Child); + } return Worklist; } -void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, - ScalarEvolution *SE = nullptr, - LoopInfo *LI = nullptr) { +void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, + LoopInfo *LI, MemorySSA *MSSA) { assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!"); auto *Preheader = L->getLoopPreheader(); assert(Preheader && "Preheader should exist!"); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSA) + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); + // Now that we know the removal is safe, remove the loop by changing the // branch from the preheader to go to the single exit block. // @@ -582,18 +602,33 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, "Should have exactly one value and that's from the preheader!"); } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + if (DT) { + DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}}); + if (MSSA) { + MSSAU->applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}}, *DT); + if (VerifyMemorySSA) + MSSA->verifyMemorySSA(); + } + } + // Disconnect the loop body by branching directly to its exit. Builder.SetInsertPoint(Preheader->getTerminator()); Builder.CreateBr(ExitBlock); // Remove the old branch. Preheader->getTerminator()->eraseFromParent(); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); if (DT) { - // Update the dominator tree by informing it about the new edge from the - // preheader to the exit and the removed edge. - DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}, - {DominatorTree::Delete, Preheader, L->getHeader()}}); + DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}}); + if (MSSA) { + MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}}, + *DT); + SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(), + L->block_end()); + MSSAU->removeBlocks(DeadBlockSet); + if (VerifyMemorySSA) + MSSA->verifyMemorySSA(); + } } // Use a map to unique and a vector to guarantee deterministic ordering. @@ -654,6 +689,9 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, for (auto *Block : L->blocks()) Block->dropAllReferences(); + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + if (LI) { // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. @@ -676,11 +714,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, // its parent. While removeLoop/removeChildLoop remove the given loop but // not relink its subloops, which is what we want. if (Loop *ParentLoop = L->getParentLoop()) { - Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L); + Loop::iterator I = find(*ParentLoop, L); assert(I != ParentLoop->end() && "Couldn't find loop"); ParentLoop->removeChildLoop(I); } else { - Loop::iterator I = find(LI->begin(), LI->end(), L); + Loop::iterator I = find(*LI, L); assert(I != LI->end() && "Couldn't find loop"); LI->removeLoop(I); } @@ -688,17 +726,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, } } -Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. - - // Get the branch weights for the loop's backedge. +/// Checks if \p L has single exit through latch block except possibly +/// "deoptimizing" exits. 
Returns branch instruction terminating the loop +/// latch if above check is successful, nullptr otherwise. +static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch) - return None; + return nullptr; + BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator()); if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) - return None; + return nullptr; assert((LatchBR->getSuccessor(0) == L->getHeader() || LatchBR->getSuccessor(1) == L->getHeader()) && @@ -709,24 +747,73 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { if (any_of(ExitBlocks, [](const BasicBlock *EB) { return !EB->getTerminatingDeoptimizeCall(); })) + return nullptr; + + return LatchBR; +} + +Optional<unsigned> +llvm::getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedLoopInvocationWeight) { + // Support loops with an exiting latch and other existing exists only + // deoptimize. + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) return None; // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. uint64_t BackedgeTakenWeight, LatchExitWeight; - if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) + if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (LatchBR->getSuccessor(0) != L->getHeader()) + if (LatchBranch->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!LatchExitWeight) + return None; + + if (EstimatedLoopInvocationWeight) + *EstimatedLoopInvocationWeight = LatchExitWeight; + + // Estimated backedge taken count is a ratio of the backedge taken weight by + // the weight of the edge exiting the loop, rounded to nearest. + uint64_t BackedgeTakenCount = + llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); + // Estimated trip count is one plus estimated backedge taken count. + return BackedgeTakenCount + 1; +} + +bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, + unsigned EstimatedloopInvocationWeight) { + // Support loops with an exiting latch and other existing exists only + // deoptimize. + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + + // Calculate taken and exit weights. + unsigned LatchExitWeight = 0; + unsigned BackedgeTakenWeight = 0; + + if (EstimatedTripCount > 0) { + LatchExitWeight = EstimatedloopInvocationWeight; + BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight; + } + + // Make a swap if back edge is taken when condition is "false". + if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - if (!BackedgeTakenWeight || !LatchExitWeight) - return 0; + MDBuilder MDB(LatchBranch->getContext()); - // Divide the count of the backedge by the count of the edge exiting the loop, - // rounding to nearest. - return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); + // Set/Update profile metadata. 
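// Worked example with made-up numbers: EstimatedTripCount == 100 and
// EstimatedloopInvocationWeight == 8 store a backedge-taken weight of
// 99 * 8 == 792 against an exit weight of 8; getLoopEstimatedTripCount above
// then recovers divideNearest(792, 8) + 1 == 100.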
+ LatchBranch->setMetadata( + LLVMContext::MD_prof, + MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + + return true; } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, @@ -751,7 +838,7 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, return true; } -Value *llvm::createMinMaxOp(IRBuilder<> &Builder, +Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurrenceDescriptor::MinMaxRecurrenceKind RK, Value *Left, Value *Right) { CmpInst::Predicate P = CmpInst::ICMP_NE; @@ -780,29 +867,22 @@ Value *llvm::createMinMaxOp(IRBuilder<> &Builder, // We only match FP sequences that are 'fast', so we can unconditionally // set it on any generated instructions. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); + IRBuilderBase::FastMathFlagGuard FMFG(Builder); FastMathFlags FMF; FMF.setFast(); Builder.setFastMathFlags(FMF); - - Value *Cmp; - if (RK == RecurrenceDescriptor::MRK_FloatMin || - RK == RecurrenceDescriptor::MRK_FloatMax) - Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); - else - Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); - + Value *Cmp = Builder.CreateCmp(P, Left, Right, "rdx.minmax.cmp"); Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; } // Helper to generate an ordered reduction. Value * -llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, +llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, unsigned Op, RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, ArrayRef<Value *> RedOps) { - unsigned VF = Src->getType()->getVectorNumElements(); + unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // Extract and apply reduction ops in ascending order: // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1] @@ -829,29 +909,27 @@ llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. Value * -llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, +llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, ArrayRef<Value *> RedOps) { - unsigned VF = Src->getType()->getVectorNumElements(); + unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = Src; - SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); + SmallVector<int, 32> ShuffleMask(VF); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); + ShuffleMask[j] = i / 2 + j; // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1); Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); + TmpVec, UndefValue::get(TmpVec->getType()), ShuffleMask, "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { // The builder propagates its fast-math-flags setting. 
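For reference, a minimal scalar model of the log2 shuffle reduction emitted above, assuming a power-of-two VF and an arbitrary binary operator standing in for the IR opcode (a sketch, not LLVM IR):

#include <array>
#include <cstddef>

// Each round folds the upper half of the lanes into the lower half, mirroring
// the "move the upper half of the vector to the lower half" shuffle, so VF
// lanes are reduced in log2(VF) rounds and the result ends up in lane 0,
// matching the final extractelement of element 0.
template <typename T, std::size_t VF, typename BinOp>
T log2ShuffleReduce(std::array<T, VF> Lanes, BinOp Op) {
  static_assert(VF != 0 && (VF & (VF - 1)) == 0,
                "reduction emission only supported for pow2 vectors");
  for (std::size_t Width = VF; Width != 1; Width >>= 1)
    for (std::size_t J = 0; J != Width / 2; ++J)
      Lanes[J] = Op(Lanes[J], Lanes[J + Width / 2]);
  return Lanes[0];
}

Calling it with, say, eight lanes and an addition lambda reproduces an add reduction of the whole vector.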
@@ -864,6 +942,11 @@ llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, } if (!RedOps.empty()) propagateIRFlags(TmpVec, RedOps); + + // We may compute the reassociated scalar ops in a way that does not + // preserve nsw/nuw etc. Conservatively, drop those flags. + if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec)) + ReductionInst->dropPoisonGeneratingFlags(); } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); @@ -872,10 +955,10 @@ llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, /// Create a simple vector reduction specified by an opcode and some /// flags (if generating min/max reductions). Value *llvm::createSimpleTargetReduction( - IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, + IRBuilderBase &Builder, const TargetTransformInfo *TTI, unsigned Opcode, Value *Src, TargetTransformInfo::ReductionFlags Flags, ArrayRef<Value *> RedOps) { - assert(isa<VectorType>(Src->getType()) && "Type must be a vector"); + auto *SrcVTy = cast<VectorType>(Src->getType()); std::function<Value *()> BuildFunc; using RD = RecurrenceDescriptor; @@ -900,13 +983,13 @@ Value *llvm::createSimpleTargetReduction( case Instruction::FAdd: BuildFunc = [&]() { auto Rdx = Builder.CreateFAddReduce( - Constant::getNullValue(Src->getType()->getVectorElementType()), Src); + Constant::getNullValue(SrcVTy->getElementType()), Src); return Rdx; }; break; case Instruction::FMul: BuildFunc = [&]() { - Type *Ty = Src->getType()->getVectorElementType(); + Type *Ty = SrcVTy->getElementType(); auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src); return Rdx; }; @@ -937,13 +1020,14 @@ Value *llvm::createSimpleTargetReduction( llvm_unreachable("Unhandled opcode"); break; } - if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) + if (ForceReductionIntrinsic || + TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) return BuildFunc(); return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); } /// Create a vector reduction using a given recurrence descriptor. -Value *llvm::createTargetReduction(IRBuilder<> &B, +Value *llvm::createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI, RecurrenceDescriptor &Desc, Value *Src, bool NoNaN) { @@ -955,7 +1039,7 @@ Value *llvm::createTargetReduction(IRBuilder<> &B, // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. - IRBuilder<>::FastMathFlagGuard FMFGuard(B); + IRBuilderBase::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(Desc.getFastMathFlags()); switch (RecKind) { @@ -1042,3 +1126,586 @@ bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE, SE.isLoopEntryGuardedByCond(L, Predicate, S, SE.getConstant(Max)); } + +//===----------------------------------------------------------------------===// +// rewriteLoopExitValues - Optimize IV users outside the loop. +// As a side effect, reduces the amount of IV processing within the loop. +//===----------------------------------------------------------------------===// + +// Return true if the SCEV expansion generated by the rewriter can replace the +// original value. SCEV guarantees that it produces the same value, but the way +// it is produced may be illegal IR. Ideally, this function will only be +// called for verification. 
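// A hypothetical instance of the hazard described above: if FromVal is a GEP
// based on %A and the expansion ToVal comes back as a GEP based on a different
// object %B, the pointer bases computed via SE->getPointerBase() differ and
// the rewrite is rejected, even though SCEV proved the two values equal.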
+static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) { + // If an SCEV expression subsumed multiple pointers, its expansion could + // reassociate the GEP changing the base pointer. This is illegal because the + // final address produced by a GEP chain must be inbounds relative to its + // underlying object. Otherwise basic alias analysis, among other things, + // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid + // producing an expression involving multiple pointers. Until then, we must + // bail out here. + // + // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject + // because it understands lcssa phis while SCEV does not. + Value *FromPtr = FromVal; + Value *ToPtr = ToVal; + if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) + FromPtr = GEP->getPointerOperand(); + + if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) + ToPtr = GEP->getPointerOperand(); + + if (FromPtr != FromVal || ToPtr != ToVal) { + // Quickly check the common case + if (FromPtr == ToPtr) + return true; + + // SCEV may have rewritten an expression that produces the GEP's pointer + // operand. That's ok as long as the pointer operand has the same base + // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the + // base of a recurrence. This handles the case in which SCEV expansion + // converts a pointer type recurrence into a nonrecurrent pointer base + // indexed by an integer recurrence. + + // If the GEP base pointer is a vector of pointers, abort. + if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy()) + return false; + + const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr)); + const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr)); + if (FromBase == ToBase) + return true; + + LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out " + << *FromBase << " != " << *ToBase << "\n"); + + return false; + } + return true; +} + +static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> WorkList; + Visited.insert(I); + WorkList.push_back(I); + while (!WorkList.empty()) { + const Instruction *Curr = WorkList.pop_back_val(); + // This use is outside the loop, nothing to do. + if (!L->contains(Curr)) + continue; + // Do we assume it is a "hard" use which will not be eliminated easily? + if (Curr->mayHaveSideEffects()) + return true; + // Otherwise, add all its users to worklist. + for (auto U : Curr->users()) { + auto *UI = cast<Instruction>(U); + if (Visited.insert(UI).second) + WorkList.push_back(UI); + } + } + return false; +} + +// Collect information about PHI nodes which can be transformed in +// rewriteLoopExitValues. +struct RewritePhi { + PHINode *PN; // For which PHI node is this replacement? + unsigned Ith; // For which incoming value? + const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting. + Instruction *ExpansionPoint; // Where we'd like to expand that SCEV? + bool HighCost; // Is this expansion a high-cost? + + Value *Expansion = nullptr; + bool ValidRewrite = false; + + RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt, + bool H) + : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt), + HighCost(H) {} +}; + +// Check whether it is possible to delete the loop after rewriting exit +// value. If it is possible, ignore ReplaceExitValue and do rewriting +// aggressively. 
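// Made-up illustration of the case this check is after: in
//   unsigned sumTimesTwo(unsigned n) {
//     unsigned sum = 0;
//     for (unsigned i = 0; i < n; ++i)
//       sum += 2;
//     return sum;
//   }
// the loop computes nothing but the exit value of sum. Once the LCSSA phi for
// sum is rewritten to the closed form 2 * n, no value defined inside the loop
// has a remaining user and the body has no side effects, so the loop itself
// becomes deletable.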
+static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { + BasicBlock *Preheader = L->getLoopPreheader(); + // If there is no preheader, the loop will not be deleted. + if (!Preheader) + return false; + + // In LoopDeletion pass Loop can be deleted when ExitingBlocks.size() > 1. + // We obviate multiple ExitingBlocks case for simplicity. + // TODO: If we see testcase with multiple ExitingBlocks can be deleted + // after exit value rewriting, we can enhance the logic here. + SmallVector<BasicBlock *, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + SmallVector<BasicBlock *, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1) + return false; + + BasicBlock *ExitBlock = ExitBlocks[0]; + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast<PHINode>(BI)) { + Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]); + + // If the Incoming value of P is found in RewritePhiSet, we know it + // could be rewritten to use a loop invariant value in transformation + // phase later. Skip it in the loop invariant check below. + bool found = false; + for (const RewritePhi &Phi : RewritePhiSet) { + if (!Phi.ValidRewrite) + continue; + unsigned i = Phi.Ith; + if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) { + found = true; + break; + } + } + + Instruction *I; + if (!found && (I = dyn_cast<Instruction>(Incoming))) + if (!L->hasLoopInvariantOperands(I)) + return false; + + ++BI; + } + + for (auto *BB : L->blocks()) + if (llvm::any_of(*BB, [](Instruction &I) { + return I.mayHaveSideEffects(); + })) + return false; + + return true; +} + +int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, + ScalarEvolution *SE, + const TargetTransformInfo *TTI, + SCEVExpander &Rewriter, DominatorTree *DT, + ReplaceExitVal ReplaceExitValue, + SmallVector<WeakTrackingVH, 16> &DeadInsts) { + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT, *LI) && + "Indvars did not preserve LCSSA!"); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + + SmallVector<RewritePhi, 8> RewritePhiSet; + // Find all values that are computed inside the loop, but used outside of it. + // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan + // the exit blocks of the loop to find them. + for (BasicBlock *ExitBB : ExitBlocks) { + // If there are no PHI nodes in this exit block, then no values defined + // inside the loop are used on this path, skip it. + PHINode *PN = dyn_cast<PHINode>(ExitBB->begin()); + if (!PN) continue; + + unsigned NumPreds = PN->getNumIncomingValues(); + + // Iterate over all of the PHI nodes. + BasicBlock::iterator BBI = ExitBB->begin(); + while ((PN = dyn_cast<PHINode>(BBI++))) { + if (PN->use_empty()) + continue; // dead use, don't replace it + + if (!SE->isSCEVable(PN->getType())) + continue; + + // It's necessary to tell ScalarEvolution about this explicitly so that + // it can walk the def-use list and forget all SCEVs, as it may not be + // watching the PHI itself. Once the new exit value is in place, there + // may not be a def-use connection between the loop and every instruction + // which got a SCEVAddRecExpr for that loop. + SE->forgetValue(PN); + + // Iterate over all of the values in all the PHI nodes. + for (unsigned i = 0; i != NumPreds; ++i) { + // If the value being merged in is not integer or is not defined + // in the loop, skip it. 
+ Value *InVal = PN->getIncomingValue(i); + if (!isa<Instruction>(InVal)) + continue; + + // If this pred is for a subloop, not L itself, skip it. + if (LI->getLoopFor(PN->getIncomingBlock(i)) != L) + continue; // The Block is in a subloop, skip it. + + // Check that InVal is defined in the loop. + Instruction *Inst = cast<Instruction>(InVal); + if (!L->contains(Inst)) + continue; + + // Okay, this instruction has a user outside of the current loop + // and varies predictably *inside* the loop. Evaluate the value it + // contains when the loop exits, if possible. We prefer to start with + // expressions which are true for all exits (so as to maximize + // expression reuse by the SCEVExpander), but resort to per-exit + // evaluation if that fails. + const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) { + // TODO: This should probably be sunk into SCEV in some way; maybe a + // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for + // most SCEV expressions and other recurrence types (e.g. shift + // recurrences). Is there existing code we can reuse? + const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i)); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst))) + if (AddRec->getLoop() == L) + ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE); + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) + continue; + } + + // Computing the value outside of the loop brings no benefit if it is + // definitely used inside the loop in a way which can not be optimized + // away. Avoid doing so unless we know we have a value which computes + // the ExitValue already. TODO: This should be merged into SCEV + // expander to leverage its knowledge of existing expressions. + if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) && + !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst)) + continue; + + // Check if expansions of this SCEV would count as being high cost. + bool HighCost = Rewriter.isHighCostExpansion( + ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst); + + // Note that we must not perform expansions until after + // we query *all* the costs, because if we perform temporary expansion + // inbetween, one that we might not intend to keep, said expansion + // *may* affect cost calculation of the the next SCEV's we'll query, + // and next SCEV may errneously get smaller cost. + + // Collect all the candidate PHINodes to be rewritten. + RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost); + } + } + } + + // Now that we've done preliminary filtering and billed all the SCEV's, + // we can perform the last sanity check - the expansion must be valid. + for (RewritePhi &Phi : RewritePhiSet) { + Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(), + Phi.ExpansionPoint); + + LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = " + << *(Phi.Expansion) << '\n' + << " LoopVal = " << *(Phi.ExpansionPoint) << "\n"); + + // FIXME: isValidRewrite() is a hack. it should be an assert, eventually. 
+ Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion); + if (!Phi.ValidRewrite) { + DeadInsts.push_back(Phi.Expansion); + continue; + } + +#ifndef NDEBUG + // If we reuse an instruction from a loop which is neither L nor one of + // its containing loops, we end up breaking LCSSA form for this loop by + // creating a new use of its instruction. + if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion)) + if (auto *EVL = LI->getLoopFor(ExitInsn->getParent())) + if (EVL != L) + assert(EVL->contains(L) && "LCSSA breach detected!"); +#endif + } + + // TODO: after isValidRewrite() is an assertion, evaluate whether + // it is beneficial to change how we calculate high-cost: + // if we have SCEV 'A' which we know we will expand, should we calculate + // the cost of other SCEV's after expanding SCEV 'A', + // thus potentially giving cost bonus to those other SCEV's? + + bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet); + int NumReplaced = 0; + + // Transformation. + for (const RewritePhi &Phi : RewritePhiSet) { + if (!Phi.ValidRewrite) + continue; + + PHINode *PN = Phi.PN; + Value *ExitVal = Phi.Expansion; + + // Only do the rewrite when the ExitValue can be expanded cheaply. + // If LoopCanBeDel is true, rewrite exit value aggressively. + if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) { + DeadInsts.push_back(ExitVal); + continue; + } + + NumReplaced++; + Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith)); + PN->setIncomingValue(Phi.Ith, ExitVal); + + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); + + // Replace PN with ExitVal if that is legal and does not break LCSSA. + if (PN->getNumIncomingValues() == 1 && + LI->replacementPreservesLCSSAForm(PN, ExitVal)) { + PN->replaceAllUsesWith(ExitVal); + PN->eraseFromParent(); + } + } + + // The insertion point instruction may have been deleted; clear it out + // so that the rewriter doesn't trip over it later. + Rewriter.clearInsertPoint(); + return NumReplaced; +} + +/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for +/// \p OrigLoop. +void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, + Loop *RemainderLoop, uint64_t UF) { + assert(UF > 0 && "Zero unrolled factor is not supported"); + assert(UnrolledLoop != RemainderLoop && + "Unrolled and Remainder loops are expected to distinct"); + + // Get number of iterations in the original scalar loop. + unsigned OrigLoopInvocationWeight = 0; + Optional<unsigned> OrigAverageTripCount = + getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight); + if (!OrigAverageTripCount) + return; + + // Calculate number of iterations in unrolled loop. + unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF; + // Calculate number of iterations for remainder loop. + unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF; + + setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount, + OrigLoopInvocationWeight); + setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount, + OrigLoopInvocationWeight); +} + +/// Utility that implements appending of loops onto a worklist. +/// Loops are added in preorder (analogous for reverse postorder for trees), +/// and the worklist is processed LIFO. 
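A rough sketch of how a loop pass might drive the rewriteLoopExitValues entry point completed above, assuming the usual analyses are already in hand; the wrapper name and the cleanup loop are illustrative, not part of the patch:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

static int replaceExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
                             ScalarEvolution *SE,
                             const TargetTransformInfo *TTI,
                             DominatorTree *DT) {
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
  SCEVExpander Rewriter(*SE, DL, "exitval");
  SmallVector<WeakTrackingVH, 16> DeadInsts;
  int NumReplaced = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
                                          OnlyCheapRepl, DeadInsts);
  // Instructions whose last use was rewritten may now be trivially dead.
  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    if (auto *I = dyn_cast_or_null<Instruction>(V))
      RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
  }
  return NumReplaced;
}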
+template <typename RangeT> +void llvm::appendReversedLoopsToWorklist( + RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) { + // We use an internal worklist to build up the preorder traversal without + // recursion. + SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist; + + // We walk the initial sequence of loops in reverse because we generally want + // to visit defs before uses and the worklist is LIFO. + for (Loop *RootL : Loops) { + assert(PreOrderLoops.empty() && "Must start with an empty preorder walk."); + assert(PreOrderWorklist.empty() && + "Must start with an empty preorder walk worklist."); + PreOrderWorklist.push_back(RootL); + do { + Loop *L = PreOrderWorklist.pop_back_val(); + PreOrderWorklist.append(L->begin(), L->end()); + PreOrderLoops.push_back(L); + } while (!PreOrderWorklist.empty()); + + Worklist.insert(std::move(PreOrderLoops)); + PreOrderLoops.clear(); + } +} + +template <typename RangeT> +void llvm::appendLoopsToWorklist(RangeT &&Loops, + SmallPriorityWorklist<Loop *, 4> &Worklist) { + appendReversedLoopsToWorklist(reverse(Loops), Worklist); +} + +template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>( + ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist); + +template void +llvm::appendLoopsToWorklist<Loop &>(Loop &L, + SmallPriorityWorklist<Loop *, 4> &Worklist); + +void llvm::appendLoopsToWorklist(LoopInfo &LI, + SmallPriorityWorklist<Loop *, 4> &Worklist) { + appendReversedLoopsToWorklist(LI, Worklist); +} + +Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, + LoopInfo *LI, LPPassManager *LPM) { + Loop &New = *LI->AllocateLoop(); + if (PL) + PL->addChildLoop(&New); + else + LI->addTopLevelLoop(&New); + + if (LPM) + LPM->addLoop(New); + + // Add all of the blocks in L to the new loop. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + if (LI->getLoopFor(*I) == L) + New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); + + // Add all of the subloops to the new loop. + for (Loop *I : *L) + cloneLoop(I, &New, VM, LI, LPM); + + return &New; +} + +/// IR Values for the lower and upper bounds of a pointer evolution. We +/// need to use value-handles because SCEV expansion can invalidate previously +/// expanded values. Thus expansion of a pointer can invalidate the bounds for +/// a previous one. +struct PointerBounds { + TrackingVH<Value> Start; + TrackingVH<Value> End; +}; + +/// Expand code for the lower and upper bound of the pointer group \p CG +/// in \p TheLoop. \return the values for the bounds. +static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, + Loop *TheLoop, Instruction *Loc, + SCEVExpander &Exp, ScalarEvolution *SE) { + // TODO: Add helper to retrieve pointers to CG. + Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue; + const SCEV *Sc = SE->getSCEV(Ptr); + + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = Loc->getContext(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); + + if (SE->isLoopInvariant(Sc, TheLoop)) { + LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" + << *Ptr << "\n"); + // Ptr could be in the loop body. If so, expand a new one at the correct + // location. + Instruction *Inst = dyn_cast<Instruction>(Ptr); + Value *NewPtr = (Inst && TheLoop->contains(Inst)) + ? Exp.expandCodeFor(Sc, PtrArithTy, Loc) + : Ptr; + // We must return a half-open range, which means incrementing Sc. 
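The worklist helpers above are typically driven as in this sketch; the function name is hypothetical, and the loop body is elided:

#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

static void visitLoopsInnermostFirst(LoopInfo &LI) {
  SmallPriorityWorklist<Loop *, 4> Worklist;
  appendLoopsToWorklist(LI, Worklist);
  // Insertion is in preorder and the worklist pops LIFO, so subloops come off
  // the worklist before their parent loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();
    (void)L; // process L here
  }
}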
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy)); + Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc); + return {NewPtr, NewPtrPlusOne}; + } else { + Value *Start = nullptr, *End = nullptr; + LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); + Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc); + End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc); + LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High + << "\n"); + return {Start, End}; + } +} + +/// Turns a collection of checks into a collection of expanded upper and +/// lower bounds for both pointers in the check. +static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> +expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L, + Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) { + SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds; + + // Here we're relying on the SCEV Expander's cache to only emit code for the + // same bounds once. + transform(PointerChecks, std::back_inserter(ChecksWithBounds), + [&](const RuntimePointerCheck &Check) { + PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE), + Second = + expandBounds(Check.second, L, Loc, Exp, SE); + return std::make_pair(First, Second); + }); + + return ChecksWithBounds; +} + +std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks( + Instruction *Loc, Loop *TheLoop, + const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, + ScalarEvolution *SE) { + // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. + // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + SCEVExpander Exp(*SE, DL, "induction"); + auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp); + + LLVMContext &Ctx = Loc->getContext(); + Instruction *FirstInst = nullptr; + IRBuilder<> ChkBuilder(Loc); + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = nullptr; + + // FIXME: this helper is currently a duplicate of the one in + // LoopVectorize.cpp. + auto GetFirstInst = [](Instruction *FirstInst, Value *V, + Instruction *Loc) -> Instruction * { + if (FirstInst) + return FirstInst; + if (Instruction *I = dyn_cast<Instruction>(V)) + return I->getParent() == Loc->getParent() ? I : nullptr; + return nullptr; + }; + + for (const auto &Check : ExpandedChecks) { + const PointerBounds &A = Check.first, &B = Check.second; + // Check if two pointers (A and B) conflict where conflict is computed as: + // start(A) <= end(B) && start(B) <= end(A) + unsigned AS0 = A.Start->getType()->getPointerAddressSpace(); + unsigned AS1 = B.Start->getType()->getPointerAddressSpace(); + + assert((AS0 == B.End->getType()->getPointerAddressSpace()) && + (AS1 == A.End->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc"); + + // [A|B].Start points to the first accessed byte under base [A|B]. + // [A|B].End points to the last accessed byte, plus one. 
+ // There is no conflict when the intervals are disjoint: + // NoConflict = (B.Start >= A.End) || (A.Start >= B.End) + // + // bound0 = (B.Start < A.End) + // bound1 = (A.Start < B.End) + // IsConflict = bound0 & bound1 + Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0"); + FirstInst = GetFirstInst(FirstInst, Cmp0, Loc); + Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1"); + FirstInst = GetFirstInst(FirstInst, Cmp1, Loc); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + FirstInst = GetFirstInst(FirstInst, IsConflict, Loc); + if (MemoryRuntimeCheck) { + IsConflict = + ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); + FirstInst = GetFirstInst(FirstInst, IsConflict, Loc); + } + MemoryRuntimeCheck = IsConflict; + } + + if (!MemoryRuntimeCheck) + return std::make_pair(nullptr, nullptr); + + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in a + // the block. + Instruction *Check = + BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(Check, "memcheck.conflict"); + FirstInst = GetFirstInst(FirstInst, Check, Loc); + return std::make_pair(FirstInst, Check); +} diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 50752bd78a650..16bd08c704eeb 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -13,15 +13,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; @@ -44,9 +45,8 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, } } -void LoopVersioning::setAliasChecks( - SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) { - AliasChecks = std::move(Checks); +void LoopVersioning::setAliasChecks(ArrayRef<RuntimePointerCheck> Checks) { + AliasChecks = {Checks.begin(), Checks.end()}; } void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) { @@ -62,8 +62,10 @@ void LoopVersioning::versionLoop( // Add the memcheck in the original preheader (this is empty initially). BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); std::tie(FirstCheckInst, MemRuntimeCheck) = - LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks); + addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, + AliasChecks, RtPtrChecking.getSE()); const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate(); SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), @@ -194,8 +196,7 @@ void LoopVersioning::prepareNoAliasMetadata() { // Go through the checks and for each pointer group, collect the scopes for // each non-aliasing pointer group. 
- DenseMap<const RuntimePointerChecking::CheckingPtrGroup *, - SmallVector<Metadata *, 4>> + DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>> GroupToNonAliasingScopes; for (const auto &Check : AliasChecks) diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 1af0ce3d86cc1..0b225e8abc4e7 100644 --- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -53,7 +53,7 @@ static bool runImpl(Function &F) { II->getOperandBundlesAsDefs(OpBundles); // Insert a normal call instruction... CallInst *NewCall = - CallInst::Create(II->getFunctionType(), II->getCalledValue(), + CallInst::Create(II->getFunctionType(), II->getCalledOperand(), CallArgs, OpBundles, "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 0cc085dc366c6..616b4e8eb01c9 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -14,17 +14,9 @@ using namespace llvm; -static unsigned getLoopOperandSizeInBytes(Type *Type) { - if (VectorType *VTy = dyn_cast<VectorType>(Type)) { - return VTy->getBitWidth() / 8; - } - - return Type->getPrimitiveSizeInBits() / 8; -} - void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, - unsigned SrcAlign, unsigned DestAlign, + Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { // No need to expand zero length copies. @@ -35,17 +27,18 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, BasicBlock *PostLoopBB = nullptr; Function *ParentFunc = PreLoopBB->getParent(); LLVMContext &Ctx = PreLoopBB->getContext(); + const DataLayout &DL = ParentFunc->getParent()->getDataLayout(); + + unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); Type *TypeOfCopyLen = CopyLen->getType(); - Type *LoopOpType = - TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); - unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType); + unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize; - unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); - unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); - if (LoopEndCount != 0) { // Split PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split"); @@ -66,16 +59,20 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType); } + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); + IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB); // Loop Body Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); - Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile); + Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, + PartSrcAlign, SrcIsVolatile); Value *DstGEP = 
LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); - LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile); + LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); Value *NewIndex = LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U)); @@ -93,17 +90,17 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI() : InsertBefore); - // Update the alignment based on the copy size used in the loop body. - SrcAlign = std::min(SrcAlign, LoopOpSize); - DestAlign = std::min(DestAlign, LoopOpSize); - SmallVector<Type *, 5> RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, - SrcAlign, DestAlign); + SrcAS, DstAS, SrcAlign.value(), + DstAlign.value()); for (auto OpTy : RemainingOps) { + Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); + Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); + // Calaculate the new index - unsigned OperandSize = getLoopOperandSizeInBytes(OpTy); + unsigned OperandSize = DL.getTypeStoreSize(OpTy); uint64_t GepIndex = BytesCopied / OperandSize; assert(GepIndex * OperandSize == BytesCopied && "Division should have no Remainder!"); @@ -114,7 +111,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, : RBuilder.CreateBitCast(SrcAddr, SrcPtrType); Value *SrcGEP = RBuilder.CreateInBoundsGEP( OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex)); - Value *Load = RBuilder.CreateLoad(OpTy, SrcGEP, SrcIsVolatile); + Value *Load = + RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); // Cast destination to operand type and store. PointerType *DstPtrType = PointerType::get(OpTy, DstAS); @@ -123,7 +121,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, : RBuilder.CreateBitCast(DstAddr, DstPtrType); Value *DstGEP = RBuilder.CreateInBoundsGEP( OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex)); - RBuilder.CreateStore(Load, DstGEP, DstIsVolatile); + RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); BytesCopied += OperandSize; } @@ -134,8 +132,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, - Value *CopyLen, unsigned SrcAlign, - unsigned DestAlign, bool SrcIsVolatile, + Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { BasicBlock *PreLoopBB = InsertBefore->getParent(); @@ -143,16 +141,17 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion"); Function *ParentFunc = PreLoopBB->getParent(); + const DataLayout &DL = ParentFunc->getParent()->getDataLayout(); LLVMContext &Ctx = PreLoopBB->getContext(); + unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); - Type *LoopOpType = - TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign); - unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); + unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); - unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); - 
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS); PointerType *DstOpType = PointerType::get(LoopOpType, DstAS); if (SrcAddr->getType() != SrcOpType) { @@ -177,13 +176,17 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); IRBuilder<> LoopBuilder(LoopBB); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); - Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile); + Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign, + SrcIsVolatile); Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); - LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile); + LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); Value *NewIndex = LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U)); @@ -234,10 +237,11 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex); Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset); - Value *Load = ResBuilder.CreateLoad(Int8Type, SrcGEP, SrcIsVolatile); + Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign, + SrcIsVolatile); Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset); - ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile); + ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); Value *ResNewIndex = ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U)); @@ -284,13 +288,14 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, // } // return dst; // } -static void createMemMoveLoop(Instruction *InsertBefore, - Value *SrcAddr, Value *DstAddr, Value *CopyLen, - unsigned SrcAlign, unsigned DestAlign, - bool SrcIsVolatile, bool DstIsVolatile) { +static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, + bool DstIsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); Function *F = OrigBB->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType(); @@ -318,6 +323,10 @@ static void createMemMoveLoop(Instruction *InsertBefore, BasicBlock *ExitBB = InsertBefore->getParent(); ExitBB->setName("memmove_done"); + unsigned PartSize = DL.getTypeStoreSize(EltTy); + Align PartSrcAlign(commonAlignment(SrcAlign, PartSize)); + Align PartDstAlign(commonAlignment(DstAlign, PartSize)); + // Initial comparison of n == 0 that lets us skip the loops altogether. Shared // between both backwards and forward copy clauses. 
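The per-chunk alignments introduced above come from commonAlignment(), which returns the largest alignment still guaranteed once the access is offset into the buffer. A small self-contained illustration (values chosen arbitrarily):

#include "llvm/Support/Alignment.h"
using namespace llvm;

static Align partAlignmentExample() {
  Align DstAlign(16);      // destination known to be 16-byte aligned
  unsigned LoopOpSize = 4; // the loop copies in 4-byte chunks
  // Each per-chunk store can only rely on 4-byte alignment: Align(4).
  return commonAlignment(DstAlign, LoopOpSize);
}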
ICmpInst *CompareN = @@ -331,11 +340,12 @@ static void createMemMoveLoop(Instruction *InsertBefore, PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); Value *IndexPtr = LoopBuilder.CreateSub( LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr"); - Value *Element = LoopBuilder.CreateLoad( + Value *Element = LoopBuilder.CreateAlignedLoad( EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr), - "element"); - LoopBuilder.CreateStore( - Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr)); + PartSrcAlign, "element"); + LoopBuilder.CreateAlignedStore( + Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr), + PartDstAlign); LoopBuilder.CreateCondBr( LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)), ExitBB, LoopBB); @@ -349,11 +359,11 @@ static void createMemMoveLoop(Instruction *InsertBefore, BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB); IRBuilder<> FwdLoopBuilder(FwdLoopBB); PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr"); - Value *FwdElement = FwdLoopBuilder.CreateLoad( - EltTy, FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi), - "element"); - FwdLoopBuilder.CreateStore( - FwdElement, FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi)); + Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi); + Value *FwdElement = + FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element"); + Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi); + FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign); Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd( FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment"); FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen), @@ -365,12 +375,13 @@ static void createMemMoveLoop(Instruction *InsertBefore, ElseTerm->eraseFromParent(); } -static void createMemSetLoop(Instruction *InsertBefore, - Value *DstAddr, Value *CopyLen, Value *SetValue, - unsigned Align, bool IsVolatile) { +static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, + Value *CopyLen, Value *SetValue, Align DstAlign, + bool IsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); Function *F = OrigBB->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split"); BasicBlock *LoopBB @@ -388,14 +399,17 @@ static void createMemSetLoop(Instruction *InsertBefore, LoopBB); OrigBB->getTerminator()->eraseFromParent(); + unsigned PartSize = DL.getTypeStoreSize(SetValue->getType()); + Align PartAlign(commonAlignment(DstAlign, PartSize)); + IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); - LoopBuilder.CreateStore( + LoopBuilder.CreateAlignedStore( SetValue, LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex), - IsVolatile); + PartAlign, IsVolatile); Value *NewIndex = LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); @@ -408,25 +422,27 @@ static void createMemSetLoop(Instruction *InsertBefore, void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, const TargetTransformInfo &TTI) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) { - createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen 
*/ CI, - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransformInfo */ TTI); + createMemCpyLoopKnownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ CI, + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransformInfo */ TTI); } else { - createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen */ Memcpy->getLength(), - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransfomrInfo */ TTI); + createMemCpyLoopUnknownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransfomrInfo */ TTI); } } @@ -435,8 +451,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) { /* SrcAddr */ Memmove->getRawSource(), /* DstAddr */ Memmove->getRawDest(), /* CopyLen */ Memmove->getLength(), - /* SrcAlign */ Memmove->getSourceAlignment(), - /* DestAlign */ Memmove->getDestAlignment(), + /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(), + /* DestAlign */ Memmove->getDestAlign().valueOrOne(), /* SrcIsVolatile */ Memmove->isVolatile(), /* DstIsVolatile */ Memmove->isVolatile()); } @@ -446,6 +462,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) { /* DstAddr */ Memset->getRawDest(), /* CopyLen */ Memset->getLength(), /* SetValue */ Memset->getValue(), - /* Alignment */ Memset->getDestAlignment(), + /* Alignment */ Memset->getDestAlign().valueOrOne(), Memset->isVolatile()); } diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 4b9d0dadfc173..34e836d9660f3 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -148,13 +148,6 @@ bool LowerSwitch::runOnFunction(Function &F) { LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>(); AssumptionCache *AC = ACT ? &ACT->getAssumptionCache(F) : nullptr; - // Prevent LazyValueInfo from using the DominatorTree as LowerSwitch does not - // preserve it and it becomes stale (when available) pretty much immediately. - // Currently the DominatorTree is only used by LowerSwitch indirectly via LVI - // and computeKnownBits to refine isValidAssumeForContext's results. Given - // that the latter can handle some of the simple cases w/o a DominatorTree, - // it's easier to refrain from using the tree than to keep it up to date. 
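As a sketch of how the expandMem*AsLoop entry points above (LowerMemIntrinsics.cpp) are typically driven; the dispatcher below is hypothetical and assumes a TargetTransformInfo reference is available:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;

// Replace a mem intrinsic with an explicit loop, then erase the intrinsic.
static void lowerToLoop(MemIntrinsic *MI, const TargetTransformInfo &TTI) {
  if (auto *Memcpy = dyn_cast<MemCpyInst>(MI))
    expandMemCpyAsLoop(Memcpy, TTI);
  else if (auto *Memmove = dyn_cast<MemMoveInst>(MI))
    expandMemMoveAsLoop(Memmove);
  else if (auto *Memset = dyn_cast<MemSetInst>(MI))
    expandMemSetAsLoop(Memset);
  else
    return; // unknown intrinsic kind, leave it alone
  MI->eraseFromParent();
}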
- LVI->disableDT(); bool Changed = false; SmallPtrSet<BasicBlock*, 8> DeleteList; diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index b94f57e4dc2ca..ef9f18a2289e9 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -11,15 +11,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; +#define DEBUG_TYPE "moduleutils" + static void appendToGlobalArray(const char *Array, Module &M, Function *F, int Priority, Constant *Data) { IRBuilder<> IRB(M.getContext()); @@ -117,6 +119,15 @@ llvm::declareSanitizerInitFunction(Module &M, StringRef InitName, AttributeList()); } +Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) { + Function *Ctor = Function::Create( + FunctionType::get(Type::getVoidTy(M.getContext()), false), + GlobalValue::InternalLinkage, CtorName, &M); + BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor); + ReturnInst::Create(M.getContext(), CtorBB); + return Ctor; +} + std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions( Module &M, StringRef CtorName, StringRef InitName, ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs, @@ -126,11 +137,8 @@ std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions( "Sanitizer's init function expects different number of arguments"); FunctionCallee InitFunction = declareSanitizerInitFunction(M, InitName, InitArgTypes); - Function *Ctor = Function::Create( - FunctionType::get(Type::getVoidTy(M.getContext()), false), - GlobalValue::InternalLinkage, CtorName, &M); - BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor); - IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB)); + Function *Ctor = createSanitizerCtor(M, CtorName); + IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator()); IRB.CreateCall(InitFunction, InitArgs); if (!VersionCheckName.empty()) { FunctionCallee VersionCheckFunction = M.getOrInsertFunction( @@ -298,8 +306,9 @@ void VFABI::setVectorVariantNames( Module *M = CI->getModule(); #ifndef NDEBUG for (const std::string &VariantMapping : VariantMappings) { - Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping); - assert(VI.hasValue() && "Canno add an invalid VFABI name."); + LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n"); + Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M); + assert(VI.hasValue() && "Cannot add an invalid VFABI name."); assert(M->getNamedValue(VI.getValue().VectorName) && "Cannot add variant to attribute: " "vector function declaration is missing."); diff --git a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp index 1c5c41abc6823..7083789267d9c 100644 --- a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp +++ b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp @@ -55,7 +55,7 @@ public: Hasher.final(Hash); SmallString<32> Result; MD5::stringifyResult(Hash, Result); - TheHash = Result.str(); + TheHash = std::string(Result.str()); return TheHash; } }; diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 
dda2867f44b24..99b64a7462f62 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" @@ -39,7 +40,6 @@ #define DEBUG_TYPE "predicateinfo" using namespace llvm; using namespace PatternMatch; -using namespace llvm::PredicateInfoClasses; INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo", "PredicateInfo Printer", false, false) @@ -83,7 +83,6 @@ getBlockEdge(const PredicateBase *PB) { } namespace llvm { -namespace PredicateInfoClasses { enum LocalNum { // Operations that must appear first in the block. LN_First, @@ -109,8 +108,7 @@ struct ValueDFS { }; // Perform a strict weak ordering on instructions and arguments. -static bool valueComesBefore(OrderedInstructions &OI, const Value *A, - const Value *B) { +static bool valueComesBefore(const Value *A, const Value *B) { auto *ArgA = dyn_cast_or_null<Argument>(A); auto *ArgB = dyn_cast_or_null<Argument>(B); if (ArgA && !ArgB) @@ -119,17 +117,14 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A, return false; if (ArgA && ArgB) return ArgA->getArgNo() < ArgB->getArgNo(); - return OI.dfsBefore(cast<Instruction>(A), cast<Instruction>(B)); + return cast<Instruction>(A)->comesBefore(cast<Instruction>(B)); } -// This compares ValueDFS structures, creating OrderedBasicBlocks where -// necessary to compare uses/defs in the same block. Doing so allows us to walk -// the minimum number of instructions necessary to compute our def/use ordering. +// This compares ValueDFS structures. Doing so allows us to walk the minimum +// number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { DominatorTree &DT; - OrderedInstructions &OI; - ValueDFS_Compare(DominatorTree &DT, OrderedInstructions &OI) - : DT(DT), OI(OI) {} + ValueDFS_Compare(DominatorTree &DT) : DT(DT) {} bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) @@ -210,14 +205,14 @@ struct ValueDFS_Compare { // numbering will say the placed predicaeinfos should go first (IE // LN_beginning), so we won't be in this function. For assumes, we will end // up here, beause we need to order the def we will place relative to the - // assume. So for the purpose of ordering, we pretend the def is the assume - // because that is where we will insert the info. + // assume. So for the purpose of ordering, we pretend the def is right + // after the assume, because that is where we will insert the info. if (!VD.U) { assert(VD.PInfo && "No def, no use, and no predicateinfo should not occur"); assert(isa<PredicateAssume>(VD.PInfo) && "Middle of block should only occur for assumes"); - return cast<PredicateAssume>(VD.PInfo)->AssumeInst; + return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode(); } return nullptr; } @@ -243,18 +238,71 @@ struct ValueDFS_Compare { auto *ArgB = dyn_cast_or_null<Argument>(BDef); if (ArgA || ArgB) - return valueComesBefore(OI, ArgA, ArgB); + return valueComesBefore(ArgA, ArgB); auto *AInst = getDefOrUser(ADef, A.U); auto *BInst = getDefOrUser(BDef, B.U); - return valueComesBefore(OI, AInst, BInst); + return valueComesBefore(AInst, BInst); } }; -} // namespace PredicateInfoClasses +class PredicateInfoBuilder { + // Used to store information about each value we might rename. 
+ struct ValueInfo { + SmallVector<PredicateBase *, 4> Infos; + }; + + PredicateInfo &PI; + Function &F; + DominatorTree &DT; + AssumptionCache &AC; + + // This stores info about each operand or comparison result we make copies + // of. The real ValueInfos start at index 1, index 0 is unused so that we + // can more easily detect invalid indexing. + SmallVector<ValueInfo, 32> ValueInfos; + + // This gives the index into the ValueInfos array for a given Value. Because + // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell + // whether it returned a valid result. + DenseMap<Value *, unsigned int> ValueInfoNums; + + // The set of edges along which we can only handle phi uses, due to critical + // edges. + DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly; + + ValueInfo &getOrCreateValueInfo(Value *); + const ValueInfo &getValueInfo(Value *) const; + + void processAssume(IntrinsicInst *, BasicBlock *, + SmallVectorImpl<Value *> &OpsToRename); + void processBranch(BranchInst *, BasicBlock *, + SmallVectorImpl<Value *> &OpsToRename); + void processSwitch(SwitchInst *, BasicBlock *, + SmallVectorImpl<Value *> &OpsToRename); + void renameUses(SmallVectorImpl<Value *> &OpsToRename); + void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op, + PredicateBase *PB); + + typedef SmallVectorImpl<ValueDFS> ValueDFSStack; + void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &); + Value *materializeStack(unsigned int &, ValueDFSStack &, Value *); + bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const; + void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &); + +public: + PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT, + AssumptionCache &AC) + : PI(PI), F(F), DT(DT), AC(AC) { + // Push an empty operand info so that we can detect 0 as not finding one + ValueInfos.resize(1); + } + + void buildPredicateInfo(); +}; -bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack, - const ValueDFS &VDUse) const { +bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack, + const ValueDFS &VDUse) const { if (Stack.empty()) return false; // If it's a phi only use, make sure it's for this phi node edge, and that the @@ -281,15 +329,15 @@ bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack, VDUse.DFSOut <= Stack.back().DFSOut); } -void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack, - const ValueDFS &VD) { +void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack, + const ValueDFS &VD) { while (!Stack.empty() && !stackIsInScope(Stack, VD)) Stack.pop_back(); } // Convert the uses of Op into a vector of uses, associating global and local // DFS info with each one. -void PredicateInfo::convertUsesToDFSOrdered( +void PredicateInfoBuilder::convertUsesToDFSOrdered( Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) { for (auto &U : Op->uses()) { if (auto *I = dyn_cast<Instruction>(U.getUser())) { @@ -338,19 +386,20 @@ void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) { } // Add Op, PB to the list of value infos for Op, and mark Op to be renamed. 
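Consumers of the analysis are unaffected by extracting the builder; a minimal client might look like this sketch (the counting function is hypothetical):

#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
using namespace llvm;

static unsigned countPredicatedCopies(Function &F, DominatorTree &DT,
                                      AssumptionCache &AC) {
  PredicateInfo PI(F, DT, AC); // runs PredicateInfoBuilder internally
  unsigned N = 0;
  for (Instruction &I : instructions(F))
    if (PI.getPredicateInfoFor(&I))
      ++N; // I is an ssa.copy carrying branch/assume/switch information
  return N;
}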
-void PredicateInfo::addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op, - PredicateBase *PB) { +void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename, + Value *Op, PredicateBase *PB) { auto &OperandInfo = getOrCreateValueInfo(Op); if (OperandInfo.Infos.empty()) OpsToRename.push_back(Op); - AllInfos.push_back(PB); + PI.AllInfos.push_back(PB); OperandInfo.Infos.push_back(PB); } // Process an assume instruction and place relevant operations we want to rename // into OpsToRename. -void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, - SmallVectorImpl<Value *> &OpsToRename) { +void PredicateInfoBuilder::processAssume( + IntrinsicInst *II, BasicBlock *AssumeBB, + SmallVectorImpl<Value *> &OpsToRename) { // See if we have a comparison we support SmallVector<Value *, 8> CmpOperands; SmallVector<Value *, 2> ConditionsToProcess; @@ -389,8 +438,9 @@ void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, // Process a block terminating branch, and place relevant operations to be // renamed into OpsToRename. -void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, - SmallVectorImpl<Value *> &OpsToRename) { +void PredicateInfoBuilder::processBranch( + BranchInst *BI, BasicBlock *BranchBB, + SmallVectorImpl<Value *> &OpsToRename) { BasicBlock *FirstBB = BI->getSuccessor(0); BasicBlock *SecondBB = BI->getSuccessor(1); SmallVector<BasicBlock *, 2> SuccsToProcess; @@ -459,8 +509,9 @@ void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, } // Process a block terminating switch, and place relevant operations to be // renamed into OpsToRename. -void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB, - SmallVectorImpl<Value *> &OpsToRename) { +void PredicateInfoBuilder::processSwitch( + SwitchInst *SI, BasicBlock *BranchBB, + SmallVectorImpl<Value *> &OpsToRename) { Value *Op = SI->getCondition(); if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse()) return; @@ -486,7 +537,7 @@ void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB, } // Build predicate info for our function -void PredicateInfo::buildPredicateInfo() { +void PredicateInfoBuilder::buildPredicateInfo() { DT.updateDFSNumbers(); // Collect operands to rename from all conditional branch terminators, as well // as assume statements. @@ -530,9 +581,9 @@ static Function *getCopyDeclaration(Module *M, Type *Ty) { // Given the renaming stack, make all the operands currently on the stack real // by inserting them into the IR. Return the last operation's value. -Value *PredicateInfo::materializeStack(unsigned int &Counter, - ValueDFSStack &RenameStack, - Value *OrigOp) { +Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, + ValueDFSStack &RenameStack, + Value *OrigOp) { // Find the first thing we have to materialize auto RevIter = RenameStack.rbegin(); for (; RevIter != RenameStack.rend(); ++RevIter) @@ -549,6 +600,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def; ValueDFS &Result = *RenameIter; auto *ValInfo = Result.PInfo; + ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin() + ? OrigOp + : (RenameStack.end() - Start - 1)->Def; // For edge predicates, we can just place the operand in the block before // the terminator. For assume, we have to place it right before the assume // to ensure we dominate all of our uses. 
Always insert right before the @@ -558,21 +612,23 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, IRBuilder<> B(getBranchTerminator(ValInfo)); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); if (IF->users().empty()) - CreatedDeclarations.insert(IF); + PI.CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++)); - PredicateMap.insert({PIC, ValInfo}); + PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } else { auto *PAssume = dyn_cast<PredicateAssume>(ValInfo); assert(PAssume && "Should not have gotten here without it being an assume"); - IRBuilder<> B(PAssume->AssumeInst); + // Insert the predicate directly after the assume. While it also holds + // directly before it, assume(i1 true) is not a useful fact. + IRBuilder<> B(PAssume->AssumeInst->getNextNode()); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); if (IF->users().empty()) - CreatedDeclarations.insert(IF); + PI.CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op); - PredicateMap.insert({PIC, ValInfo}); + PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } } @@ -598,8 +654,8 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) { - ValueDFS_Compare Compare(DT, OI); +void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) { + ValueDFS_Compare Compare(DT); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n"); @@ -719,7 +775,8 @@ void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) { } } -PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) { +PredicateInfoBuilder::ValueInfo & +PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) { auto OIN = ValueInfoNums.find(Operand); if (OIN == ValueInfoNums.end()) { // This will grow it @@ -732,8 +789,8 @@ PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) { return ValueInfos[OIN->second]; } -const PredicateInfo::ValueInfo & -PredicateInfo::getValueInfo(Value *Operand) const { +const PredicateInfoBuilder::ValueInfo & +PredicateInfoBuilder::getValueInfo(Value *Operand) const { auto OINI = ValueInfoNums.lookup(Operand); assert(OINI != 0 && "Operand was not really in the Value Info Numbers"); assert(OINI < ValueInfos.size() && @@ -743,10 +800,9 @@ PredicateInfo::getValueInfo(Value *Operand) const { PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT, AssumptionCache &AC) - : F(F), DT(DT), AC(AC), OI(&DT) { - // Push an empty operand info so that we can detect 0 as not finding one - ValueInfos.resize(1); - buildPredicateInfo(); + : F(F) { + PredicateInfoBuilder Builder(*this, F, DT, AC); + Builder.buildPredicateInfo(); } // Remove all declarations we created . 
The PredicateInfo consumers are @@ -829,11 +885,11 @@ class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter { public: PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {} - virtual void emitBasicBlockStartAnnot(const BasicBlock *BB, - formatted_raw_ostream &OS) {} + void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) override {} - virtual void emitInstructionAnnot(const Instruction *I, - formatted_raw_ostream &OS) { + void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) override { if (const auto *PI = PredInfo->getPredicateInfoFor(I)) { OS << "; Has predicate info\n"; if (const auto *PB = dyn_cast<PredicateBranch>(PI)) { @@ -842,18 +898,21 @@ public: PB->From->printAsOperand(OS); OS << ","; PB->To->printAsOperand(OS); - OS << "] }\n"; + OS << "]"; } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) { OS << "; switch predicate info { CaseValue: " << *PS->CaseValue << " Switch:" << *PS->Switch << " Edge: ["; PS->From->printAsOperand(OS); OS << ","; PS->To->printAsOperand(OS); - OS << "] }\n"; + OS << "]"; } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) { OS << "; assume predicate info {" - << " Comparison:" << *PA->Condition << " }\n"; + << " Comparison:" << *PA->Condition; } + OS << ", RenamedOp: "; + PI->RenamedOp->printAsOperand(OS, false); + OS << " }\n"; } } }; diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index d58e1ea574ef8..c7e9c919ec471 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -595,11 +595,6 @@ void PromoteMem2Reg::run() { // Keep the reverse mapping of the 'Allocas' array for the rename pass. AllocaLookup[Allocas[AllocaNum]] = AllocaNum; - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need PHI - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. - // Unique the set of defining blocks for efficient lookup. SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index bffdd115d940c..57df2334c750f 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -56,7 +56,7 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { else getAvailableVals(AV).clear(); ProtoType = Ty; - ProtoName = Name; + ProtoName = std::string(Name); } bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { @@ -195,11 +195,6 @@ void SSAUpdater::RewriteUse(Use &U) { else V = GetValueInMiddleOfBlock(User->getParent()); - // Notify that users of the existing value that it is being replaced. - Value *OldVal = U.get(); - if (OldVal != V && OldVal->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(OldVal, V); - U.set(V); } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp new file mode 100644 index 0000000000000..71b48482f26aa --- /dev/null +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -0,0 +1,2569 @@ +//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the scalar evolution expander, +// which is used to generate the code corresponding to a given scalar evolution +// expression. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +cl::opt<unsigned> llvm::SCEVCheapExpansionBudget( + "scev-cheap-expansion-budget", cl::Hidden, cl::init(4), + cl::desc("When performing SCEV expansion only if it is cheap to do, this " + "controls the budget that is considered cheap (default = 4)")); + +using namespace PatternMatch; + +/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP, +/// reusing an existing cast if a suitable one exists, moving an existing +/// cast if a suitable one exists but isn't in the right place, or +/// creating a new one. +Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, + Instruction::CastOps Op, + BasicBlock::iterator IP) { + // This function must be called with the builder having a valid insertion + // point. It doesn't need to be the actual IP where the uses of the returned + // cast will be added, but it must dominate such IP. + // We use this precondition to produce a cast that will dominate all its + // uses. In particular, this is crucial for the case where the builder's + // insertion point *is* the point where we were asked to put the cast. + // Since we don't know the builder's insertion point is actually + // where the uses will be added (only that it dominates it), we are + // not allowed to move it. + BasicBlock::iterator BIP = Builder.GetInsertPoint(); + + Instruction *Ret = nullptr; + + // Check to see if there is already a cast! + for (User *U : V->users()) + if (U->getType() == Ty) + if (CastInst *CI = dyn_cast<CastInst>(U)) + if (CI->getOpcode() == Op) { + // If the cast isn't where we want it, create a new cast at IP. + // Likewise, do not reuse a cast at BIP because it must dominate + // instructions that might be inserted before BIP. + if (BasicBlock::iterator(CI) != IP || BIP == IP) { + // Create a new cast, and leave the old cast in place in case + // it is being used as an insert point. + Ret = CastInst::Create(Op, V, Ty, "", &*IP); + Ret->takeName(CI); + CI->replaceAllUsesWith(Ret); + break; + } + Ret = CI; + break; + } + + // Create a new cast. + if (!Ret) + Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP); + + // We assert at the end of the function since IP might point to an + // instruction with different dominance properties than a cast + // (an invoke for example) and not dominate BIP (but the cast does). 
+ assert(SE.DT.dominates(Ret, &*BIP)); + + rememberInstruction(Ret); + return Ret; +} + +static BasicBlock::iterator findInsertPointAfter(Instruction *I, + BasicBlock *MustDominate) { + BasicBlock::iterator IP = ++I->getIterator(); + if (auto *II = dyn_cast<InvokeInst>(I)) + IP = II->getNormalDest()->begin(); + + while (isa<PHINode>(IP)) + ++IP; + + if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) { + ++IP; + } else if (isa<CatchSwitchInst>(IP)) { + IP = MustDominate->getFirstInsertionPt(); + } else { + assert(!IP->isEHPad() && "unexpected eh pad!"); + } + + return IP; +} + +/// InsertNoopCastOfTo - Insert a cast of V to the specified type, +/// which must be possible with a noop cast, doing what we can to share +/// the casts. +Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) { + Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false); + assert((Op == Instruction::BitCast || + Op == Instruction::PtrToInt || + Op == Instruction::IntToPtr) && + "InsertNoopCastOfTo cannot perform non-noop casts!"); + assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) && + "InsertNoopCastOfTo cannot change sizes!"); + + // Short-circuit unnecessary bitcasts. + if (Op == Instruction::BitCast) { + if (V->getType() == Ty) + return V; + if (CastInst *CI = dyn_cast<CastInst>(V)) { + if (CI->getOperand(0)->getType() == Ty) + return CI->getOperand(0); + } + } + // Short-circuit unnecessary inttoptr<->ptrtoint casts. + if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) && + SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) { + if (CastInst *CI = dyn_cast<CastInst>(V)) + if ((CI->getOpcode() == Instruction::PtrToInt || + CI->getOpcode() == Instruction::IntToPtr) && + SE.getTypeSizeInBits(CI->getType()) == + SE.getTypeSizeInBits(CI->getOperand(0)->getType())) + return CI->getOperand(0); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if ((CE->getOpcode() == Instruction::PtrToInt || + CE->getOpcode() == Instruction::IntToPtr) && + SE.getTypeSizeInBits(CE->getType()) == + SE.getTypeSizeInBits(CE->getOperand(0)->getType())) + return CE->getOperand(0); + } + + // Fold a cast of a constant. + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getCast(Op, C, Ty); + + // Cast the argument at the beginning of the entry block, after + // any bitcasts of other arguments. + if (Argument *A = dyn_cast<Argument>(V)) { + BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin(); + while ((isa<BitCastInst>(IP) && + isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) && + cast<BitCastInst>(IP)->getOperand(0) != A) || + isa<DbgInfoIntrinsic>(IP)) + ++IP; + return ReuseOrCreateCast(A, Ty, Op, IP); + } + + // Cast the instruction immediately after the instruction. + Instruction *I = cast<Instruction>(V); + BasicBlock::iterator IP = findInsertPointAfter(I, Builder.GetInsertBlock()); + return ReuseOrCreateCast(I, Ty, Op, IP); +} + +/// InsertBinop - Insert the specified binary operator, doing a small amount +/// of work to avoid inserting an obviously redundant operation, and hoisting +/// to an outer loop when the opportunity is there and it is safe. +Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, + Value *LHS, Value *RHS, + SCEV::NoWrapFlags Flags, bool IsSafeToHoist) { + // Fold a binop with constant operands. + if (Constant *CLHS = dyn_cast<Constant>(LHS)) + if (Constant *CRHS = dyn_cast<Constant>(RHS)) + return ConstantExpr::get(Opcode, CLHS, CRHS); + + // Do a quick scan to see if we have this binop nearby. 
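InsertNoopCastOfTo above goes out of its way not to stack redundant casts: a same-width ptrtoint/inttoptr round trip, for example, simply yields the original value. A toy, non-LLVM sketch of that short-circuit on a hand-rolled value node (all names are made up for illustration):

#include <cassert>

enum class CastOp { None, PtrToInt, IntToPtr };

// A toy value node: either a leaf or a cast of another node.
struct Node {
  CastOp Cast = CastOp::None;
  unsigned Bits = 64;
  Node *Operand = nullptr;
};

// Mirrors the "short-circuit unnecessary inttoptr<->ptrtoint casts" idea: if V
// is already a same-width ptrtoint/inttoptr, casting it back is a no-op, so
// just hand back the original operand instead of creating a new cast.
Node *insertNoopCast(Node *V, CastOp Op, unsigned Bits) {
  if ((Op == CastOp::PtrToInt || Op == CastOp::IntToPtr) &&
      (V->Cast == CastOp::PtrToInt || V->Cast == CastOp::IntToPtr) &&
      V->Bits == Bits && V->Operand && V->Operand->Bits == Bits)
    return V->Operand;
  Node *N = new Node;
  N->Cast = Op;
  N->Bits = Bits;
  N->Operand = V;
  return N;
}

int main() {
  Node Ptr;                                               // a 64-bit pointer value
  Node *AsInt = insertNoopCast(&Ptr, CastOp::PtrToInt, 64);
  Node *Back = insertNoopCast(AsInt, CastOp::IntToPtr, 64);
  assert(Back == &Ptr && "round-trip cast folded to the original value");
  (void)Back;
  delete AsInt;
}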
If so, reuse it. + unsigned ScanLimit = 6; + BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin(); + // Scanning starts from the last instruction before the insertion point. + BasicBlock::iterator IP = Builder.GetInsertPoint(); + if (IP != BlockBegin) { + --IP; + for (; ScanLimit; --IP, --ScanLimit) { + // Don't count dbg.value against the ScanLimit, to avoid perturbing the + // generated code. + if (isa<DbgInfoIntrinsic>(IP)) + ScanLimit++; + + auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) { + // Ensure that no-wrap flags match. + if (isa<OverflowingBinaryOperator>(I)) { + if (I->hasNoSignedWrap() != (Flags & SCEV::FlagNSW)) + return true; + if (I->hasNoUnsignedWrap() != (Flags & SCEV::FlagNUW)) + return true; + } + // Conservatively, do not use any instruction which has any of exact + // flags installed. + if (isa<PossiblyExactOperator>(I) && I->isExact()) + return true; + return false; + }; + if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS && + IP->getOperand(1) == RHS && !canGenerateIncompatiblePoison(&*IP)) + return &*IP; + if (IP == BlockBegin) break; + } + } + + // Save the original insertion point so we can restore it when we're done. + DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc(); + SCEVInsertPointGuard Guard(Builder, this); + + if (IsSafeToHoist) { + // Move the insertion point out of as many loops as we can. + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { + if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break; + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) break; + + // Ok, move up a level. + Builder.SetInsertPoint(Preheader->getTerminator()); + } + } + + // If we haven't found this binop, insert it. + Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS)); + BO->setDebugLoc(Loc); + if (Flags & SCEV::FlagNUW) + BO->setHasNoUnsignedWrap(); + if (Flags & SCEV::FlagNSW) + BO->setHasNoSignedWrap(); + rememberInstruction(BO); + + return BO; +} + +/// FactorOutConstant - Test if S is divisible by Factor, using signed +/// division. If so, update S with Factor divided out and return true. +/// S need not be evenly divisible if a reasonable remainder can be +/// computed. +static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, + const SCEV *Factor, ScalarEvolution &SE, + const DataLayout &DL) { + // Everything is divisible by one. + if (Factor->isOne()) + return true; + + // x/x == 1. + if (S == Factor) { + S = SE.getConstant(S->getType(), 1); + return true; + } + + // For a Constant, check for a multiple of the given factor. + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { + // 0/x == 0. + if (C->isZero()) + return true; + // Check for divisibility. + if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) { + ConstantInt *CI = + ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt())); + // If the quotient is zero and the remainder is non-zero, reject + // the value at this scale. It will be considered for subsequent + // smaller scales. + if (!CI->isZero()) { + const SCEV *Div = SE.getConstant(CI); + S = Div; + Remainder = SE.getAddExpr( + Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt()))); + return true; + } + } + } + + // In a Mul, check if there is a constant operand which is a multiple + // of the given factor. + if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) { + // Size is known, check if there is a constant operand which is a multiple + // of the given factor. If so, we can factor it. 
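The scan in InsertBinop above looks back through at most six preceding instructions for an identical binop it can reuse, skipping debug intrinsics and rejecting candidates whose nsw/nuw/exact flags could introduce poison. A simplified standalone version of that scan over a toy instruction list (names and fields are illustrative only):

#include <string>
#include <vector>

struct Inst {
  std::string Opcode;
  int LHS, RHS;   // operand ids
  bool NSW;       // no-signed-wrap flag
};

// Look back at most ScanLimit instructions before InsertIdx for an identical
// binop with matching flags; return its index, or -1 if none was found.
int findReusableBinop(const std::vector<Inst> &Block, size_t InsertIdx,
                      const Inst &Want, unsigned ScanLimit = 6) {
  size_t I = InsertIdx;
  while (I > 0 && ScanLimit > 0) {
    --I;
    const Inst &C = Block[I];
    if (C.Opcode == "dbg.value")   // debug values don't count toward the limit
      continue;
    if (C.Opcode == Want.Opcode && C.LHS == Want.LHS && C.RHS == Want.RHS &&
        C.NSW == Want.NSW)         // flags must match, or reuse could add poison
      return static_cast<int>(I);
    --ScanLimit;
  }
  return -1;
}

int main() {
  std::vector<Inst> Block = {{"add", 1, 2, true}, {"dbg.value"}, {"mul", 3, 4}};
  // Finds the existing "add 1, 2" at index 0 instead of emitting a new one.
  return findReusableBinop(Block, Block.size(), {"add", 1, 2, true}) == 0 ? 0 : 1;
}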
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0))) + if (!C->getAPInt().srem(FC->getAPInt())) { + SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end()); + NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt())); + S = SE.getMulExpr(NewMulOps); + return true; + } + } + + // In an AddRec, check if both start and step are divisible. + if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) { + const SCEV *Step = A->getStepRecurrence(SE); + const SCEV *StepRem = SE.getConstant(Step->getType(), 0); + if (!FactorOutConstant(Step, StepRem, Factor, SE, DL)) + return false; + if (!StepRem->isZero()) + return false; + const SCEV *Start = A->getStart(); + if (!FactorOutConstant(Start, Remainder, Factor, SE, DL)) + return false; + S = SE.getAddRecExpr(Start, Step, A->getLoop(), + A->getNoWrapFlags(SCEV::FlagNW)); + return true; + } + + return false; +} + +/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs +/// is the number of SCEVAddRecExprs present, which are kept at the end of +/// the list. +/// +static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops, + Type *Ty, + ScalarEvolution &SE) { + unsigned NumAddRecs = 0; + for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i) + ++NumAddRecs; + // Group Ops into non-addrecs and addrecs. + SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs); + SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end()); + // Let ScalarEvolution sort and simplify the non-addrecs list. + const SCEV *Sum = NoAddRecs.empty() ? + SE.getConstant(Ty, 0) : + SE.getAddExpr(NoAddRecs); + // If it returned an add, use the operands. Otherwise it simplified + // the sum into a single value, so just use that. + Ops.clear(); + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum)) + Ops.append(Add->op_begin(), Add->op_end()); + else if (!Sum->isZero()) + Ops.push_back(Sum); + // Then append the addrecs. + Ops.append(AddRecs.begin(), AddRecs.end()); +} + +/// SplitAddRecs - Flatten a list of add operands, moving addrec start values +/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}. +/// This helps expose more opportunities for folding parts of the expressions +/// into GEP indices. +/// +static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops, + Type *Ty, + ScalarEvolution &SE) { + // Find the addrecs. + SmallVector<const SCEV *, 8> AddRecs; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) { + const SCEV *Start = A->getStart(); + if (Start->isZero()) break; + const SCEV *Zero = SE.getConstant(Ty, 0); + AddRecs.push_back(SE.getAddRecExpr(Zero, + A->getStepRecurrence(SE), + A->getLoop(), + A->getNoWrapFlags(SCEV::FlagNW))); + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) { + Ops[i] = Zero; + Ops.append(Add->op_begin(), Add->op_end()); + e += Add->getNumOperands(); + } else { + Ops[i] = Start; + } + } + if (!AddRecs.empty()) { + // Add the addrecs onto the end of the list. + Ops.append(AddRecs.begin(), AddRecs.end()); + // Resort the operand list, moving any constants to the front. + SimplifyAddOperands(Ops, Ty, SE); + } +} + +/// expandAddToGEP - Expand an addition expression with a pointer type into +/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps +/// BasicAliasAnalysis and other passes analyze the result. See the rules +/// for getelementptr vs. 
inttoptr in +/// http://llvm.org/docs/LangRef.html#pointeraliasing +/// for details. +/// +/// Design note: The correctness of using getelementptr here depends on +/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as +/// they may introduce pointer arithmetic which may not be safely converted +/// into getelementptr. +/// +/// Design note: It might seem desirable for this function to be more +/// loop-aware. If some of the indices are loop-invariant while others +/// aren't, it might seem desirable to emit multiple GEPs, keeping the +/// loop-invariant portions of the overall computation outside the loop. +/// However, there are a few reasons this is not done here. Hoisting simple +/// arithmetic is a low-level optimization that often isn't very +/// important until late in the optimization process. In fact, passes +/// like InstructionCombining will combine GEPs, even if it means +/// pushing loop-invariant computation down into loops, so even if the +/// GEPs were split here, the work would quickly be undone. The +/// LoopStrengthReduction pass, which is usually run quite late (and +/// after the last InstructionCombining pass), takes care of hoisting +/// loop-invariant portions of expressions, after considering what +/// can be folded using target addressing modes. +/// +Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, + const SCEV *const *op_end, + PointerType *PTy, + Type *Ty, + Value *V) { + Type *OriginalElTy = PTy->getElementType(); + Type *ElTy = OriginalElTy; + SmallVector<Value *, 4> GepIndices; + SmallVector<const SCEV *, 8> Ops(op_begin, op_end); + bool AnyNonZeroIndices = false; + + // Split AddRecs up into parts as either of the parts may be usable + // without the other. + SplitAddRecs(Ops, Ty, SE); + + Type *IntIdxTy = DL.getIndexType(PTy); + + // Descend down the pointer's type and attempt to convert the other + // operands into GEP indices, at each level. The first index in a GEP + // indexes into the array implied by the pointer operand; the rest of + // the indices index into the element or field type selected by the + // preceding index. + for (;;) { + // If the scale size is not 0, attempt to factor out a scale for + // array indexing. + SmallVector<const SCEV *, 8> ScaledOps; + if (ElTy->isSized()) { + const SCEV *ElSize = SE.getSizeOfExpr(IntIdxTy, ElTy); + if (!ElSize->isZero()) { + SmallVector<const SCEV *, 8> NewOps; + for (const SCEV *Op : Ops) { + const SCEV *Remainder = SE.getConstant(Ty, 0); + if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) { + // Op now has ElSize factored out. + ScaledOps.push_back(Op); + if (!Remainder->isZero()) + NewOps.push_back(Remainder); + AnyNonZeroIndices = true; + } else { + // The operand was not divisible, so add it to the list of operands + // we'll scan next iteration. + NewOps.push_back(Op); + } + } + // If we made any changes, update Ops. + if (!ScaledOps.empty()) { + Ops = NewOps; + SimplifyAddOperands(Ops, Ty, SE); + } + } + } + + // Record the scaled array index for this level of the type. If + // we didn't find any operands that could be factored, tentatively + // assume that element zero was selected (since the zero offset + // would obviously be folded away). + Value *Scaled = ScaledOps.empty() ? + Constant::getNullValue(Ty) : + expandCodeFor(SE.getAddExpr(ScaledOps), Ty); + GepIndices.push_back(Scaled); + + // Collect struct field index operands. + while (StructType *STy = dyn_cast<StructType>(ElTy)) { + bool FoundFieldNo = false; + // An empty struct has no fields. 
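The loop above repeatedly factors the element size out of the pending add operands to turn byte offsets into array indices, carrying anything that does not divide evenly down to the next level of the type. The scalar analogue for a constant offset, using a made-up helper name:

#include <cstdint>
#include <cstdio>

// Split a constant byte offset into an element index and a leftover byte
// remainder for a given element size; a plain-integer stand-in for what
// FactorOutConstant does on SCEVs when expandAddToGEP builds array indices.
struct Split { int64_t Index; int64_t Remainder; };

Split factorOutElementSize(int64_t ByteOffset, int64_t ElementSize) {
  return {ByteOffset / ElementSize, ByteOffset % ElementSize};
}

int main() {
  // e.g. a 12-byte element type: offset 40 becomes index 3 with 4 bytes left
  // over, and the remainder is carried on to the next level of the type.
  Split S = factorOutElementSize(40, 12);
  std::printf("index=%lld remainder=%lld\n",
              static_cast<long long>(S.Index),
              static_cast<long long>(S.Remainder));
}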
+ if (STy->getNumElements() == 0) break; + // Field offsets are known. See if a constant offset falls within any of + // the struct fields. + if (Ops.empty()) + break; + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0])) + if (SE.getTypeSizeInBits(C->getType()) <= 64) { + const StructLayout &SL = *DL.getStructLayout(STy); + uint64_t FullOffset = C->getValue()->getZExtValue(); + if (FullOffset < SL.getSizeInBytes()) { + unsigned ElIdx = SL.getElementContainingOffset(FullOffset); + GepIndices.push_back( + ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx)); + ElTy = STy->getTypeAtIndex(ElIdx); + Ops[0] = + SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx)); + AnyNonZeroIndices = true; + FoundFieldNo = true; + } + } + // If no struct field offsets were found, tentatively assume that + // field zero was selected (since the zero offset would obviously + // be folded away). + if (!FoundFieldNo) { + ElTy = STy->getTypeAtIndex(0u); + GepIndices.push_back( + Constant::getNullValue(Type::getInt32Ty(Ty->getContext()))); + } + } + + if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy)) + ElTy = ATy->getElementType(); + else + // FIXME: Handle VectorType. + // E.g., If ElTy is scalable vector, then ElSize is not a compile-time + // constant, therefore can not be factored out. The generated IR is less + // ideal with base 'V' cast to i8* and do ugly getelementptr over that. + break; + } + + // If none of the operands were convertible to proper GEP indices, cast + // the base to i8* and do an ugly getelementptr with that. It's still + // better than ptrtoint+arithmetic+inttoptr at least. + if (!AnyNonZeroIndices) { + // Cast the base to i8*. + V = InsertNoopCastOfTo(V, + Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace())); + + assert(!isa<Instruction>(V) || + SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint())); + + // Expand the operands for a plain byte offset. + Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty); + + // Fold a GEP with constant operands. + if (Constant *CLHS = dyn_cast<Constant>(V)) + if (Constant *CRHS = dyn_cast<Constant>(Idx)) + return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ty->getContext()), + CLHS, CRHS); + + // Do a quick scan to see if we have this GEP nearby. If so, reuse it. + unsigned ScanLimit = 6; + BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin(); + // Scanning starts from the last instruction before the insertion point. + BasicBlock::iterator IP = Builder.GetInsertPoint(); + if (IP != BlockBegin) { + --IP; + for (; ScanLimit; --IP, --ScanLimit) { + // Don't count dbg.value against the ScanLimit, to avoid perturbing the + // generated code. + if (isa<DbgInfoIntrinsic>(IP)) + ScanLimit++; + if (IP->getOpcode() == Instruction::GetElementPtr && + IP->getOperand(0) == V && IP->getOperand(1) == Idx) + return &*IP; + if (IP == BlockBegin) break; + } + } + + // Save the original insertion point so we can restore it when we're done. + SCEVInsertPointGuard Guard(Builder, this); + + // Move the insertion point out of as many loops as we can. + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { + if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break; + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) break; + + // Ok, move up a level. + Builder.SetInsertPoint(Preheader->getTerminator()); + } + + // Emit a GEP. 
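For struct types, the hunk above maps a constant byte offset to the field that contains it via StructLayout::getElementContainingOffset and keeps the leftover bytes for the next level. A small stand-in for that lookup using a plain sorted offset table (toy code, not the LLVM API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Toy counterpart of StructLayout::getElementContainingOffset: FieldOffsets
// holds the starting byte offset of each field in ascending order.
unsigned elementContainingOffset(const std::vector<uint64_t> &FieldOffsets,
                                 uint64_t Offset) {
  // First field that starts strictly after Offset, minus one.
  auto It = std::upper_bound(FieldOffsets.begin(), FieldOffsets.end(), Offset);
  assert(It != FieldOffsets.begin() && "offset precedes the first field");
  return static_cast<unsigned>(std::distance(FieldOffsets.begin(), It)) - 1;
}

int main() {
  // struct { i32 a; i32 b; double c; } laid out at offsets 0, 4, 8.
  std::vector<uint64_t> Offsets = {0, 4, 8};
  unsigned Idx = elementContainingOffset(Offsets, /*byte offset*/ 10);
  // Offset 10 lands in field 2 (the double at offset 8); the expander would
  // emit that field index and keep 10 - 8 = 2 bytes as the residual offset.
  return Idx == 2 ? 0 : 1;
}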
+ Value *GEP = Builder.CreateGEP(Builder.getInt8Ty(), V, Idx, "uglygep"); + rememberInstruction(GEP); + + return GEP; + } + + { + SCEVInsertPointGuard Guard(Builder, this); + + // Move the insertion point out of as many loops as we can. + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { + if (!L->isLoopInvariant(V)) break; + + bool AnyIndexNotLoopInvariant = any_of( + GepIndices, [L](Value *Op) { return !L->isLoopInvariant(Op); }); + + if (AnyIndexNotLoopInvariant) + break; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) break; + + // Ok, move up a level. + Builder.SetInsertPoint(Preheader->getTerminator()); + } + + // Insert a pretty getelementptr. Note that this GEP is not marked inbounds, + // because ScalarEvolution may have changed the address arithmetic to + // compute a value which is beyond the end of the allocated object. + Value *Casted = V; + if (V->getType() != PTy) + Casted = InsertNoopCastOfTo(Casted, PTy); + Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep"); + Ops.push_back(SE.getUnknown(GEP)); + rememberInstruction(GEP); + } + + return expand(SE.getAddExpr(Ops)); +} + +Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty, + Value *V) { + const SCEV *const Ops[1] = {Op}; + return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V); +} + +/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for +/// SCEV expansion. If they are nested, this is the most nested. If they are +/// neighboring, pick the later. +static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B, + DominatorTree &DT) { + if (!A) return B; + if (!B) return A; + if (A->contains(B)) return B; + if (B->contains(A)) return A; + if (DT.dominates(A->getHeader(), B->getHeader())) return B; + if (DT.dominates(B->getHeader(), A->getHeader())) return A; + return A; // Arbitrarily break the tie. +} + +/// getRelevantLoop - Get the most relevant loop associated with the given +/// expression, according to PickMostRelevantLoop. +const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { + // Test whether we've already computed the most relevant loop for this SCEV. + auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr)); + if (!Pair.second) + return Pair.first->second; + + if (isa<SCEVConstant>(S)) + // A constant has no relevant loops. + return nullptr; + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + if (const Instruction *I = dyn_cast<Instruction>(U->getValue())) + return Pair.first->second = SE.LI.getLoopFor(I->getParent()); + // A non-instruction has no relevant loops. + return nullptr; + } + if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) { + const Loop *L = nullptr; + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) + L = AR->getLoop(); + for (const SCEV *Op : N->operands()) + L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT); + return RelevantLoops[N] = L; + } + if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) { + const Loop *Result = getRelevantLoop(C->getOperand()); + return RelevantLoops[C] = Result; + } + if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { + const Loop *Result = PickMostRelevantLoop( + getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT); + return RelevantLoops[D] = Result; + } + llvm_unreachable("Unexpected SCEV type!"); +} + +namespace { + +/// LoopCompare - Compare loops by PickMostRelevantLoop. 
+class LoopCompare { + DominatorTree &DT; +public: + explicit LoopCompare(DominatorTree &dt) : DT(dt) {} + + bool operator()(std::pair<const Loop *, const SCEV *> LHS, + std::pair<const Loop *, const SCEV *> RHS) const { + // Keep pointer operands sorted at the end. + if (LHS.second->getType()->isPointerTy() != + RHS.second->getType()->isPointerTy()) + return LHS.second->getType()->isPointerTy(); + + // Compare loops with PickMostRelevantLoop. + if (LHS.first != RHS.first) + return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first; + + // If one operand is a non-constant negative and the other is not, + // put the non-constant negative on the right so that a sub can + // be used instead of a negate and add. + if (LHS.second->isNonConstantNegative()) { + if (!RHS.second->isNonConstantNegative()) + return false; + } else if (RHS.second->isNonConstantNegative()) + return true; + + // Otherwise they are equivalent according to this comparison. + return false; + } +}; + +} + +Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + + // Collect all the add operands in a loop, along with their associated loops. + // Iterate in reverse so that constants are emitted last, all else equal, and + // so that pointer operands are inserted first, which the code below relies on + // to form more involved GEPs. + SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops; + for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()), + E(S->op_begin()); I != E; ++I) + OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I)); + + // Sort by loop. Use a stable sort so that constants follow non-constants and + // pointer operands precede non-pointer operands. + llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT)); + + // Emit instructions to add all the operands. Hoist as much as possible + // out of loops, and form meaningful getelementptrs where possible. + Value *Sum = nullptr; + for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) { + const Loop *CurLoop = I->first; + const SCEV *Op = I->second; + if (!Sum) { + // This is the first operand. Just expand it. + Sum = expand(Op); + ++I; + } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) { + // The running sum expression is a pointer. Try to form a getelementptr + // at this level with that as the base. + SmallVector<const SCEV *, 4> NewOps; + for (; I != E && I->first == CurLoop; ++I) { + // If the operand is SCEVUnknown and not instructions, peek through + // it, to enable more of it to be folded into the GEP. + const SCEV *X = I->second; + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X)) + if (!isa<Instruction>(U->getValue())) + X = SE.getSCEV(U->getValue()); + NewOps.push_back(X); + } + Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum); + } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) { + // The running sum is an integer, and there's a pointer at this level. + // Try to form a getelementptr. If the running sum is instructions, + // use a SCEVUnknown to avoid re-analyzing them. + SmallVector<const SCEV *, 4> NewOps; + NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) : + SE.getSCEV(Sum)); + for (++I; I != E && I->first == CurLoop; ++I) + NewOps.push_back(I->second); + Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op)); + } else if (Op->isNonConstantNegative()) { + // Instead of doing a negate and add, just do a subtract. 
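visitAddExpr above stable-sorts the (loop, operand) pairs so that, per its comments, pointer operands come first (they become GEP bases), constants follow non-constants, and non-constant negatives end up on the right where a subtract can be used. A toy comparator showing the same ordering idea (fields are illustrative only):

#include <algorithm>
#include <cstdio>
#include <vector>

// Just enough of an "operand" to mimic the ordering visitAddExpr relies on.
struct Operand {
  const char *Name;
  bool IsPointer;    // pointer operands sort to the front (GEP base)
  bool IsNegative;   // non-constant negatives sort to the back (emit sub, not neg+add)
};

bool addOperandLess(const Operand &L, const Operand &R) {
  if (L.IsPointer != R.IsPointer)
    return L.IsPointer;
  if (L.IsNegative != R.IsNegative)
    return R.IsNegative;
  return false;   // otherwise equivalent: keep the existing relative order
}

int main() {
  std::vector<Operand> Ops = {{"-n", false, true},
                              {"%base", true, false},
                              {"4", false, false}};
  // A *stable* sort keeps equivalent operands in their original order, which
  // the expander relies on for deterministic output.
  std::stable_sort(Ops.begin(), Ops.end(), addOperandLess);
  for (const Operand &Op : Ops)
    std::printf("%s ", Op.Name);
  std::printf("\n");   // prints: %base 4 -n
}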
+ Value *W = expandCodeFor(SE.getNegativeSCEV(Op), Ty); + Sum = InsertNoopCastOfTo(Sum, Ty); + Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap, + /*IsSafeToHoist*/ true); + ++I; + } else { + // A simple add. + Value *W = expandCodeFor(Op, Ty); + Sum = InsertNoopCastOfTo(Sum, Ty); + // Canonicalize a constant to the RHS. + if (isa<Constant>(Sum)) std::swap(Sum, W); + Sum = InsertBinop(Instruction::Add, Sum, W, S->getNoWrapFlags(), + /*IsSafeToHoist*/ true); + ++I; + } + } + + return Sum; +} + +Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + + // Collect all the mul operands in a loop, along with their associated loops. + // Iterate in reverse so that constants are emitted last, all else equal. + SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops; + for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()), + E(S->op_begin()); I != E; ++I) + OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I)); + + // Sort by loop. Use a stable sort so that constants follow non-constants. + llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT)); + + // Emit instructions to mul all the operands. Hoist as much as possible + // out of loops. + Value *Prod = nullptr; + auto I = OpsAndLoops.begin(); + + // Expand the calculation of X pow N in the following manner: + // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then: + // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK). + const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() { + auto E = I; + // Calculate how many times the same operand from the same loop is included + // into this power. + uint64_t Exponent = 0; + const uint64_t MaxExponent = UINT64_MAX >> 1; + // No one sane will ever try to calculate such huge exponents, but if we + // need this, we stop on UINT64_MAX / 2 because we need to exit the loop + // below when the power of 2 exceeds our Exponent, and we want it to be + // 1u << 31 at most to not deal with unsigned overflow. + while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) { + ++Exponent; + ++E; + } + assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?"); + + // Calculate powers with exponents 1, 2, 4, 8 etc. and include those of them + // that are needed into the result. + Value *P = expandCodeFor(I->second, Ty); + Value *Result = nullptr; + if (Exponent & 1) + Result = P; + for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) { + P = InsertBinop(Instruction::Mul, P, P, SCEV::FlagAnyWrap, + /*IsSafeToHoist*/ true); + if (Exponent & BinExp) + Result = Result ? InsertBinop(Instruction::Mul, Result, P, + SCEV::FlagAnyWrap, + /*IsSafeToHoist*/ true) + : P; + } + + I = E; + assert(Result && "Nothing was expanded?"); + return Result; + }; + + while (I != OpsAndLoops.end()) { + if (!Prod) { + // This is the first operand. Just expand it. + Prod = ExpandOpBinPowN(); + } else if (I->second->isAllOnesValue()) { + // Instead of doing a multiply by negative one, just do a negate. + Prod = InsertNoopCastOfTo(Prod, Ty); + Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod, + SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); + ++I; + } else { + // A simple mul. + Value *W = ExpandOpBinPowN(); + Prod = InsertNoopCastOfTo(Prod, Ty); + // Canonicalize a constant to the RHS. + if (isa<Constant>(Prod)) std::swap(Prod, W); + const APInt *RHS; + if (match(W, m_Power2(RHS))) { + // Canonicalize Prod*(1<<C) to Prod<<C. 
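ExpandOpBinPowN above expands X**N by writing N as a sum of powers of two and multiplying together the corresponding repeated squarings of X. The plain-integer version of the same scheme:

#include <cassert>
#include <cstdint>

// X**N via squaring: write N = P1 + P2 + ... + Pk with each Pi a power of two,
// then X**N = (X**P1) * (X**P2) * ... * (X**Pk).
uint64_t powBySquaring(uint64_t X, uint64_t N) {
  uint64_t Result = 1;
  uint64_t P = X;            // successively X, X^2, X^4, X^8, ...
  for (; N; N >>= 1) {
    if (N & 1)
      Result *= P;           // include this power if the bit of N is set
    P *= P;
  }
  return Result;
}

int main() {
  assert(powBySquaring(3, 5) == 243);    // 3^5 = 3^4 * 3^1
  assert(powBySquaring(2, 10) == 1024);
}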
+ assert(!Ty->isVectorTy() && "vector types are not SCEVable"); + auto NWFlags = S->getNoWrapFlags(); + // clear nsw flag if shl will produce poison value. + if (RHS->logBase2() == RHS->getBitWidth() - 1) + NWFlags = ScalarEvolution::clearFlags(NWFlags, SCEV::FlagNSW); + Prod = InsertBinop(Instruction::Shl, Prod, + ConstantInt::get(Ty, RHS->logBase2()), NWFlags, + /*IsSafeToHoist*/ true); + } else { + Prod = InsertBinop(Instruction::Mul, Prod, W, S->getNoWrapFlags(), + /*IsSafeToHoist*/ true); + } + } + } + + return Prod; +} + +Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + + Value *LHS = expandCodeFor(S->getLHS(), Ty); + if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) { + const APInt &RHS = SC->getAPInt(); + if (RHS.isPowerOf2()) + return InsertBinop(Instruction::LShr, LHS, + ConstantInt::get(Ty, RHS.logBase2()), + SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); + } + + Value *RHS = expandCodeFor(S->getRHS(), Ty); + return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap, + /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS())); +} + +/// Move parts of Base into Rest to leave Base with the minimal +/// expression that provides a pointer operand suitable for a +/// GEP expansion. +static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest, + ScalarEvolution &SE) { + while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) { + Base = A->getStart(); + Rest = SE.getAddExpr(Rest, + SE.getAddRecExpr(SE.getConstant(A->getType(), 0), + A->getStepRecurrence(SE), + A->getLoop(), + A->getNoWrapFlags(SCEV::FlagNW))); + } + if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) { + Base = A->getOperand(A->getNumOperands()-1); + SmallVector<const SCEV *, 8> NewAddOps(A->op_begin(), A->op_end()); + NewAddOps.back() = Rest; + Rest = SE.getAddExpr(NewAddOps); + ExposePointerBase(Base, Rest, SE); + } +} + +/// Determine if this is a well-behaved chain of instructions leading back to +/// the PHI. If so, it may be reused by expanded expressions. +bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV, + const Loop *L) { + if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) || + (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV))) + return false; + // If any of the operands don't dominate the insert position, bail. + // Addrec operands are always loop-invariant, so this can only happen + // if there are instructions which haven't been hoisted. + if (L == IVIncInsertLoop) { + for (User::op_iterator OI = IncV->op_begin()+1, + OE = IncV->op_end(); OI != OE; ++OI) + if (Instruction *OInst = dyn_cast<Instruction>(OI)) + if (!SE.DT.dominates(OInst, IVIncInsertPos)) + return false; + } + // Advance to the next instruction. + IncV = dyn_cast<Instruction>(IncV->getOperand(0)); + if (!IncV) + return false; + + if (IncV->mayHaveSideEffects()) + return false; + + if (IncV == PN) + return true; + + return isNormalAddRecExprPHI(PN, IncV, L); +} + +/// getIVIncOperand returns an induction variable increment's induction +/// variable operand. +/// +/// If allowScale is set, any type of GEP is allowed as long as the nonIV +/// operands dominate InsertPos. +/// +/// If allowScale is not set, ensure that a GEP increment conforms to one of the +/// simple patterns generated by getAddRecExprPHILiterally and +/// expandAddtoGEP. If the pattern isn't recognized, return NULL. 
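Two strength reductions appear above: a multiply by a power-of-two constant becomes a left shift (dropping nsw when the shift can reach the sign bit), and an unsigned divide by a power of two becomes a logical right shift. A quick scalar check of both identities, with a hand-rolled log2 standing in for APInt::logBase2:

#include <cstdint>

// log2 of a power of two (assumes V really is a power of two).
unsigned log2PowerOfTwo(uint64_t V) {
  unsigned L = 0;
  while (V > 1) { V >>= 1; ++L; }
  return L;
}

int main() {
  uint64_t X = 100;
  uint64_t C = 16;                  // a power-of-two constant operand
  unsigned K = log2PowerOfTwo(C);   // 4
  bool Ok = (X * C == (X << K))     // mul by 2^k  ->  shl by k
         && (X / C == (X >> K));    // udiv by 2^k ->  lshr by k (unsigned only)
  return Ok ? 0 : 1;
}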
+Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, + Instruction *InsertPos, + bool allowScale) { + if (IncV == InsertPos) + return nullptr; + + switch (IncV->getOpcode()) { + default: + return nullptr; + // Check for a simple Add/Sub or GEP of a loop invariant step. + case Instruction::Add: + case Instruction::Sub: { + Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1)); + if (!OInst || SE.DT.dominates(OInst, InsertPos)) + return dyn_cast<Instruction>(IncV->getOperand(0)); + return nullptr; + } + case Instruction::BitCast: + return dyn_cast<Instruction>(IncV->getOperand(0)); + case Instruction::GetElementPtr: + for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) { + if (isa<Constant>(*I)) + continue; + if (Instruction *OInst = dyn_cast<Instruction>(*I)) { + if (!SE.DT.dominates(OInst, InsertPos)) + return nullptr; + } + if (allowScale) { + // allow any kind of GEP as long as it can be hoisted. + continue; + } + // This must be a pointer addition of constants (pretty), which is already + // handled, or some number of address-size elements (ugly). Ugly geps + // have 2 operands. i1* is used by the expander to represent an + // address-size element. + if (IncV->getNumOperands() != 2) + return nullptr; + unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace(); + if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS) + && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS)) + return nullptr; + break; + } + return dyn_cast<Instruction>(IncV->getOperand(0)); + } +} + +/// If the insert point of the current builder or any of the builders on the +/// stack of saved builders has 'I' as its insert point, update it to point to +/// the instruction after 'I'. This is intended to be used when the instruction +/// 'I' is being moved. If this fixup is not done and 'I' is moved to a +/// different block, the inconsistent insert point (with a mismatched +/// Instruction and Block) can lead to an instruction being inserted in a block +/// other than its parent. +void SCEVExpander::fixupInsertPoints(Instruction *I) { + BasicBlock::iterator It(*I); + BasicBlock::iterator NewInsertPt = std::next(It); + if (Builder.GetInsertPoint() == It) + Builder.SetInsertPoint(&*NewInsertPt); + for (auto *InsertPtGuard : InsertPointGuards) + if (InsertPtGuard->GetInsertPoint() == It) + InsertPtGuard->SetInsertPoint(NewInsertPt); +} + +/// hoistStep - Attempt to hoist a simple IV increment above InsertPos to make +/// it available to other uses in this loop. Recursively hoist any operands, +/// until we reach a value that dominates InsertPos. +bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { + if (SE.DT.dominates(IncV, InsertPos)) + return true; + + // InsertPos must itself dominate IncV so that IncV's new position satisfies + // its existing users. + if (isa<PHINode>(InsertPos) || + !SE.DT.dominates(InsertPos->getParent(), IncV->getParent())) + return false; + + if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos)) + return false; + + // Check that the chain of IV operands leading back to Phi can be hoisted. + SmallVector<Instruction*, 4> IVIncs; + for(;;) { + Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true); + if (!Oper) + return false; + // IncV is safe to hoist. 
+ IVIncs.push_back(IncV); + IncV = Oper; + if (SE.DT.dominates(IncV, InsertPos)) + break; + } + for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { + fixupInsertPoints(*I); + (*I)->moveBefore(InsertPos); + } + return true; +} + +/// Determine if this cyclic phi is in a form that would have been generated by +/// LSR. We don't care if the phi was actually expanded in this pass, as long +/// as it is in a low-cost form, for example, no implied multiplication. This +/// should match any patterns generated by getAddRecExprPHILiterally and +/// expandAddtoGEP. +bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, + const Loop *L) { + for(Instruction *IVOper = IncV; + (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(), + /*allowScale=*/false));) { + if (IVOper == PN) + return true; + } + return false; +} + +/// expandIVInc - Expand an IV increment at Builder's current InsertPos. +/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may +/// need to materialize IV increments elsewhere to handle difficult situations. +Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L, + Type *ExpandTy, Type *IntTy, + bool useSubtract) { + Value *IncV; + // If the PHI is a pointer, use a GEP, otherwise use an add or sub. + if (ExpandTy->isPointerTy()) { + PointerType *GEPPtrTy = cast<PointerType>(ExpandTy); + // If the step isn't constant, don't use an implicitly scaled GEP, because + // that would require a multiply inside the loop. + if (!isa<ConstantInt>(StepV)) + GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()), + GEPPtrTy->getAddressSpace()); + IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN); + if (IncV->getType() != PN->getType()) { + IncV = Builder.CreateBitCast(IncV, PN->getType()); + rememberInstruction(IncV); + } + } else { + IncV = useSubtract ? + Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") : + Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next"); + rememberInstruction(IncV); + } + return IncV; +} + +/// Hoist the addrec instruction chain rooted in the loop phi above the +/// position. This routine assumes that this is possible (has been checked). +void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist, + Instruction *Pos, PHINode *LoopPhi) { + do { + if (DT->dominates(InstToHoist, Pos)) + break; + // Make sure the increment is where we want it. But don't move it + // down past a potential existing post-inc user. + fixupInsertPoints(InstToHoist); + InstToHoist->moveBefore(Pos); + Pos = InstToHoist; + InstToHoist = cast<Instruction>(InstToHoist->getOperand(0)); + } while (InstToHoist != LoopPhi); +} + +/// Check whether we can cheaply express the requested SCEV in terms of +/// the available PHI SCEV by truncation and/or inversion of the step. +static bool canBeCheaplyTransformed(ScalarEvolution &SE, + const SCEVAddRecExpr *Phi, + const SCEVAddRecExpr *Requested, + bool &InvertStep) { + Type *PhiTy = SE.getEffectiveSCEVType(Phi->getType()); + Type *RequestedTy = SE.getEffectiveSCEVType(Requested->getType()); + + if (RequestedTy->getIntegerBitWidth() > PhiTy->getIntegerBitWidth()) + return false; + + // Try truncate it if necessary. + Phi = dyn_cast<SCEVAddRecExpr>(SE.getTruncateOrNoop(Phi, RequestedTy)); + if (!Phi) + return false; + + // Check whether truncation will help. + if (Phi == Requested) { + InvertStep = false; + return true; + } + + // Check whether inverting will help: {R,+,-1} == R - {0,+,1}. 
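The inversion case checked at the end of the hunk above rests on the identity {R,+,-1} == R - {0,+,1}: a down-counting recurrence can be recovered from an existing up-counting PHI with a single subtract. Spelled out iteratively on plain integers:

#include <cassert>

int main() {
  // A decrementing recurrence {R,+,-1} equals R minus the canonical counter
  // {0,+,1} at every iteration, which is what lets the expander reuse an
  // existing PHI by inverting the step.
  const int R = 100;
  int DownCounter = R;   // models {R,+,-1}
  int Canonical = 0;     // models {0,+,1}
  for (int It = 0; It < 10; ++It) {
    assert(DownCounter == R - Canonical);
    DownCounter -= 1;
    Canonical += 1;
  }
}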
+ if (SE.getAddExpr(Requested->getStart(), + SE.getNegativeSCEV(Requested)) == Phi) { + InvertStep = true; + return true; + } + + return false; +} + +static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) { + if (!isa<IntegerType>(AR->getType())) + return false; + + unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth(); + Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2); + const SCEV *Step = AR->getStepRecurrence(SE); + const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy), + SE.getSignExtendExpr(AR, WideTy)); + const SCEV *ExtendAfterOp = + SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy); + return ExtendAfterOp == OpAfterExtend; +} + +static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) { + if (!isa<IntegerType>(AR->getType())) + return false; + + unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth(); + Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2); + const SCEV *Step = AR->getStepRecurrence(SE); + const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy), + SE.getZeroExtendExpr(AR, WideTy)); + const SCEV *ExtendAfterOp = + SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy); + return ExtendAfterOp == OpAfterExtend; +} + +/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand +/// the base addrec, which is the addrec without any non-loop-dominating +/// values, and return the PHI. +PHINode * +SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, + const Loop *L, + Type *ExpandTy, + Type *IntTy, + Type *&TruncTy, + bool &InvertStep) { + assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position"); + + // Reuse a previously-inserted PHI, if present. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + PHINode *AddRecPhiMatch = nullptr; + Instruction *IncV = nullptr; + TruncTy = nullptr; + InvertStep = false; + + // Only try partially matching scevs that need truncation and/or + // step-inversion if we know this loop is outside the current loop. + bool TryNonMatchingSCEV = + IVIncInsertLoop && + SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); + + for (PHINode &PN : L->getHeader()->phis()) { + if (!SE.isSCEVable(PN.getType())) + continue; + + const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN)); + if (!PhiSCEV) + continue; + + bool IsMatchingSCEV = PhiSCEV == Normalized; + // We only handle truncation and inversion of phi recurrences for the + // expanded expression if the expanded expression's loop dominates the + // loop we insert to. Check now, so we can bail out early. + if (!IsMatchingSCEV && !TryNonMatchingSCEV) + continue; + + // TODO: this possibly can be reworked to avoid this cast at all. + Instruction *TempIncV = + dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock)); + if (!TempIncV) + continue; + + // Check whether we can reuse this PHI node. + if (LSRMode) { + if (!isExpandedAddRecExprPHI(&PN, TempIncV, L)) + continue; + if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos)) + continue; + } else { + if (!isNormalAddRecExprPHI(&PN, TempIncV, L)) + continue; + } + + // Stop if we have found an exact match SCEV. + if (IsMatchingSCEV) { + IncV = TempIncV; + TruncTy = nullptr; + InvertStep = false; + AddRecPhiMatch = &PN; + break; + } + + // Try whether the phi can be translated into the requested form + // (truncated and/or offset by a constant). 
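IsIncrementNSW/IsIncrementNUW above prove no-wrap by redoing the addition in twice the bit width: if extending the operands first and extending the narrow sum give the same wide value, the narrow add cannot have wrapped. The same trick on i8/i16 scalars (the narrowing conversion relies on two's-complement wrap, as on the usual targets):

#include <cassert>
#include <cstdint>

// An i8 addition is nsw exactly when performing it in i16 after sign-extending
// both operands matches sign-extending the i8 sum.
bool addIsNSW(int8_t A, int8_t Step) {
  int16_t OpAfterExtend = int16_t(A) + int16_t(Step);   // extend, then add
  int16_t ExtendAfterOp = int16_t(int8_t(A + Step));    // add, then extend (wraps in i8)
  return ExtendAfterOp == OpAfterExtend;
}

int main() {
  assert(addIsNSW(100, 20));    // 120 fits in i8
  assert(!addIsNSW(100, 30));   // 130 wraps to -126 in i8, so not nsw
}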
+ if ((!TruncTy || InvertStep) && + canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) { + // Record the phi node. But don't stop we might find an exact match + // later. + AddRecPhiMatch = &PN; + IncV = TempIncV; + TruncTy = SE.getEffectiveSCEVType(Normalized->getType()); + } + } + + if (AddRecPhiMatch) { + // Potentially, move the increment. We have made sure in + // isExpandedAddRecExprPHI or hoistIVInc that this is possible. + if (L == IVIncInsertLoop) + hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch); + + // Ok, the add recurrence looks usable. + // Remember this PHI, even in post-inc mode. + InsertedValues.insert(AddRecPhiMatch); + // Remember the increment. + rememberInstruction(IncV); + return AddRecPhiMatch; + } + } + + // Save the original insertion point so we can restore it when we're done. + SCEVInsertPointGuard Guard(Builder, this); + + // Another AddRec may need to be recursively expanded below. For example, if + // this AddRec is quadratic, the StepV may itself be an AddRec in this + // loop. Remove this loop from the PostIncLoops set before expanding such + // AddRecs. Otherwise, we cannot find a valid position for the step + // (i.e. StepV can never dominate its loop header). Ideally, we could do + // SavedIncLoops.swap(PostIncLoops), but we generally have a single element, + // so it's not worth implementing SmallPtrSet::swap. + PostIncLoopSet SavedPostIncLoops = PostIncLoops; + PostIncLoops.clear(); + + // Expand code for the start value into the loop preheader. + assert(L->getLoopPreheader() && + "Can't expand add recurrences without a loop preheader!"); + Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy, + L->getLoopPreheader()->getTerminator()); + + // StartV must have been be inserted into L's preheader to dominate the new + // phi. + assert(!isa<Instruction>(StartV) || + SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(), + L->getHeader())); + + // Expand code for the step value. Do this before creating the PHI so that PHI + // reuse code doesn't see an incomplete PHI. + const SCEV *Step = Normalized->getStepRecurrence(SE); + // If the stride is negative, insert a sub instead of an add for the increment + // (unless it's a constant, because subtracts of constants are canonicalized + // to adds). + bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative(); + if (useSubtract) + Step = SE.getNegativeSCEV(Step); + // Expand the step somewhere that dominates the loop header. + Value *StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); + + // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if + // we actually do emit an addition. It does not apply if we emit a + // subtraction. + bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized); + bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized); + + // Create the PHI. + BasicBlock *Header = L->getHeader(); + Builder.SetInsertPoint(Header, Header->begin()); + pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header); + PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE), + Twine(IVName) + ".iv"); + rememberInstruction(PN); + + // Create the step instructions and populate the PHI. + for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) { + BasicBlock *Pred = *HPI; + + // Add a start value. + if (!L->contains(Pred)) { + PN->addIncoming(StartV, Pred); + continue; + } + + // Create a step value and add it to the PHI. 
+ // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the + // instructions at IVIncInsertPos. + Instruction *InsertPos = L == IVIncInsertLoop ? + IVIncInsertPos : Pred->getTerminator(); + Builder.SetInsertPoint(InsertPos); + Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract); + + if (isa<OverflowingBinaryOperator>(IncV)) { + if (IncrementIsNUW) + cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap(); + if (IncrementIsNSW) + cast<BinaryOperator>(IncV)->setHasNoSignedWrap(); + } + PN->addIncoming(IncV, Pred); + } + + // After expanding subexpressions, restore the PostIncLoops set so the caller + // can ensure that IVIncrement dominates the current uses. + PostIncLoops = SavedPostIncLoops; + + // Remember this PHI, even in post-inc mode. + InsertedValues.insert(PN); + + return PN; +} + +Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { + Type *STy = S->getType(); + Type *IntTy = SE.getEffectiveSCEVType(STy); + const Loop *L = S->getLoop(); + + // Determine a normalized form of this expression, which is the expression + // before any post-inc adjustment is made. + const SCEVAddRecExpr *Normalized = S; + if (PostIncLoops.count(L)) { + PostIncLoopSet Loops; + Loops.insert(L); + Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE)); + } + + // Strip off any non-loop-dominating component from the addrec start. + const SCEV *Start = Normalized->getStart(); + const SCEV *PostLoopOffset = nullptr; + if (!SE.properlyDominates(Start, L->getHeader())) { + PostLoopOffset = Start; + Start = SE.getConstant(Normalized->getType(), 0); + Normalized = cast<SCEVAddRecExpr>( + SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE), + Normalized->getLoop(), + Normalized->getNoWrapFlags(SCEV::FlagNW))); + } + + // Strip off any non-loop-dominating component from the addrec step. + const SCEV *Step = Normalized->getStepRecurrence(SE); + const SCEV *PostLoopScale = nullptr; + if (!SE.dominates(Step, L->getHeader())) { + PostLoopScale = Step; + Step = SE.getConstant(Normalized->getType(), 1); + if (!Start->isZero()) { + // The normalization below assumes that Start is constant zero, so if + // it isn't re-associate Start to PostLoopOffset. + assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?"); + PostLoopOffset = Start; + Start = SE.getConstant(Normalized->getType(), 0); + } + Normalized = + cast<SCEVAddRecExpr>(SE.getAddRecExpr( + Start, Step, Normalized->getLoop(), + Normalized->getNoWrapFlags(SCEV::FlagNW))); + } + + // Expand the core addrec. If we need post-loop scaling, force it to + // expand to an integer type to avoid the need for additional casting. + Type *ExpandTy = PostLoopScale ? IntTy : STy; + // We can't use a pointer type for the addrec if the pointer type is + // non-integral. + Type *AddRecPHIExpandTy = + DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy; + + // In some cases, we decide to reuse an existing phi node but need to truncate + // it and/or invert the step. + Type *TruncTy = nullptr; + bool InvertStep = false; + PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy, + IntTy, TruncTy, InvertStep); + + // Accommodate post-inc mode, if necessary. + Value *Result; + if (!PostIncLoops.count(L)) + Result = PN; + else { + // In PostInc mode, use the post-incremented value. 
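The PHI built here has the shape one would write by hand: the start value flows in from the preheader, the increment sits in the latch, the PHI itself is the pre-incremented value, and a PostInc user takes the incremented value instead. As a plain loop (variable names are illustrative):

#include <cstdio>

int main() {
  const int Start = 8, Step = 4;
  int iv = Start;                  // %iv = phi [Start, %preheader], [%iv.next, %latch]
  for (int Trip = 0; Trip < 3; ++Trip) {
    int iv_next = iv + Step;       // %iv.next = add %iv, Step   (the latch increment)
    std::printf("pre-inc=%d post-inc=%d\n", iv, iv_next);
    iv = iv_next;                  // value carried around the back edge
  }
}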
+ BasicBlock *LatchBlock = L->getLoopLatch(); + assert(LatchBlock && "PostInc mode requires a unique loop latch!"); + Result = PN->getIncomingValueForBlock(LatchBlock); + + // For an expansion to use the postinc form, the client must call + // expandCodeFor with an InsertPoint that is either outside the PostIncLoop + // or dominated by IVIncInsertPos. + if (isa<Instruction>(Result) && + !SE.DT.dominates(cast<Instruction>(Result), + &*Builder.GetInsertPoint())) { + // The induction variable's postinc expansion does not dominate this use. + // IVUsers tries to prevent this case, so it is rare. However, it can + // happen when an IVUser outside the loop is not dominated by the latch + // block. Adjusting IVIncInsertPos before expansion begins cannot handle + // all cases. Consider a phi outside whose operand is replaced during + // expansion with the value of the postinc user. Without fundamentally + // changing the way postinc users are tracked, the only remedy is + // inserting an extra IV increment. StepV might fold into PostLoopOffset, + // but hopefully expandCodeFor handles that. + bool useSubtract = + !ExpandTy->isPointerTy() && Step->isNonConstantNegative(); + if (useSubtract) + Step = SE.getNegativeSCEV(Step); + Value *StepV; + { + // Expand the step somewhere that dominates the loop header. + SCEVInsertPointGuard Guard(Builder, this); + StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); + } + Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract); + } + } + + // We have decided to reuse an induction variable of a dominating loop. Apply + // truncation and/or inversion of the step. + if (TruncTy) { + Type *ResTy = Result->getType(); + // Normalize the result type. + if (ResTy != SE.getEffectiveSCEVType(ResTy)) + Result = InsertNoopCastOfTo(Result, SE.getEffectiveSCEVType(ResTy)); + // Truncate the result. + if (TruncTy != Result->getType()) { + Result = Builder.CreateTrunc(Result, TruncTy); + rememberInstruction(Result); + } + // Invert the result. + if (InvertStep) { + Result = Builder.CreateSub(expandCodeFor(Normalized->getStart(), TruncTy), + Result); + rememberInstruction(Result); + } + } + + // Re-apply any non-loop-dominating scale. + if (PostLoopScale) { + assert(S->isAffine() && "Can't linearly scale non-affine recurrences."); + Result = InsertNoopCastOfTo(Result, IntTy); + Result = Builder.CreateMul(Result, + expandCodeFor(PostLoopScale, IntTy)); + rememberInstruction(Result); + } + + // Re-apply any non-loop-dominating offset. + if (PostLoopOffset) { + if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) { + if (Result->getType()->isIntegerTy()) { + Value *Base = expandCodeFor(PostLoopOffset, ExpandTy); + Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base); + } else { + Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result); + } + } else { + Result = InsertNoopCastOfTo(Result, IntTy); + Result = Builder.CreateAdd(Result, + expandCodeFor(PostLoopOffset, IntTy)); + rememberInstruction(Result); + } + } + + return Result; +} + +Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { + // In canonical mode we compute the addrec as an expression of a canonical IV + // using evaluateAtIteration and expand the resulting SCEV expression. This + // way we avoid introducing new IVs to carry on the comutation of the addrec + // throughout the loop. + // + // For nested addrecs evaluateAtIteration might need a canonical IV of a + // type wider than the addrec itself. 
Emitting a canonical IV of the + // proper type might produce non-legal types, for example expanding an i64 + // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall + // back to non-canonical mode for nested addrecs. + if (!CanonicalMode || (S->getNumOperands() > 2)) + return expandAddRecExprLiterally(S); + + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + const Loop *L = S->getLoop(); + + // First check for an existing canonical IV in a suitable type. + PHINode *CanonicalIV = nullptr; + if (PHINode *PN = L->getCanonicalInductionVariable()) + if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty)) + CanonicalIV = PN; + + // Rewrite an AddRec in terms of the canonical induction variable, if + // its type is more narrow. + if (CanonicalIV && + SE.getTypeSizeInBits(CanonicalIV->getType()) > + SE.getTypeSizeInBits(Ty)) { + SmallVector<const SCEV *, 4> NewOps(S->getNumOperands()); + for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i) + NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType()); + Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(), + S->getNoWrapFlags(SCEV::FlagNW))); + BasicBlock::iterator NewInsertPt = + findInsertPointAfter(cast<Instruction>(V), Builder.GetInsertBlock()); + V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, + &*NewInsertPt); + return V; + } + + // {X,+,F} --> X + {0,+,F} + if (!S->getStart()->isZero()) { + SmallVector<const SCEV *, 4> NewOps(S->op_begin(), S->op_end()); + NewOps[0] = SE.getConstant(Ty, 0); + const SCEV *Rest = SE.getAddRecExpr(NewOps, L, + S->getNoWrapFlags(SCEV::FlagNW)); + + // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the + // comments on expandAddToGEP for details. + const SCEV *Base = S->getStart(); + // Dig into the expression to find the pointer base for a GEP. + const SCEV *ExposedRest = Rest; + ExposePointerBase(Base, ExposedRest, SE); + // If we found a pointer, expand the AddRec with a GEP. + if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) { + // Make sure the Base isn't something exotic, such as a multiplied + // or divided pointer value. In those cases, the result type isn't + // actually a pointer type. + if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) { + Value *StartV = expand(Base); + assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!"); + return expandAddToGEP(ExposedRest, PTy, Ty, StartV); + } + } + + // Just do a normal add. Pre-expand the operands to suppress folding. + // + // The LHS and RHS values are factored out of the expand call to make the + // output independent of the argument evaluation order. + const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart())); + const SCEV *AddExprRHS = SE.getUnknown(expand(Rest)); + return expand(SE.getAddExpr(AddExprLHS, AddExprRHS)); + } + + // If we don't yet have a canonical IV, create one. + if (!CanonicalIV) { + // Create and insert the PHI node for the induction variable in the + // specified loop. + BasicBlock *Header = L->getHeader(); + pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header); + CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar", + &Header->front()); + rememberInstruction(CanonicalIV); + + SmallSet<BasicBlock *, 4> PredSeen; + Constant *One = ConstantInt::get(Ty, 1); + for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) { + BasicBlock *HP = *HPI; + if (!PredSeen.insert(HP).second) { + // There must be an incoming value for each predecessor, even the + // duplicates! 
+ CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP); + continue; + } + + if (L->contains(HP)) { + // Insert a unit add instruction right before the terminator + // corresponding to the back-edge. + Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One, + "indvar.next", + HP->getTerminator()); + Add->setDebugLoc(HP->getTerminator()->getDebugLoc()); + rememberInstruction(Add); + CanonicalIV->addIncoming(Add, HP); + } else { + CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP); + } + } + } + + // {0,+,1} --> Insert a canonical induction variable into the loop! + if (S->isAffine() && S->getOperand(1)->isOne()) { + assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) && + "IVs with types different from the canonical IV should " + "already have been handled!"); + return CanonicalIV; + } + + // {0,+,F} --> {0,+,1} * F + + // If this is a simple linear addrec, emit it now as a special case. + if (S->isAffine()) // {0,+,F} --> i*F + return + expand(SE.getTruncateOrNoop( + SE.getMulExpr(SE.getUnknown(CanonicalIV), + SE.getNoopOrAnyExtend(S->getOperand(1), + CanonicalIV->getType())), + Ty)); + + // If this is a chain of recurrences, turn it into a closed form, using the + // folders, then expandCodeFor the closed form. This allows the folders to + // simplify the expression without having to build a bunch of special code + // into this folder. + const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV. + + // Promote S up to the canonical IV type, if the cast is foldable. + const SCEV *NewS = S; + const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType()); + if (isa<SCEVAddRecExpr>(Ext)) + NewS = Ext; + + const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE); + //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n"; + + // Truncate the result down to the original type, if needed. + const SCEV *T = SE.getTruncateOrNoop(V, Ty); + return expand(T); +} + +Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *I = Builder.CreateTrunc(V, Ty); + rememberInstruction(I); + return I; +} + +Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *I = Builder.CreateZExt(V, Ty); + rememberInstruction(I); + return I; +} + +Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { + Type *Ty = SE.getEffectiveSCEVType(S->getType()); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *I = Builder.CreateSExt(V, Ty); + rememberInstruction(I); + return I; +} + +Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { + Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); + Type *Ty = LHS->getType(); + for (int i = S->getNumOperands()-2; i >= 0; --i) { + // In the case of mixed integer and pointer types, do the + // rest of the comparisons as integer. 
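A minimal illustration of what this compare/select chain produces (an editorial sketch in the IR-style notation the surrounding comments already use; %a, %b, %c are hypothetical i64 operands): for smax(%a, %b, %c) the loop starts from the last operand and conceptually emits

    %c1 = icmp sgt i64 %c, %b
    %m1 = select i1 %c1, i64 %c, i64 %b
    %c2 = icmp sgt i64 %m1, %a
    %r  = select i1 %c2, i64 %m1, i64 %a

that is, one icmp/select pair for every operand after the first, with the same pattern reused by the umax/smin/umin visitors below.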
+ Type *OpTy = S->getOperand(i)->getType(); + if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { + Ty = SE.getEffectiveSCEVType(Ty); + LHS = InsertNoopCastOfTo(LHS, Ty); + } + Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *ICmp = Builder.CreateICmpSGT(LHS, RHS); + rememberInstruction(ICmp); + Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax"); + rememberInstruction(Sel); + LHS = Sel; + } + // In the case of mixed integer and pointer types, cast the + // final result back to the pointer type. + if (LHS->getType() != S->getType()) + LHS = InsertNoopCastOfTo(LHS, S->getType()); + return LHS; +} + +Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { + Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); + Type *Ty = LHS->getType(); + for (int i = S->getNumOperands()-2; i >= 0; --i) { + // In the case of mixed integer and pointer types, do the + // rest of the comparisons as integer. + Type *OpTy = S->getOperand(i)->getType(); + if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { + Ty = SE.getEffectiveSCEVType(Ty); + LHS = InsertNoopCastOfTo(LHS, Ty); + } + Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *ICmp = Builder.CreateICmpUGT(LHS, RHS); + rememberInstruction(ICmp); + Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax"); + rememberInstruction(Sel); + LHS = Sel; + } + // In the case of mixed integer and pointer types, cast the + // final result back to the pointer type. + if (LHS->getType() != S->getType()) + LHS = InsertNoopCastOfTo(LHS, S->getType()); + return LHS; +} + +Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { + Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); + Type *Ty = LHS->getType(); + for (int i = S->getNumOperands() - 2; i >= 0; --i) { + // In the case of mixed integer and pointer types, do the + // rest of the comparisons as integer. + Type *OpTy = S->getOperand(i)->getType(); + if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { + Ty = SE.getEffectiveSCEVType(Ty); + LHS = InsertNoopCastOfTo(LHS, Ty); + } + Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *ICmp = Builder.CreateICmpSLT(LHS, RHS); + rememberInstruction(ICmp); + Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin"); + rememberInstruction(Sel); + LHS = Sel; + } + // In the case of mixed integer and pointer types, cast the + // final result back to the pointer type. + if (LHS->getType() != S->getType()) + LHS = InsertNoopCastOfTo(LHS, S->getType()); + return LHS; +} + +Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { + Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); + Type *Ty = LHS->getType(); + for (int i = S->getNumOperands() - 2; i >= 0; --i) { + // In the case of mixed integer and pointer types, do the + // rest of the comparisons as integer. + Type *OpTy = S->getOperand(i)->getType(); + if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { + Ty = SE.getEffectiveSCEVType(Ty); + LHS = InsertNoopCastOfTo(LHS, Ty); + } + Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *ICmp = Builder.CreateICmpULT(LHS, RHS); + rememberInstruction(ICmp); + Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin"); + rememberInstruction(Sel); + LHS = Sel; + } + // In the case of mixed integer and pointer types, cast the + // final result back to the pointer type. 
+ if (LHS->getType() != S->getType()) + LHS = InsertNoopCastOfTo(LHS, S->getType()); + return LHS; +} + +Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, + Instruction *IP) { + setInsertPoint(IP); + return expandCodeFor(SH, Ty); +} + +Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { + // Expand the code for this SCEV. + Value *V = expand(SH); + if (Ty) { + assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) && + "non-trivial casts should be done with the SCEVs directly!"); + V = InsertNoopCastOfTo(V, Ty); + } + return V; +} + +ScalarEvolution::ValueOffsetPair +SCEVExpander::FindValueInExprValueMap(const SCEV *S, + const Instruction *InsertPt) { + SetVector<ScalarEvolution::ValueOffsetPair> *Set = SE.getSCEVValues(S); + // If the expansion is not in CanonicalMode, and the SCEV contains any + // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally. + if (CanonicalMode || !SE.containsAddRecurrence(S)) { + // If S is scConstant, it may be worse to reuse an existing Value. + if (S->getSCEVType() != scConstant && Set) { + // Choose a Value from the set which dominates the insertPt. + // insertPt should be inside the Value's parent loop so as not to break + // the LCSSA form. + for (auto const &VOPair : *Set) { + Value *V = VOPair.first; + ConstantInt *Offset = VOPair.second; + Instruction *EntInst = nullptr; + if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) && + S->getType() == V->getType() && + EntInst->getFunction() == InsertPt->getFunction() && + SE.DT.dominates(EntInst, InsertPt) && + (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || + SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) + return {V, Offset}; + } + } + } + return {nullptr, nullptr}; +} + +// The expansion of SCEV will either reuse a previous Value in ExprValueMap, +// or expand the SCEV literally. Specifically, if the expansion is in LSRMode, +// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded +// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise, +// the expansion will try to reuse Value from ExprValueMap, and only when it +// fails, expand the SCEV literally. +Value *SCEVExpander::expand(const SCEV *S) { + // Compute an insertion point for this SCEV object. Hoist the instructions + // as far out in the loop nest as possible. + Instruction *InsertPt = &*Builder.GetInsertPoint(); + + // We can move insertion point only if there is no div or rem operations + // otherwise we are risky to move it over the check for zero denominator. + auto SafeToHoist = [](const SCEV *S) { + return !SCEVExprContains(S, [](const SCEV *S) { + if (const auto *D = dyn_cast<SCEVUDivExpr>(S)) { + if (const auto *SC = dyn_cast<SCEVConstant>(D->getRHS())) + // Division by non-zero constants can be hoisted. + return SC->getValue()->isZero(); + // All other divisions should not be moved as they may be + // divisions by zero and should be kept within the + // conditions of the surrounding loops that guard their + // execution (see PR35406). + return true; + } + return false; + }); + }; + if (SafeToHoist(S)) { + for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());; + L = L->getParentLoop()) { + if (SE.isLoopInvariant(S, L)) { + if (!L) break; + if (BasicBlock *Preheader = L->getLoopPreheader()) + InsertPt = Preheader->getTerminator(); + else + // LSR sets the insertion point for AddRec start/step values to the + // block start to simplify value reuse, even though it's an invalid + // position. 
SCEVExpander must correct for this in all cases. + InsertPt = &*L->getHeader()->getFirstInsertionPt(); + } else { + // If the SCEV is computable at this level, insert it into the header + // after the PHIs (and after any other instructions that we've inserted + // there) so that it is guaranteed to dominate any user inside the loop. + if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L)) + InsertPt = &*L->getHeader()->getFirstInsertionPt(); + while (InsertPt->getIterator() != Builder.GetInsertPoint() && + (isInsertedInstruction(InsertPt) || + isa<DbgInfoIntrinsic>(InsertPt))) + InsertPt = &*std::next(InsertPt->getIterator()); + break; + } + } + } + + // IndVarSimplify sometimes sets the insertion point at the block start, even + // when there are PHIs at that point. We must correct for this. + if (isa<PHINode>(*InsertPt)) + InsertPt = &*InsertPt->getParent()->getFirstInsertionPt(); + + // Check to see if we already expanded this here. + auto I = InsertedExpressions.find(std::make_pair(S, InsertPt)); + if (I != InsertedExpressions.end()) + return I->second; + + SCEVInsertPointGuard Guard(Builder, this); + Builder.SetInsertPoint(InsertPt); + + // Expand the expression into instructions. + ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt); + Value *V = VO.first; + + if (!V) + V = visit(S); + else if (VO.second) { + if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { + Type *Ety = Vty->getPointerElementType(); + int64_t Offset = VO.second->getSExtValue(); + int64_t ESize = SE.getTypeSizeInBits(Ety); + if ((Offset * 8) % ESize == 0) { + ConstantInt *Idx = + ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); + V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); + } else { + ConstantInt *Idx = + ConstantInt::getSigned(VO.second->getType(), -Offset); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); + } + } else { + V = Builder.CreateSub(V, VO.second); + } + } + // Remember the expanded value for this SCEV at this location. + // + // This is independent of PostIncLoops. The mapped value simply materializes + // the expression at this insertion point. If the mapped value happened to be + // a postinc expansion, it could be reused by a non-postinc user, but only if + // its insertion point was already at the head of the loop. + InsertedExpressions[std::make_pair(S, InsertPt)] = V; + return V; +} + +void SCEVExpander::rememberInstruction(Value *I) { + if (!PostIncLoops.empty()) + InsertedPostIncValues.insert(I); + else + InsertedValues.insert(I); +} + +/// getOrInsertCanonicalInductionVariable - This method returns the +/// canonical induction variable of the specified type for the specified +/// loop (inserting one if there is none). A canonical induction variable +/// starts at zero and steps by one on each iteration. +PHINode * +SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, + Type *Ty) { + assert(Ty->isIntegerTy() && "Can only insert integer induction variables!"); + + // Build a SCEV for {0,+,1}<L>. + // Conservatively use FlagAnyWrap for now. + const SCEV *H = SE.getAddRecExpr(SE.getConstant(Ty, 0), + SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap); + + // Emit code for it. 
+ SCEVInsertPointGuard Guard(Builder, this); + PHINode *V = + cast<PHINode>(expandCodeFor(H, nullptr, &L->getHeader()->front())); + + return V; +} + +/// replaceCongruentIVs - Check for congruent phis in this loop header and +/// replace them with their most canonical representative. Return the number of +/// phis eliminated. +/// +/// This does not depend on any SCEVExpander state but should be used in +/// the same context that SCEVExpander is used. +unsigned +SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, + SmallVectorImpl<WeakTrackingVH> &DeadInsts, + const TargetTransformInfo *TTI) { + // Find integer phis in order of increasing width. + SmallVector<PHINode*, 8> Phis; + for (PHINode &PN : L->getHeader()->phis()) + Phis.push_back(&PN); + + if (TTI) + llvm::sort(Phis, [](Value *LHS, Value *RHS) { + // Put pointers at the back and make sure pointer < pointer = false. + if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy()) + return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy(); + return RHS->getType()->getPrimitiveSizeInBits() < + LHS->getType()->getPrimitiveSizeInBits(); + }); + + unsigned NumElim = 0; + DenseMap<const SCEV *, PHINode *> ExprToIVMap; + // Process phis from wide to narrow. Map wide phis to their truncation + // so narrow phis can reuse them. + for (PHINode *Phi : Phis) { + auto SimplifyPHINode = [&](PHINode *PN) -> Value * { + if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC})) + return V; + if (!SE.isSCEVable(PN->getType())) + return nullptr; + auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN)); + if (!Const) + return nullptr; + return Const->getValue(); + }; + + // Fold constant phis. They may be congruent to other constant phis and + // would confuse the logic below that expects proper IVs. + if (Value *V = SimplifyPHINode(Phi)) { + if (V->getType() != Phi->getType()) + continue; + Phi->replaceAllUsesWith(V); + DeadInsts.emplace_back(Phi); + ++NumElim; + DEBUG_WITH_TYPE(DebugType, dbgs() + << "INDVARS: Eliminated constant iv: " << *Phi << '\n'); + continue; + } + + if (!SE.isSCEVable(Phi->getType())) + continue; + + PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)]; + if (!OrigPhiRef) { + OrigPhiRef = Phi; + if (Phi->getType()->isIntegerTy() && TTI && + TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) { + // This phi can be freely truncated to the narrowest phi type. Map the + // truncated expression to it so it will be reused for narrow types. + const SCEV *TruncExpr = + SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType()); + ExprToIVMap[TruncExpr] = Phi; + } + continue; + } + + // Replacing a pointer phi with an integer phi or vice-versa doesn't make + // sense. + if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy()) + continue; + + if (BasicBlock *LatchBlock = L->getLoopLatch()) { + Instruction *OrigInc = dyn_cast<Instruction>( + OrigPhiRef->getIncomingValueForBlock(LatchBlock)); + Instruction *IsomorphicInc = + dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock)); + + if (OrigInc && IsomorphicInc) { + // If this phi has the same width but is more canonical, replace the + // original with it. As part of the "more canonical" determination, + // respect a prior decision to use an IV chain. 
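To make the congruence concrete (an editorial sketch with hypothetical names, not taken from the patch): two header phis that ScalarEvolution maps to the same {0,+,1}<%loop> expression, such as

    %i = phi i64 [ 0, %preheader ], [ %i.next, %latch ]
    %j = phi i64 [ 0, %preheader ], [ %j.next, %latch ]
    ...
    %i.next = add i64 %i, 1
    %j.next = add i64 %j, 1

reach this point as OrigPhiRef/OrigInc and Phi/IsomorphicInc; whichever pair is judged less canonical is rewritten in terms of the other and queued on DeadInsts.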
+ if (OrigPhiRef->getType() == Phi->getType() && + !(ChainedPhis.count(Phi) || + isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) && + (ChainedPhis.count(Phi) || + isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) { + std::swap(OrigPhiRef, Phi); + std::swap(OrigInc, IsomorphicInc); + } + // Replacing the congruent phi is sufficient because acyclic + // redundancy elimination, CSE/GVN, should handle the + // rest. However, once SCEV proves that a phi is congruent, + // it's often the head of an IV user cycle that is isomorphic + // with the original phi. It's worth eagerly cleaning up the + // common case of a single IV increment so that DeleteDeadPHIs + // can remove cycles that had postinc uses. + const SCEV *TruncExpr = + SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType()); + if (OrigInc != IsomorphicInc && + TruncExpr == SE.getSCEV(IsomorphicInc) && + SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) && + hoistIVInc(OrigInc, IsomorphicInc)) { + DEBUG_WITH_TYPE(DebugType, + dbgs() << "INDVARS: Eliminated congruent iv.inc: " + << *IsomorphicInc << '\n'); + Value *NewInc = OrigInc; + if (OrigInc->getType() != IsomorphicInc->getType()) { + Instruction *IP = nullptr; + if (PHINode *PN = dyn_cast<PHINode>(OrigInc)) + IP = &*PN->getParent()->getFirstInsertionPt(); + else + IP = OrigInc->getNextNode(); + + IRBuilder<> Builder(IP); + Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc()); + NewInc = Builder.CreateTruncOrBitCast( + OrigInc, IsomorphicInc->getType(), IVName); + } + IsomorphicInc->replaceAllUsesWith(NewInc); + DeadInsts.emplace_back(IsomorphicInc); + } + } + } + DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: " + << *Phi << '\n'); + ++NumElim; + Value *NewIV = OrigPhiRef; + if (OrigPhiRef->getType() != Phi->getType()) { + IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(Phi->getDebugLoc()); + NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName); + } + Phi->replaceAllUsesWith(NewIV); + DeadInsts.emplace_back(Phi); + } + return NumElim; +} + +Value *SCEVExpander::getExactExistingExpansion(const SCEV *S, + const Instruction *At, Loop *L) { + Optional<ScalarEvolution::ValueOffsetPair> VO = + getRelatedExistingExpansion(S, At, L); + if (VO && VO.getValue().second == nullptr) + return VO.getValue().first; + return nullptr; +} + +Optional<ScalarEvolution::ValueOffsetPair> +SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, + Loop *L) { + using namespace llvm::PatternMatch; + + SmallVector<BasicBlock *, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Look for suitable value in simple conditions at the loop exits. + for (BasicBlock *BB : ExitingBlocks) { + ICmpInst::Predicate Pred; + Instruction *LHS, *RHS; + + if (!match(BB->getTerminator(), + m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), + m_BasicBlock(), m_BasicBlock()))) + continue; + + if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At)) + return ScalarEvolution::ValueOffsetPair(LHS, nullptr); + + if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At)) + return ScalarEvolution::ValueOffsetPair(RHS, nullptr); + } + + // Use expand's logic which is used for reusing a previous Value in + // ExprValueMap. + ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At); + if (VO.first) + return VO; + + // There is potential to make this significantly smarter, but this simple + // heuristic already gets some interesting cases. 
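For instance (a sketch that assumes %n is itself an instruction, e.g. a load of the trip count): an exiting block that ends in

    %cmp = icmp ult i64 %iv.next, %n
    br i1 %cmp, label %loop, label %exit

allows a SCEV equal to %iv.next or %n, and dominating the query point, to be returned here and reused instead of being re-expanded.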
+ + // Can not find suitable value. + return None; +} + +bool SCEVExpander::isHighCostExpansionHelper( + const SCEV *S, Loop *L, const Instruction &At, int &BudgetRemaining, + const TargetTransformInfo &TTI, SmallPtrSetImpl<const SCEV *> &Processed, + SmallVectorImpl<const SCEV *> &Worklist) { + if (BudgetRemaining < 0) + return true; // Already run out of budget, give up. + + // Was the cost of expansion of this expression already accounted for? + if (!Processed.insert(S).second) + return false; // We have already accounted for this expression. + + // If we can find an existing value for this scev available at the point "At" + // then consider the expression cheap. + if (getRelatedExistingExpansion(S, &At, L)) + return false; // Consider the expression to be free. + + switch (S->getSCEVType()) { + case scUnknown: + case scConstant: + return false; // Assume to be zero-cost. + } + + TargetTransformInfo::TargetCostKind CostKind = + TargetTransformInfo::TCK_RecipThroughput; + + if (auto *CastExpr = dyn_cast<SCEVCastExpr>(S)) { + unsigned Opcode; + switch (S->getSCEVType()) { + case scTruncate: + Opcode = Instruction::Trunc; + break; + case scZeroExtend: + Opcode = Instruction::ZExt; + break; + case scSignExtend: + Opcode = Instruction::SExt; + break; + default: + llvm_unreachable("There are no other cast types."); + } + const SCEV *Op = CastExpr->getOperand(); + BudgetRemaining -= TTI.getCastInstrCost(Opcode, /*Dst=*/S->getType(), + /*Src=*/Op->getType(), CostKind); + Worklist.emplace_back(Op); + return false; // Will answer upon next entry into this function. + } + + if (auto *UDivExpr = dyn_cast<SCEVUDivExpr>(S)) { + // If the divisor is a power of two count this as a logical right-shift. + if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS())) { + if (SC->getAPInt().isPowerOf2()) { + BudgetRemaining -= + TTI.getArithmeticInstrCost(Instruction::LShr, S->getType(), + CostKind); + // Note that we don't count the cost of RHS, because it is a constant, + // and we consider those to be free. But if that changes, we would need + // to log2() it first before calling isHighCostExpansionHelper(). + Worklist.emplace_back(UDivExpr->getLHS()); + return false; // Will answer upon next entry into this function. + } + } + + // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or + // HowManyLessThans produced to compute a precise expression, rather than a + // UDiv from the user's code. If we can't find a UDiv in the code with some + // simple searching, we need to account for it's cost. + + // At the beginning of this function we already tried to find existing + // value for plain 'S'. Now try to lookup 'S + 1' since it is common + // pattern involving division. This is just a simple search heuristic. + if (getRelatedExistingExpansion( + SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L)) + return false; // Consider it to be free. + + // Need to count the cost of this UDiv. + BudgetRemaining -= + TTI.getArithmeticInstrCost(Instruction::UDiv, S->getType(), + CostKind); + Worklist.insert(Worklist.end(), {UDivExpr->getLHS(), UDivExpr->getRHS()}); + return false; // Will answer upon next entry into this function. 
+ } + + if (const auto *NAry = dyn_cast<SCEVAddRecExpr>(S)) { + Type *OpType = NAry->getType(); + + assert(NAry->getNumOperands() >= 2 && + "Polynomial should be at least linear"); + + int AddCost = + TTI.getArithmeticInstrCost(Instruction::Add, OpType, CostKind); + int MulCost = + TTI.getArithmeticInstrCost(Instruction::Mul, OpType, CostKind); + + // In this polynominal, we may have some zero operands, and we shouldn't + // really charge for those. So how many non-zero coeffients are there? + int NumTerms = llvm::count_if(NAry->operands(), + [](const SCEV *S) { return !S->isZero(); }); + assert(NumTerms >= 1 && "Polynominal should have at least one term."); + assert(!(*std::prev(NAry->operands().end()))->isZero() && + "Last operand should not be zero"); + + // Much like with normal add expr, the polynominal will require + // one less addition than the number of it's terms. + BudgetRemaining -= AddCost * (NumTerms - 1); + if (BudgetRemaining < 0) + return true; + + // Ignoring constant term (operand 0), how many of the coeffients are u> 1? + int NumNonZeroDegreeNonOneTerms = + llvm::count_if(make_range(std::next(NAry->op_begin()), NAry->op_end()), + [](const SCEV *S) { + auto *SConst = dyn_cast<SCEVConstant>(S); + return !SConst || SConst->getAPInt().ugt(1); + }); + // Here, *each* one of those will require a multiplication. + BudgetRemaining -= MulCost * NumNonZeroDegreeNonOneTerms; + if (BudgetRemaining < 0) + return true; + + // What is the degree of this polynominal? + int PolyDegree = NAry->getNumOperands() - 1; + assert(PolyDegree >= 1 && "Should be at least affine."); + + // The final term will be: + // Op_{PolyDegree} * x ^ {PolyDegree} + // Where x ^ {PolyDegree} will again require PolyDegree-1 mul operations. + // Note that x ^ {PolyDegree} = x * x ^ {PolyDegree-1} so charging for + // x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free. + // FIXME: this is conservatively correct, but might be overly pessimistic. + BudgetRemaining -= MulCost * (PolyDegree - 1); + if (BudgetRemaining < 0) + return true; + + // And finally, the operands themselves should fit within the budget. + Worklist.insert(Worklist.end(), NAry->operands().begin(), + NAry->operands().end()); + return false; // So far so good, though ops may be too costly? + } + + if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(S)) { + Type *OpType = NAry->getType(); + + int PairCost; + switch (S->getSCEVType()) { + case scAddExpr: + PairCost = + TTI.getArithmeticInstrCost(Instruction::Add, OpType, CostKind); + break; + case scMulExpr: + // TODO: this is a very pessimistic cost modelling for Mul, + // because of Bin Pow algorithm actually used by the expander, + // see SCEVExpander::visitMulExpr(), ExpandOpBinPowN(). + PairCost = + TTI.getArithmeticInstrCost(Instruction::Mul, OpType, CostKind); + break; + case scSMaxExpr: + case scUMaxExpr: + case scSMinExpr: + case scUMinExpr: + PairCost = TTI.getCmpSelInstrCost(Instruction::ICmp, OpType, + CmpInst::makeCmpResultType(OpType), + CostKind) + + TTI.getCmpSelInstrCost(Instruction::Select, OpType, + CmpInst::makeCmpResultType(OpType), + CostKind); + break; + default: + llvm_unreachable("There are no other variants here."); + } + + assert(NAry->getNumOperands() > 1 && + "Nary expr should have more than 1 operand."); + // The simple nary expr will require one less op (or pair of ops) + // than the number of it's terms. 
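A small worked example (editorial): an add expression over four operands, a + b + c + d, lowers to three Add instructions, and a four-operand umax to three icmp/select pairs, so the charge applied below is

    (NumOperands - 1) * PairCost = 3 * PairCost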
+ BudgetRemaining -= PairCost * (NAry->getNumOperands() - 1); + if (BudgetRemaining < 0) + return true; + + // And finally, the operands themselves should fit within the budget. + Worklist.insert(Worklist.end(), NAry->operands().begin(), + NAry->operands().end()); + return false; // So far so good, though ops may be too costly? + } + + llvm_unreachable("No other scev expressions possible."); +} + +Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred, + Instruction *IP) { + assert(IP); + switch (Pred->getKind()) { + case SCEVPredicate::P_Union: + return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP); + case SCEVPredicate::P_Equal: + return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP); + case SCEVPredicate::P_Wrap: { + auto *AddRecPred = cast<SCEVWrapPredicate>(Pred); + return expandWrapPredicate(AddRecPred, IP); + } + } + llvm_unreachable("Unknown SCEV predicate type"); +} + +Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred, + Instruction *IP) { + Value *Expr0 = expandCodeFor(Pred->getLHS(), Pred->getLHS()->getType(), IP); + Value *Expr1 = expandCodeFor(Pred->getRHS(), Pred->getRHS()->getType(), IP); + + Builder.SetInsertPoint(IP); + auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check"); + return I; +} + +Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, + Instruction *Loc, bool Signed) { + assert(AR->isAffine() && "Cannot generate RT check for " + "non-affine expression"); + + SCEVUnionPredicate Pred; + const SCEV *ExitCount = + SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred); + + assert(ExitCount != SE.getCouldNotCompute() && "Invalid loop count"); + + const SCEV *Step = AR->getStepRecurrence(SE); + const SCEV *Start = AR->getStart(); + + Type *ARTy = AR->getType(); + unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType()); + unsigned DstBits = SE.getTypeSizeInBits(ARTy); + + // The expression {Start,+,Step} has nusw/nssw if + // Step < 0, Start - |Step| * Backedge <= Start + // Step >= 0, Start + |Step| * Backedge > Start + // and |Step| * Backedge doesn't unsigned overflow. + + IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits); + Builder.SetInsertPoint(Loc); + Value *TripCountVal = expandCodeFor(ExitCount, CountTy, Loc); + + IntegerType *Ty = + IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy)); + Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty; + + Value *StepValue = expandCodeFor(Step, Ty, Loc); + Value *NegStepValue = expandCodeFor(SE.getNegativeSCEV(Step), Ty, Loc); + Value *StartValue = expandCodeFor(Start, ARExpandTy, Loc); + + ConstantInt *Zero = + ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits)); + + Builder.SetInsertPoint(Loc); + // Compute |Step| + Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero); + Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue); + + // Get the backedge taken count and truncate or extended to the AR type. 
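To sketch the code emitted below (an editorial example assuming an i32 AddRec with |Step| = 4, start %s and backedge-taken count %btc):

    %mul          = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 %btc)
    %mul.result   = extractvalue { i32, i1 } %mul, 0
    %mul.overflow = extractvalue { i32, i1 } %mul, 1
    %add          = add i32 %s, %mul.result   ; tested (s|u)lt against %s
    %sub          = sub i32 %s, %mul.result   ; tested (s|u)gt against %s

The compare that matches the sign of the step is selected and then OR'd with %mul.overflow (and, if the count type is wider than the AR type, with a truncation check) to form the final overflow predicate.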
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty); + auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), + Intrinsic::umul_with_overflow, Ty); + + // Compute |Step| * Backedge + CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); + Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); + + // Compute: + // Start + |Step| * Backedge < Start + // Start - |Step| * Backedge > Start + Value *Add = nullptr, *Sub = nullptr; + if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) { + const SCEV *MulS = SE.getSCEV(MulV); + const SCEV *NegMulS = SE.getNegativeSCEV(MulS); + Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue), + ARPtrTy); + Sub = Builder.CreateBitCast( + expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy); + } else { + Add = Builder.CreateAdd(StartValue, MulV); + Sub = Builder.CreateSub(StartValue, MulV); + } + + Value *EndCompareGT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue); + + Value *EndCompareLT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue); + + // Select the answer based on the sign of Step. + Value *EndCheck = + Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT); + + // If the backedge taken count type is larger than the AR type, + // check that we don't drop any bits by truncating it. If we are + // dropping bits, then we have overflow (unless the step is zero). + if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) { + auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits); + auto *BackedgeCheck = + Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal, + ConstantInt::get(Loc->getContext(), MaxVal)); + BackedgeCheck = Builder.CreateAnd( + BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero)); + + EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck); + } + + EndCheck = Builder.CreateOr(EndCheck, OfMul); + return EndCheck; +} + +Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred, + Instruction *IP) { + const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr()); + Value *NSSWCheck = nullptr, *NUSWCheck = nullptr; + + // Add a check for NUSW + if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW) + NUSWCheck = generateOverflowCheck(A, IP, false); + + // Add a check for NSSW + if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW) + NSSWCheck = generateOverflowCheck(A, IP, true); + + if (NUSWCheck && NSSWCheck) + return Builder.CreateOr(NUSWCheck, NSSWCheck); + + if (NUSWCheck) + return NUSWCheck; + + if (NSSWCheck) + return NSSWCheck; + + return ConstantInt::getFalse(IP->getContext()); +} + +Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union, + Instruction *IP) { + auto *BoolType = IntegerType::get(IP->getContext(), 1); + Value *Check = ConstantInt::getNullValue(BoolType); + + // Loop over all checks in this set. + for (auto Pred : Union->getPredicates()) { + auto *NextCheck = expandCodeForPredicate(Pred, IP); + Builder.SetInsertPoint(IP); + Check = Builder.CreateOr(Check, NextCheck); + } + + return Check; +} + +namespace { +// Search for a SCEV subexpression that is not safe to expand. Any expression +// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely +// UDiv expressions. 
We don't know if the UDiv is derived from an IR divide +// instruction, but the important thing is that we prove the denominator is +// nonzero before expansion. +// +// IVUsers already checks that IV-derived expressions are safe. So this check is +// only needed when the expression includes some subexpression that is not IV +// derived. +// +// Currently, we only allow division by a nonzero constant here. If this is +// inadequate, we could easily allow division by SCEVUnknown by using +// ValueTracking to check isKnownNonZero(). +// +// We cannot generally expand recurrences unless the step dominates the loop +// header. The expander handles the special case of affine recurrences by +// scaling the recurrence outside the loop, but this technique isn't generally +// applicable. Expanding a nested recurrence outside a loop requires computing +// binomial coefficients. This could be done, but the recurrence has to be in a +// perfectly reduced form, which can't be guaranteed. +struct SCEVFindUnsafe { + ScalarEvolution &SE; + bool IsUnsafe; + + SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {} + + bool follow(const SCEV *S) { + if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { + const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS()); + if (!SC || SC->getValue()->isZero()) { + IsUnsafe = true; + return false; + } + } + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + const SCEV *Step = AR->getStepRecurrence(SE); + if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) { + IsUnsafe = true; + return false; + } + } + return true; + } + bool isDone() const { return IsUnsafe; } +}; +} + +namespace llvm { +bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) { + SCEVFindUnsafe Search(SE); + visitAll(S, Search); + return !Search.IsUnsafe; +} + +bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint, + ScalarEvolution &SE) { + if (!isSafeToExpand(S, SE)) + return false; + // We have to prove that the expanded site of S dominates InsertionPoint. + // This is easy when not in the same block, but hard when S is an instruction + // to be expanded somewhere inside the same block as our insertion point. + // What we really need here is something analogous to an OrderedBasicBlock, + // but for the moment, we paper over the problem by handling two common and + // cheap to check cases. 
+ if (SE.properlyDominates(S, InsertionPoint->getParent())) + return true; + if (SE.dominates(S, InsertionPoint->getParent())) { + if (InsertionPoint->getParent()->getTerminator() == InsertionPoint) + return true; + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) + for (const Value *V : InsertionPoint->operand_values()) + if (V == U->getValue()) + return true; + } + return false; +} +} diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index d93ca4f04cdbf..b450d71c996cb 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -33,7 +33,6 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" @@ -134,6 +133,11 @@ static cl::opt<unsigned> MaxSpeculationDepth( cl::desc("Limit maximum recursion depth when calculating costs of " "speculatively executed instructions")); +static cl::opt<int> +MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10), + cl::desc("Max size of a block which is still considered " + "small enough to thread through")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -192,20 +196,34 @@ class SimplifyCFGOpt { bool FoldValueComparisonIntoPredecessors(Instruction *TI, IRBuilder<> &Builder); - bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); - bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); - bool SimplifySingleResume(ResumeInst *RI); - bool SimplifyCommonResume(ResumeInst *RI); - bool SimplifyCleanupReturn(CleanupReturnInst *RI); - bool SimplifyUnreachable(UnreachableInst *UI); - bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); - bool SimplifyIndirectBr(IndirectBrInst *IBI); - bool SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); - bool SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); + bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder); + bool simplifySingleResume(ResumeInst *RI); + bool simplifyCommonResume(ResumeInst *RI); + bool simplifyCleanupReturn(CleanupReturnInst *RI); + bool simplifyUnreachable(UnreachableInst *UI); + bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); + bool simplifyIndirectBr(IndirectBrInst *IBI); + bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); + bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); + bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI); + bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI); + bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, + BasicBlock *TrueBB, BasicBlock *FalseBB, + uint32_t TrueWeight, uint32_t FalseWeight); + bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, + const DataLayout &DL); + bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select); + bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI); + bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder); + public: SimplifyCFGOpt(const 
TargetTransformInfo &TTI, const DataLayout &DL, SmallPtrSetImpl<BasicBlock *> *LoopHeaders, @@ -317,7 +335,7 @@ static unsigned ComputeSpeculationCost(const User *I, const TargetTransformInfo &TTI) { assert(isSafeToSpeculativelyExecute(I) && "Instruction is not safe to speculatively execute!"); - return TTI.getUserCost(I); + return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency); } /// If we have a merge point of an "if condition" as accepted above, @@ -1235,8 +1253,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I); /// Given a conditional branch that goes to BB1 and BB2, hoist any common code /// in the two blocks up into the branch block. The caller of this function /// guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, + const TargetTransformInfo &TTI) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1287,6 +1305,14 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2)) return Changed; + // If any of the two call sites has nomerge attribute, stop hoisting. + if (const auto *CB1 = dyn_cast<CallBase>(I1)) + if (CB1->cannotMerge()) + return Changed; + if (const auto *CB2 = dyn_cast<CallBase>(I2)) + if (CB2->cannotMerge()) + return Changed; + if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) { assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2)); // The debug location is an integral part of a debug info intrinsic @@ -1444,6 +1470,13 @@ static bool isLifeTimeMarker(const Instruction *I) { return false; } +// TODO: Refine this. This should avoid cases like turning constant memcpy sizes +// into variables. +static bool replacingOperandWithVariableIsCheap(const Instruction *I, + int OpIdx) { + return !isa<IntrinsicInst>(I); +} + // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -1465,8 +1498,9 @@ static bool canSinkInstructions( // Conservatively return false if I is an inline-asm instruction. Sinking // and merging inline-asm instructions can potentially create arguments // that cannot satisfy the inline-asm constraints. + // If the instruction has nomerge attribute, return false. if (const auto *C = dyn_cast<CallBase>(I)) - if (C->isInlineAsm()) + if (C->isInlineAsm() || C->cannotMerge()) return false; // Each instruction must have zero or one use. @@ -1521,7 +1555,8 @@ static bool canSinkInstructions( return false; for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) { - if (I0->getOperand(OI)->getType()->isTokenTy()) + Value *Op = I0->getOperand(OI); + if (Op->getType()->isTokenTy()) // Don't touch any operand of token type. return false; @@ -1530,7 +1565,8 @@ static bool canSinkInstructions( return I->getOperand(OI) == I0->getOperand(OI); }; if (!all_of(Insts, SameAsI0)) { - if (!canReplaceOperandWithVariable(I0, OI)) + if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) || + !canReplaceOperandWithVariable(I0, OI)) // We can't create a PHI from this GEP. return false; // Don't create indirect calls! 
The called value is the final operand. @@ -1960,8 +1996,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, /// \endcode /// /// \returns true if the conditional block is removed. -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa<FCmpInst>(BrCond)) @@ -2110,9 +2146,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, } // Metadata can be dependent on the condition we are hoisting above. - // Conservatively strip all metadata on the instruction. - for (auto &I : *ThenBB) + // Conservatively strip all metadata on the instruction. Drop the debug loc + // to avoid making it appear as if the condition is a constant, which would + // be misleading while debugging. + for (auto &I : *ThenBB) { + if (!SpeculatedStoreValue || &I != SpeculatedStore) + I.setDebugLoc(DebugLoc()); I.dropUnknownNonDebugMetadata(); + } // Hoist the instructions. BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), @@ -2131,13 +2172,12 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, continue; // Create a select whose true value is the speculatively executed value and - // false value is the preexisting value. Swap them if the branch + // false value is the pre-existing value. Swap them if the branch // destinations were inverted. Value *TrueV = ThenV, *FalseV = OrigV; if (Invert) std::swap(TrueV, FalseV); - Value *V = Builder.CreateSelect( - BrCond, TrueV, FalseV, "spec.select", BI); + Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI); PN.setIncomingValue(OrigI, V); PN.setIncomingValue(ThenI, V); } @@ -2154,12 +2194,15 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, /// Return true if we can thread a branch across this block. static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { - unsigned Size = 0; + int Size = 0; for (Instruction &I : BB->instructionsWithoutDebug()) { - if (Size > 10) + if (Size > MaxSmallBlockSize) return false; // Don't clone large BB's. - ++Size; + // We will delete Phis while threading, so Phis should not be accounted in + // block's size + if (!isa<PHINode>(I)) + ++Size; // We can only support instructions that do not define values that are // live outside of the current basic block. @@ -2306,9 +2349,6 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // dependence information for this check, but simplifycfg can't keep it up // to date, and this catches most of the cases we care about anyway. BasicBlock *BB = PN->getParent(); - const Function *Fn = BB->getParent(); - if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) - return false; BasicBlock *IfTrue, *IfFalse; Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse); @@ -2454,8 +2494,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, /// If we found a conditional branch that goes to two returning blocks, /// try to merge them together into one return, /// introducing a select if the return values disagree. 
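As an illustration of the transform described above (editorial sketch): a diamond such as

      br i1 %c, label %t, label %f
    t:
      ret i32 %a
    f:
      ret i32 %b

is folded into its predecessor as

      %retval = select i1 %c, i32 %a, i32 %b
      ret i32 %retval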
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, - IRBuilder<> &Builder) { +bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI, + IRBuilder<> &Builder) { assert(BI->isConditional() && "Must be a conditional branch"); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); @@ -2531,8 +2571,8 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, (void)RI; LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:" - << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: " - << *TrueSucc << "FALSEBLOCK: " << *FalseSucc); + << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: " + << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc); EraseTerminatorAndDCECond(BI); @@ -2588,6 +2628,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, const unsigned PredCount = pred_size(BB); + bool Changed = false; + Instruction *Cond = nullptr; if (BI->isConditional()) Cond = dyn_cast<Instruction>(BI->getCondition()); @@ -2611,17 +2653,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, } // Quit if we can't remove this instruction. if (!tryCSEWithPredecessor(Curr, PB)) - return false; + return Changed; + Changed = true; } } if (!Cond) - return false; + return Changed; } if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) - return false; + return Changed; // Make sure the instruction after the condition is the cond branch. BasicBlock::iterator CondIt = ++Cond->getIterator(); @@ -2631,7 +2674,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, ++CondIt; if (&*CondIt != BI) - return false; + return Changed; // Only allow this transformation if computing the condition doesn't involve // too many instructions and these involved instructions can be executed @@ -2645,11 +2688,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, if (isa<DbgInfoIntrinsic>(I)) continue; if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) - return false; + return Changed; // I has only one use and can be executed unconditionally. Instruction *User = dyn_cast<Instruction>(I->user_back()); if (User == nullptr || User->getParent() != BB) - return false; + return Changed; // I is used in the same BB. Since BI uses Cond and doesn't have more slots // to use any other instruction, User must be an instruction between next(I) // and Cond. @@ -2659,23 +2702,23 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, NumBonusInsts += PredCount; // Early exits once we reach the limit. if (NumBonusInsts > BonusInstThreshold) - return false; + return Changed; } // Cond is known to be a compare or binary operator. Check to make sure that // neither operand is a potentially-trapping constant expression. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0))) if (CE->canTrap()) - return false; + return Changed; if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1))) if (CE->canTrap()) - return false; + return Changed; // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = (BI->isConditional()) ? 
BI->getSuccessor(1) : nullptr; if (TrueDest == BB || FalseDest == BB) - return false; + return Changed; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *PredBlock = *PI; @@ -2715,6 +2758,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, } LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); + Changed = true; + IRBuilder<> Builder(PBI); // If we need to invert the condition in the pred block to match, do so now. @@ -2744,6 +2789,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, if (isa<DbgInfoIntrinsic>(BonusInst)) continue; Instruction *NewBonusInst = BonusInst->clone(); + + // When we fold the bonus instructions we want to make sure we + // reset their debug locations in order to avoid stepping on dead + // code caused by folding dead branches. + NewBonusInst->setDebugLoc(DebugLoc()); + RemapInstruction(NewBonusInst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); VMap[&*BonusInst] = NewBonusInst; @@ -2763,6 +2814,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, // Clone Cond into the predecessor basic block, and or/and the // two conditions together. Instruction *CondInPred = Cond->clone(); + + // Reset the condition debug location to avoid jumping on dead code + // as the result of folding dead branches. + CondInPred->setDebugLoc(DebugLoc()); + RemapInstruction(CondInPred, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); PredBlock->getInstList().insert(PBI->getIterator(), CondInPred); @@ -2877,13 +2933,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, // could replace PBI's branch probabilities with BI's. // Copy any debug value intrinsics into the end of PredBlock. - for (Instruction &I : *BB) - if (isa<DbgInfoIntrinsic>(I)) - I.clone()->insertBefore(PBI); + for (Instruction &I : *BB) { + if (isa<DbgInfoIntrinsic>(I)) { + Instruction *NewI = I.clone(); + RemapInstruction(NewI, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + NewI->insertBefore(PBI); + } + } - return true; + return Changed; } - return false; + return Changed; } // If there is only one store in BB1 and BB2, return it, otherwise return @@ -3024,7 +3085,7 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, return false; // Not in white-list - not worthwhile folding. // And finally, if this is a non-free instruction that we are okay // speculating, ensure that we consider the speculation budget. - BudgetRemaining -= TTI.getUserCost(&I); + BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency); if (BudgetRemaining < 0) return false; // Eagerly refuse to fold as soon as we're out of budget. } @@ -3086,29 +3147,11 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, PStore->getAAMetadata(AAMD, /*Merge=*/false); PStore->getAAMetadata(AAMD, /*Merge=*/true); SI->setAAMetadata(AAMD); - unsigned PAlignment = PStore->getAlignment(); - unsigned QAlignment = QStore->getAlignment(); - unsigned TypeAlignment = - DL.getABITypeAlignment(SI->getValueOperand()->getType()); - unsigned MinAlignment; - unsigned MaxAlignment; - std::tie(MinAlignment, MaxAlignment) = std::minmax(PAlignment, QAlignment); // Choose the minimum alignment. If we could prove both stores execute, we // could use biggest one. In this case, though, we only know that one of the // stores executes. And we don't know it's safe to take the alignment from a // store that doesn't execute. 
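For example (editorial note): if PStore carries align 8 and QStore align 4, the merged store produced by the new code below is given align 4, the smaller of the two.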
- if (MinAlignment != 0) { - // Choose the minimum of all non-zero alignments. - SI->setAlignment(Align(MinAlignment)); - } else if (MaxAlignment != 0) { - // Choose the minimal alignment between the non-zero alignment and the ABI - // default alignment for the type of the stored value. - SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); - } else { - // If both alignments are zero, use ABI default alignment for the type of - // the stored value. - SI->setAlignment(Align(TypeAlignment)); - } + SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign())); QStore->eraseFromParent(); PStore->eraseFromParent(); @@ -3514,10 +3557,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Takes care of updating the successors and removing the old terminator. // Also makes sure not to introduce new successors by assuming that edges to // non-successor TrueBBs and FalseBBs aren't reachable. -static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, - BasicBlock *TrueBB, BasicBlock *FalseBB, - uint32_t TrueWeight, - uint32_t FalseWeight) { +bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, + Value *Cond, BasicBlock *TrueBB, + BasicBlock *FalseBB, + uint32_t TrueWeight, + uint32_t FalseWeight) { // Remove any superfluous successor edges from the CFG. // First, figure out which successors to preserve. // If TrueBB and FalseBB are equal, only try to preserve one copy of that @@ -3577,7 +3621,8 @@ static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, // (switch (select cond, X, Y)) on constant X, Y // with a branch - conditional if X and Y lead to distinct BBs, // unconditional otherwise. -static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { +bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI, + SelectInst *Select) { // Check for constant integer values in the select. ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue()); ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue()); @@ -3613,7 +3658,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { // blockaddress(@fn, BlockB))) // with // (br cond, BlockA, BlockB). -static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { +bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, + SelectInst *SI) { // Check that both operands of the select are block addresses. BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue()); BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue()); @@ -3748,8 +3794,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( /// The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. 
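For instance (editorial sketch): a branch on (x == 1 || x == 4 || x == 9), built up from or'd icmp eq instructions feeding a single conditional branch, can be refolded here into

    switch i32 %x, label %else [
      i32 1, label %then
      i32 4, label %then
      i32 9, label %then
    ]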
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, - const DataLayout &DL) { +bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI, + IRBuilder<> &Builder, + const DataLayout &DL) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); if (!Cond) return false; @@ -3863,19 +3910,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, return true; } -bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { if (isa<PHINode>(RI->getValue())) - return SimplifyCommonResume(RI); + return simplifyCommonResume(RI); else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) && RI->getValue() == RI->getParent()->getFirstNonPHI()) // The resume must unwind the exception that caused control to branch here. - return SimplifySingleResume(RI); + return simplifySingleResume(RI); return false; } // Simplify resume that is shared by several landing pads (phi of landing pad). -bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { +bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); // Check that there are no other instructions except for debug intrinsics @@ -3953,18 +4000,38 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { return !TrivialUnwindBlocks.empty(); } +// Check if cleanup block is empty +static bool isCleanupBlockEmpty(Instruction *Inst, Instruction *RI) { + BasicBlock::iterator I = Inst->getIterator(), E = RI->getIterator(); + while (++I != E) { + auto *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + + Intrinsic::ID IntrinsicID = II->getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::dbg_label: + case Intrinsic::lifetime_end: + break; + default: + return false; + } + } + return true; +} + // Simplify resume that is only used by a single (non-phi) landing pad. -bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { +bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI()); assert(RI->getValue() == LPInst && "Resume must unwind the exception that caused control to here"); // Check that there are no other instructions except for debug intrinsics. - BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator(); - while (++I != E) - if (!isa<DbgInfoIntrinsic>(I)) - return false; + if (!isCleanupBlockEmpty(LPInst, RI)) + return false; // Turn all invokes that unwind here into calls and delete the basic block. for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { @@ -4000,23 +4067,8 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) { return false; // Check that there are no other instructions except for benign intrinsics. - BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); - while (++I != E) { - auto *II = dyn_cast<IntrinsicInst>(I); - if (!II) - return false; - - Intrinsic::ID IntrinsicID = II->getIntrinsicID(); - switch (IntrinsicID) { - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::dbg_label: - case Intrinsic::lifetime_end: - break; - default: - return false; - } - } + if (!isCleanupBlockEmpty(CPInst, RI)) + return false; // If the cleanup return we are simplifying unwinds to the caller, this will // set UnwindDest to nullptr. 
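Concretely (an editorial example): with the isCleanupBlockEmpty helper introduced above, a landing-pad block whose only instructions between the pad and the resume are

    call void @llvm.dbg.value(metadata ..., metadata ..., metadata ...)
    call void @llvm.lifetime.end.p0i8(i64 8, i8* %p)

still counts as empty, so simplifySingleResume can now remove it; before this change that path tolerated only debug intrinsics.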
@@ -4083,9 +4135,10 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) { // The iterator must be incremented here because the instructions are // being moved to another block. PHINode *PN = cast<PHINode>(I++); - if (PN->use_empty()) - // If the PHI node has no uses, just leave it. It will be erased - // when we erase BB below. + if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB)) + // If the PHI node has no uses or all of its uses are in this basic + // block (meaning they are debug or lifetime intrinsics), just leave + // it. It will be erased when we erase BB below. continue; // Otherwise, sink this PHI node into UnwindDest. @@ -4148,7 +4201,7 @@ static bool mergeCleanupPad(CleanupReturnInst *RI) { return true; } -bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { +bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) { // It is possible to transiantly have an undef cleanuppad operand because we // have deleted some, but not all, dead blocks. // Eventually, this block will be deleted. @@ -4164,7 +4217,7 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { return false; } -bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; @@ -4218,7 +4271,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { return false; } -bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { +bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) { BasicBlock *BB = UI->getParent(); bool Changed = false; @@ -4393,7 +4446,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch) { /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. -static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, + IRBuilder<> &Builder) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); bool HasDefault = @@ -5689,7 +5743,7 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } -bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { BasicBlock *BB = SI->getParent(); if (isValueEqualityComparison(SI)) { @@ -5740,7 +5794,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { return false; } -bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { +bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { BasicBlock *BB = IBI->getParent(); bool Changed = false; @@ -5855,7 +5909,12 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, return false; } -bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, +bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) { + return Branch->isUnconditional() ? 
simplifyUncondBranch(Branch, Builder) + : simplifyCondBranch(Branch, Builder); +} + +bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); BasicBlock *Succ = BI->getSuccessor(0); @@ -5916,10 +5975,9 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { return PredPred; } -bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); - const Function *Fn = BB->getParent(); - if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) + if (!Options.SimplifyCondBranch) return false; // Conditional branch @@ -6064,9 +6122,9 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { SI->getPointerOperand() == I; // A call to null is undefined. - if (auto CS = CallSite(Use)) - return !NullPointerIsDefined(CS->getFunction()) && - CS.getCalledValue() == I; + if (auto *CB = dyn_cast<CallBase>(Use)) + return !NullPointerIsDefined(CB->getFunction()) && + CB->getCalledOperand() == I; } return false; } @@ -6133,39 +6191,38 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { IRBuilder<> Builder(BB); - // If there is a trivial two-entry PHI node in this basic block, and we can - // eliminate it, do so now. - if (auto *PN = dyn_cast<PHINode>(BB->begin())) - if (PN->getNumIncomingValues() == 2) - Changed |= FoldTwoEntryPHINode(PN, TTI, DL); - - Builder.SetInsertPoint(BB->getTerminator()); - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - if (BI->isUnconditional()) { - if (SimplifyUncondBranch(BI, Builder)) - return true; - } else { - if (SimplifyCondBranch(BI, Builder)) - return true; - } - } else if (auto *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { - if (SimplifyReturn(RI, Builder)) - return true; - } else if (auto *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { - if (SimplifyResume(RI, Builder)) - return true; - } else if (auto *RI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) { - if (SimplifyCleanupReturn(RI)) - return true; - } else if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - if (SimplifySwitch(SI, Builder)) - return true; - } else if (auto *UI = dyn_cast<UnreachableInst>(BB->getTerminator())) { - if (SimplifyUnreachable(UI)) - return true; - } else if (auto *IBI = dyn_cast<IndirectBrInst>(BB->getTerminator())) { - if (SimplifyIndirectBr(IBI)) - return true; + if (Options.FoldTwoEntryPHINode) { + // If there is a trivial two-entry PHI node in this basic block, and we can + // eliminate it, do so now. 
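The FoldTwoEntryPHINode call in the hunk that continues below collapses a diamond whose only effect is choosing between two incoming values. In source terms that is the difference between an if/else assignment and a conditional expression; a minimal sketch with invented names:

    #include <cassert>

    static int clampBranchy(int X, int Hi) {
      int R;
      if (X > Hi)
        R = Hi;   // incoming value from the "then" block
      else
        R = X;    // incoming value from the "else" block
      return R;   // the two-entry PHI: R depends only on which block ran
    }

    static int clampSelect(int X, int Hi) {
      return X > Hi ? Hi : X;   // the select the fold produces instead
    }

    int main() {
      for (int X = -3; X <= 3; ++X)
        assert(clampBranchy(X, 1) == clampSelect(X, 1));
    }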
+ if (auto *PN = dyn_cast<PHINode>(BB->begin())) + if (PN->getNumIncomingValues() == 2) + Changed |= FoldTwoEntryPHINode(PN, TTI, DL); + } + + Instruction *Terminator = BB->getTerminator(); + Builder.SetInsertPoint(Terminator); + switch (Terminator->getOpcode()) { + case Instruction::Br: + Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder); + break; + case Instruction::Ret: + Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder); + break; + case Instruction::Resume: + Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder); + break; + case Instruction::CleanupRet: + Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator)); + break; + case Instruction::Switch: + Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder); + break; + case Instruction::Unreachable: + Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator)); + break; + case Instruction::IndirectBr: + Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator)); + break; } return Changed; diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index cbb114f9a47aa..d3d0c33419085 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -27,6 +26,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; @@ -54,6 +54,7 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; + const TargetTransformInfo *TTI; SCEVExpander &Rewriter; SmallVectorImpl<WeakTrackingVH> &DeadInsts; @@ -61,10 +62,11 @@ namespace { public: SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT, - LoopInfo *LI, SCEVExpander &Rewriter, + LoopInfo *LI, const TargetTransformInfo *TTI, + SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &Dead) - : L(Loop), LI(LI), SE(SE), DT(DT), Rewriter(Rewriter), DeadInsts(Dead), - Changed(false) { + : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter), + DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -655,7 +657,7 @@ static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) { return Hint; } -/// Replace the UseInst with a constant if possible. +/// Replace the UseInst with a loop invariant expression if it is safe. bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) { if (!SE->isSCEVable(I->getType())) return false; @@ -667,10 +669,17 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) { return false; // Do not generate something ridiculous even if S is loop invariant. 
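The SimplifyIndVar change that continues below only replaces an IV user with a loop-invariant expansion when isSafeToExpandAt agrees, because a loop-invariant expression is not automatically safe to evaluate unconditionally at the insertion point. A source-level analogy (not the SCEV machinery itself), with invented names:

    // Safe to hoist: A * B + C never changes and never traps.
    //   for (...) use(A * B + C);   ==>   int Inv = A * B + C; for (...) use(Inv);

    // Not safe to hoist blindly: A / D is loop invariant, but the loop only
    // evaluates it when D != 0, so speculating it before the loop may trap.
    long sumValues(const int *V, int N, int A, int D) {
      long Sum = 0;
      for (int I = 0; I < N; ++I) {
        if (D != 0)
          Sum += A / D;   // invariant, but guarded
        else
          Sum += V[I];
      }
      return Sum;
    }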
- if (Rewriter.isHighCostExpansion(S, L, I)) + if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I)) return false; auto *IP = GetLoopInvariantInsertPosition(L, I); + + if (!isSafeToExpandAt(S, IP, *SE)) { + LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I + << " with non-speculable loop invariant: " << *S << '\n'); + return false; + } + auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP); I->replaceAllUsesWith(Invariant); @@ -931,10 +940,11 @@ void IVVisitor::anchor() { } /// Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, - LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead, + LoopInfo *LI, const TargetTransformInfo *TTI, + SmallVectorImpl<WeakTrackingVH> &Dead, SCEVExpander &Rewriter, IVVisitor *V) { - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Rewriter, - Dead); + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI, + Rewriter, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } @@ -942,14 +952,16 @@ bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, /// Simplify users of induction variables within this /// loop. This does not actually change or add IVs. bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead) { + LoopInfo *LI, const TargetTransformInfo *TTI, + SmallVectorImpl<WeakTrackingVH> &Dead) { SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif bool Changed = false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead, Rewriter); + Changed |= + simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter); } return Changed; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index fa3a9d21f3dfb..cfcc3454a2102 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -138,28 +138,6 @@ static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { return ConstantInt::get(CI->getType(), Result); } -static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B, - const TargetLibraryInfo *TLI) { - CallInst *FOpen = dyn_cast<CallInst>(File); - if (!FOpen) - return false; - - Function *InnerCallee = FOpen->getCalledFunction(); - if (!InnerCallee) - return false; - - LibFunc Func; - if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) || - Func != LibFunc_fopen) - return false; - - inferLibFuncAttributes(*CI->getCalledFunction(), *TLI); - if (PointerMayBeCaptured(File, true, true)) - return false; - - return true; -} - static bool isOnlyUsedInComparisonWithZero(Value *V) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -177,8 +155,7 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, if (!isOnlyUsedInComparisonWithZero(CI)) return false; - if (!isDereferenceableAndAlignedPointer(Str, Align::None(), APInt(64, Len), - DL)) + if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL)) return false; if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory)) @@ -252,7 +229,7 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A // String 
and Memory Library Call Optimizations //===----------------------------------------------------------------------===// -Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) { // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -274,7 +251,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, - IRBuilder<> &B) { + IRBuilderBase &B) { // We need to find the end of the destination string. That's where the // memory is to be moved to. We just generate a call to strlen. Value *DstLen = emitStrLen(Dst, B, DL, TLI); @@ -289,12 +266,12 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy( - CpyDst, Align::None(), Src, Align::None(), + CpyDst, Align(1), Src, Align(1), ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); return Dst; } -Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -337,7 +314,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { return emitStrLenMemCpy(Src, Dst, SrcLen, B); } -Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); @@ -382,7 +359,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr"); } -Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); annotateNonNullBasedOnAccess(CI, 0); @@ -410,7 +387,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr"); } -Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -465,7 +442,7 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { Value *Str1P = CI->getArgOperand(0); Value *Str2P = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -533,7 +510,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) { Value *Src = CI->getArgOperand(0); ConstantInt *Size = 
dyn_cast<ConstantInt>(CI->getArgOperand(1)); uint64_t SrcLen = GetStringLength(Src); @@ -546,7 +523,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; @@ -562,13 +539,13 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. CallInst *NewCI = - B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + B.CreateMemCpy(Dst, Align(1), Src, Align(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); NewCI->setAttributes(CI->getAttributes()); return Dst; } -Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) @@ -590,13 +567,12 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - CallInst *NewCI = - B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), LenV); + CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV); NewCI->setAttributes(CI->getAttributes()); return DstEnd; } -Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -626,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { if (SrcLen == 0) { // strncpy(x, "", y) -> memset(align 1 x, '\0', y) - CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align::None()); + CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1)); AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); @@ -639,13 +615,13 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] - CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), ConstantInt::get(DL.getIntPtrType(PT), Len)); NewCI->setAttributes(CI->getAttributes()); return Dst; } -Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, +Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, unsigned CharSize) { Value *Src = CI->getArgOperand(0); @@ -736,14 +712,14 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, return nullptr; } -Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) { if (Value *V = optimizeStringLength(CI, B, 8)) return V; annotateNonNullBasedOnAccess(CI, 0); return nullptr; } -Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, 
IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) { Module &M = *CI->getModule(); unsigned WCharSize = TLI->getWCharSize(M) * 8; // We cannot perform this optimization without wchar_size metadata. @@ -753,7 +729,7 @@ Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) { return optimizeStringLength(CI, B, WCharSize); } -Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) { StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -780,7 +756,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) { Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { // With a null EndPtr, this function won't capture the main argument. @@ -791,7 +767,7 @@ Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) { StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -812,7 +788,7 @@ Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) { StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -836,7 +812,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) { // fold strstr(x, x) -> x. if (CI->getArgOperand(0) == CI->getArgOperand(1)) return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); @@ -893,13 +869,13 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) { if (isKnownNonZero(CI->getOperand(2), DL)) annotateNonNullBasedOnAccess(CI, 0); return nullptr; } -Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); annotateNonNullAndDereferenceable(CI, 0, Size, DL); @@ -988,7 +964,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { } static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, - uint64_t Len, IRBuilder<> &B, + uint64_t Len, IRBuilderBase &B, const DataLayout &DL) { if (Len == 0) // memcmp(s1,s2,0) -> 0 return Constant::getNullValue(CI->getType()); @@ -1065,7 +1041,7 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, // Most simplifications for memcmp also apply to bcmp. 
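optimizeMemCmpConstantSize, whose updated signature appears above, exploits identities such as memcmp(s1, s2, 0) -> 0 and the reduction of tiny fixed-size compares to direct byte arithmetic. Since the C standard only specifies the sign of memcmp's result, the quick check below (an illustration, not library code) compares signs:

    #include <cassert>
    #include <cstring>

    static int sign(int X) { return (X > 0) - (X < 0); }

    int main() {
      const char A[] = "ax", B[] = "bx";
      // memcmp(s1, s2, 0) -> 0, regardless of what the pointers reference.
      assert(std::memcmp(A, B, 0) == 0);
      // A one-byte compare carries the same sign as the byte difference.
      int Diff = (int)(unsigned char)A[0] - (int)(unsigned char)B[0];
      assert(sign(std::memcmp(A, B, 1)) == sign(Diff));
    }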
Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -1088,7 +1064,7 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, return nullptr; } -Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { if (Value *V = optimizeMemCmpBCmpCommon(CI, B)) return V; @@ -1105,24 +1081,24 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) { return optimizeMemCmpBCmpCommon(CI, B); } -Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); if (isa<IntrinsicInst>(CI)) return nullptr; // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) - CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), - CI->getArgOperand(1), Align::None(), Size); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1), + CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } -Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2)); @@ -1146,8 +1122,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); if (Pos == StringRef::npos) { if (N->getZExtValue() <= SrcStr.size()) { - B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), - CI->getArgOperand(3)); + B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3)); return Constant::getNullValue(CI->getType()); } return nullptr; @@ -1156,37 +1131,37 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { Value *NewN = ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); // memccpy -> llvm.memcpy - B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), NewN); + B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN); return Pos + 1 <= N->getZExtValue() ? 
B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) : Constant::getNullValue(CI->getType()); } -Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0); Value *N = CI->getArgOperand(2); // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n - CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), CI->getArgOperand(1), - Align::None(), N); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N); NewCI->setAttributes(CI->getAttributes()); return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); } -Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); if (isa<IntrinsicInst>(CI)) return nullptr; // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) - CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), - CI->getArgOperand(1), Align::None(), Size); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1), + CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n). -Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { +Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) { // This has to be a memset of zeros (bzero). auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1)); if (!FillValue || FillValue->getZExtValue() != 0) @@ -1229,7 +1204,7 @@ Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); annotateNonNullAndDereferenceable(CI, 0, Size, DL); if (isa<IntrinsicInst>(CI)) @@ -1240,13 +1215,12 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - CallInst *NewCI = - B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align::None()); + CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } -Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { if (isa<ConstantPointerNull>(CI->getArgOperand(0))) return emitMalloc(CI->getArgOperand(1), B, DL, TLI); @@ -1258,9 +1232,10 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) { //===----------------------------------------------------------------------===// // Replace a libcall \p CI with a call to intrinsic \p IID -static Value *replaceUnaryCall(CallInst *CI, IRBuilder<> &B, Intrinsic::ID IID) { +static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B, + Intrinsic::ID IID) { // Propagate fast-math flags from the existing call to the new call. - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); Module *M = CI->getModule(); @@ -1294,7 +1269,7 @@ static Value *valueHasFloatPrecision(Value *Val) { } /// Shrink double -> float functions. 
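The "shrink double -> float" helpers that follow replace a double-precision libcall with its float counterpart when the argument is just a promoted float and the result is truncated straight back. The two forms can differ in the last bit, which is why the code gates the rewrite on isPrecise/fast-math; a small illustration of the shape being matched (value chosen arbitrarily):

    #include <cmath>
    #include <cstdio>

    int main() {
      float X = 0.5f;
      // Before shrinking: promote, call the double routine, truncate.
      float Before = (float)std::sin((double)X);
      // After shrinking: call the float overload directly.
      float After = std::sin(X);
      // Usually identical, but not guaranteed bit-for-bit.
      std::printf("%a vs %a\n", Before, After);
    }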
-static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, +static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, bool isBinary, bool isPrecise = false) { Function *CalleeFn = CI->getCalledFunction(); if (!CI->getType()->isDoubleTy() || !CalleeFn) @@ -1333,7 +1308,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, } // Propagate the math semantics from the current function to the new function. - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); // g((double) float) -> (double) gf(float) @@ -1352,24 +1327,24 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, } /// Shrink double -> float for unary functions. -static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, +static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B, bool isPrecise = false) { return optimizeDoubleFP(CI, B, false, isPrecise); } /// Shrink double -> float for binary functions. -static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B, +static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B, bool isPrecise = false) { return optimizeDoubleFP(CI, B, true, isPrecise); } // cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z))) -Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { if (!CI->isFast()) return nullptr; // Propagate fast-math flags from the existing call to new instructions. - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); Value *Real, *Imag; @@ -1393,11 +1368,11 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilder<> &B) { } static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, - IRBuilder<> &B) { + IRBuilderBase &B) { if (!isa<FPMathOperator>(Call)) return nullptr; - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(Call->getFastMathFlags()); // TODO: Can this be shared to also handle LLVM intrinsics? @@ -1427,7 +1402,7 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, return nullptr; } -static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) { // Multiplications calculated using Addition Chains. // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html @@ -1453,7 +1428,7 @@ static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { } // Return a properly extended 32-bit integer if the operation is an itofp. -static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) { +static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) { if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) { Value *Op = cast<Instruction>(I2F)->getOperand(0); // Make sure that the exponent fits inside an int32_t, @@ -1471,9 +1446,9 @@ static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) { /// Use exp{,2}(x * y) for pow(exp{,2}(x), y); /// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); /// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). 
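The replacePowWithExp hunk that follows now applies exp2(log2(n) * x) for any finite, positive constant base, guarded by the approx-func and no-NaNs fast-math flags because the identity breaks down for zero, negative or non-finite bases. A numeric check of the identity for a benign base (values invented):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double Base = 6.0;   // finite, positive constant base
      for (double X = -2.0; X <= 2.0; X += 0.5) {
        double Exact = std::pow(Base, X);
        double Approx = std::exp2(std::log2(Base) * X);
        std::printf("x=% .2f  pow=%.17g  exp2(log2*x)=%.17g\n", X, Exact, Approx);
      }
    }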
-Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { +Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); - AttributeList Attrs = Pow->getCalledFunction()->getAttributes(); + AttributeList Attrs; // Attributes are only meaningful on the original call Module *Mod = Pow->getModule(); Type *Ty = Pow->getType(); bool Ignored; @@ -1588,9 +1563,14 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs); - // pow(n, x) -> exp2(log2(n) * x) - if (Pow->hasOneUse() && Pow->hasApproxFunc() && Pow->hasNoNaNs() && - Pow->hasNoInfs() && BaseF->isNormal() && !BaseF->isNegative()) { + // pow(x, y) -> exp2(log2(x) * y) + if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() && + !BaseF->isNegative()) { + // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN. + // Luckily optimizePow has already handled the x == 1 case. + assert(!match(Base, m_FPOne()) && + "pow(1.0, y) should have been simplified earlier!"); + Value *Log = nullptr; if (Ty->isFloatTy()) Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat())); @@ -1612,7 +1592,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { } static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, - Module *M, IRBuilder<> &B, + Module *M, IRBuilderBase &B, const TargetLibraryInfo *TLI) { // If errno is never set, then use the intrinsic for sqrt(). if (NoErrno) { @@ -1633,9 +1613,9 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } /// Use square root in place of pow(x, +/-0.5). -Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) { +Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) { Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); - AttributeList Attrs = Pow->getCalledFunction()->getAttributes(); + AttributeList Attrs; // Attributes are only meaningful on the original call Module *Mod = Pow->getModule(); Type *Ty = Pow->getType(); @@ -1676,13 +1656,13 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) { } static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M, - IRBuilder<> &B) { + IRBuilderBase &B) { Value *Args[] = {Base, Expo}; Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType()); return B.CreateCall(F, Args); } -Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { Value *Base = Pow->getArgOperand(0); Value *Expo = Pow->getArgOperand(1); Function *Callee = Pow->getCalledFunction(); @@ -1693,12 +1673,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { bool AllowApprox = Pow->hasApproxFunc(); bool Ignored; - // Bail out if simplifying libcalls to pow() is disabled. - if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) - return nullptr; - // Propagate the math semantics from the call to any created instructions. 
- IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(Pow->getFastMathFlags()); // Shrink pow() to powf() if the arguments are single precision, @@ -1748,7 +1724,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // be different) and it should also consider optimizing for size. APFloat LimF(ExpoF->getSemantics(), 33), ExpoA(abs(*ExpoF)); - if (ExpoA.compare(LimF) == APFloat::cmpLessThan) { + if (ExpoA < LimF) { // This transformation applies to integer or integer+0.5 exponents only. // For integer+0.5, we create a sqrt(Base) call. Value *Sqrt = nullptr; @@ -1807,8 +1783,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { return Shrunk; } -Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); + AttributeList Attrs; // Attributes are only meaningful on the original call StringRef Name = Callee->getName(); Value *Ret = nullptr; if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && @@ -1825,13 +1802,13 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { if (Value *Exp = getIntToFPVal(Op, B)) return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, - B, CI->getCalledFunction()->getAttributes()); + B, Attrs); } return Ret; } -Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { // If we can shrink the call to a float function rather than a double // function, do that first. Function *Callee = CI->getCalledFunction(); @@ -1847,7 +1824,7 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { // "Ideally, fmax would be sensitive to the sign of zero, for example // fmax(-0.0, +0.0) would return +0; however, implementation in software // might be impractical." 
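optimizeExp2 above rewrites exp2 of a sign- or zero-extended integer into ldexp(1.0, n): scaling 1.0 by 2^n is exact for exponents in range, which is what getIntToFPVal's int32 check guarantees. A small confirmation; it prints rather than asserts because the C standard does not require exp2 to be correctly rounded:

    #include <cmath>
    #include <cstdio>

    int main() {
      for (int N = -4; N <= 4; ++N)
        std::printf("exp2(%2d) = %.17g   ldexp(1.0, %2d) = %.17g\n",
                    N, std::exp2((double)N), N, std::ldexp(1.0, N));
    }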
- IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); FastMathFlags FMF = CI->getFastMathFlags(); FMF.setNoSignedZeros(); B.setFastMathFlags(FMF); @@ -1858,9 +1835,9 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); } -Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Function *LogFn = Log->getCalledFunction(); - AttributeList Attrs = LogFn->getAttributes(); + AttributeList Attrs; // Attributes are only meaningful on the original call StringRef LogNm = LogFn->getName(); Intrinsic::ID LogID = LogFn->getIntrinsicID(); Module *Mod = Log->getModule(); @@ -1963,12 +1940,12 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { } else return Ret; - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(FastMathFlags::getFast()); Intrinsic::ID ArgID = Arg->getIntrinsicID(); LibFunc ArgLb = NotLibFunc; - TLI->getLibFunc(Arg, ArgLb); + TLI->getLibFunc(*Arg, ArgLb); // log(pow(x,y)) -> y*log(x) if (ArgLb == PowLb || ArgID == Intrinsic::pow) { @@ -2010,7 +1987,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { return Ret; } -Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; // TODO: Once we have a way (other than checking for the existince of the @@ -2058,7 +2035,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { // Fast math flags for any created instructions should match the sqrt // and multiply. - IRBuilder<>::FastMathFlagGuard Guard(B); + IRBuilderBase::FastMathFlagGuard Guard(B); B.setFastMathFlags(I->getFastMathFlags()); // If we found a repeated factor, hoist it out of the square root and @@ -2079,7 +2056,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { } // TODO: Generalize to handle any trig function and its inverse. -Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); @@ -2116,7 +2093,7 @@ static bool isTrigLibCall(CallInst *CI) { CI->hasFnAttr(Attribute::ReadNone); } -static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, +static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos) { Type *ArgTy = Arg->getType(); @@ -2131,7 +2108,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // x86_64 can't use {float, float} since that would be returned in both // xmm0 and xmm1, which isn't what a real struct would do. ResTy = T.getArch() == Triple::x86_64 - ? static_cast<Type *>(VectorType::get(ArgTy, 2)) + ? 
static_cast<Type *>(FixedVectorType::get(ArgTy, 2)) : static_cast<Type *>(StructType::get(ArgTy, ArgTy)); } else { Name = "__sincospi_stret"; @@ -2166,7 +2143,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, } } -Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. if (!isTrigLibCall(CI)) @@ -2247,7 +2224,7 @@ void LibCallSimplifier::classifyArgUse( // Integer Library Call Optimizations //===----------------------------------------------------------------------===// -Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) { // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0 Value *Op = CI->getArgOperand(0); Type *ArgType = Op->getType(); @@ -2261,7 +2238,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { return B.CreateSelect(Cond, V, B.getInt32(0)); } -Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) { // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false)) Value *Op = CI->getArgOperand(0); Type *ArgType = Op->getType(); @@ -2273,7 +2250,7 @@ Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) { return B.CreateIntCast(V, CI->getType(), false); } -Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) { // abs(x) -> x <s 0 ? -x : x // The negation has 'nsw' because abs of INT_MIN is undefined. Value *X = CI->getArgOperand(0); @@ -2282,7 +2259,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { return B.CreateSelect(IsNeg, NegX, X); } -Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) { // isdigit(c) -> (c-'0') <u 10 Value *Op = CI->getArgOperand(0); Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); @@ -2290,20 +2267,20 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { return B.CreateZExt(Op, CI->getType()); } -Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) { // isascii(c) -> c <u 128 Value *Op = CI->getArgOperand(0); Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); return B.CreateZExt(Op, CI->getType()); } -Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) { // toascii(c) -> c & 0x7f return B.CreateAnd(CI->getArgOperand(0), ConstantInt::get(CI->getType(), 0x7F)); } -Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) { StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; @@ -2311,7 +2288,7 @@ Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) { return convertStrToNumber(CI, Str, 10); } -Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; @@ -2332,7 +2309,7 @@ Value 
*LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) { static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg); -Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, +Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B, int StreamArg) { Function *Callee = CI->getCalledFunction(); // Error reporting calls should be cold, mark them as such. @@ -2372,7 +2349,7 @@ static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { return GV->getName() == "stderr"; } -Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) @@ -2425,7 +2402,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); @@ -2462,7 +2439,8 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, + IRBuilderBase &B) { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) @@ -2477,8 +2455,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) B.CreateMemCpy( - CI->getArgOperand(0), Align::None(), CI->getArgOperand(1), - Align::None(), + CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1)); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -2515,8 +2492,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), - Align::None(), IncLen); + B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2), + Align(1), IncLen); // The sprintf result is the unincremented number of bytes in the string. 
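The sprintf folds above turn a "%s"-only call into a strlen plus a memcpy that also copies the terminating NUL, and use the unincremented length as the return value. A stand-alone restatement of that equivalence (buffer sizes picked arbitrarily):

    #include <cassert>
    #include <cstdio>
    #include <cstring>

    int main() {
      const char *Src = "hello";
      char A[32], B[32];

      int Ret = std::sprintf(A, "%s", Src);

      std::size_t Len = std::strlen(Src);
      std::memcpy(B, Src, Len + 1);    // the +1 copies the NUL byte too

      assert(Ret == (int)Len);         // sprintf returns the length without the NUL
      assert(std::strcmp(A, B) == 0);
    }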
return B.CreateIntCast(Len, CI->getType(), false); @@ -2524,7 +2501,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeSPrintFString(CI, B)) { @@ -2560,7 +2537,8 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, + IRBuilderBase &B) { // Check for size ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!Size) @@ -2587,8 +2565,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) B.CreateMemCpy( - CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), - Align::None(), + CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1)); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -2629,9 +2606,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(3), - Align::None(), - ConstantInt::get(CI->getType(), Str.size() + 1)); + B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3), + Align(1), ConstantInt::get(CI->getType(), Str.size() + 1)); // The snprintf result is the unincremented number of bytes in the string. return ConstantInt::get(CI->getType(), Str.size()); @@ -2640,7 +2616,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) { if (Value *V = optimizeSnPrintFString(CI, B)) { return V; } @@ -2650,7 +2626,8 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, + IRBuilderBase &B) { optimizeErrorReporting(CI, B, 0); // All the optimizations depend on the format string. @@ -2699,7 +2676,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeFPrintFString(CI, B)) { @@ -2734,7 +2711,7 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) { optimizeErrorReporting(CI, B, 3); // Get the element size and count. 
@@ -2757,15 +2734,10 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { } } - if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI)) - return emitFWriteUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), B, DL, - TLI); - return nullptr; } -Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) { optimizeErrorReporting(CI, B, 1); // Don't rewrite fputs to fwrite when optimising for size because fwrite @@ -2776,15 +2748,9 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { if (OptForSize) return nullptr; - // Check if has any use - if (!CI->use_empty()) { - if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI)) - return emitFPutSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B, - TLI); - else - // We can't optimize if return value is used. - return nullptr; - } + // We can't optimize if return value is used. + if (!CI->use_empty()) + return nullptr; // fputs(s,F) --> fwrite(s,strlen(s),1,F) uint64_t Len = GetStringLength(CI->getArgOperand(0)); @@ -2798,41 +2764,7 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { CI->getArgOperand(1), B, DL, TLI); } -Value *LibCallSimplifier::optimizeFPutc(CallInst *CI, IRBuilder<> &B) { - optimizeErrorReporting(CI, B, 1); - - if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI)) - return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B, - TLI); - - return nullptr; -} - -Value *LibCallSimplifier::optimizeFGetc(CallInst *CI, IRBuilder<> &B) { - if (isLocallyOpenedFile(CI->getArgOperand(0), CI, B, TLI)) - return emitFGetCUnlocked(CI->getArgOperand(0), B, TLI); - - return nullptr; -} - -Value *LibCallSimplifier::optimizeFGets(CallInst *CI, IRBuilder<> &B) { - if (isLocallyOpenedFile(CI->getArgOperand(2), CI, B, TLI)) - return emitFGetSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); - - return nullptr; -} - -Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) { - if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI)) - return emitFReadUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), B, DL, - TLI); - - return nullptr; -} - -Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { annotateNonNullBasedOnAccess(CI, 0); if (!CI->use_empty()) return nullptr; @@ -2846,11 +2778,10 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } -Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) - return B.CreateMemMove(CI->getArgOperand(1), Align::None(), - CI->getArgOperand(0), Align::None(), - CI->getArgOperand(2)); + return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0), + Align(1), CI->getArgOperand(2)); } bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { @@ -2863,7 +2794,7 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { } Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, - IRBuilder<> &Builder) { + IRBuilderBase &Builder) { LibFunc Func; Function *Callee = CI->getCalledFunction(); // Check for string/memory library functions. 
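optimizeFPuts above now fires only when the call's result is unused, rewriting fputs(s, F) into fwrite(s, strlen(s), 1, F) once the string length is a known constant (the locally-opened-file "unlocked" variants were dropped). The library-level equivalence it relies on, sketched against a temporary file:

    #include <cstdio>
    #include <cstring>

    int main() {
      const char *S = "hello world\n";   // length known at compile time here
      std::FILE *F = std::tmpfile();
      if (!F)
        return 1;
      // Both calls write the same bytes; the fold only applies when their
      // (different) return values are unused.
      std::fputs(S, F);
      std::fwrite(S, std::strlen(S), 1, F);
      std::fclose(F);
    }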
@@ -2944,7 +2875,7 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func, - IRBuilder<> &Builder) { + IRBuilderBase &Builder) { // Don't optimize calls that require strict floating point semantics. if (CI->isStrictFP()) return nullptr; @@ -3000,6 +2931,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, return replaceUnaryCall(CI, Builder, Intrinsic::floor); case LibFunc_round: return replaceUnaryCall(CI, Builder, Intrinsic::round); + case LibFunc_roundeven: + return replaceUnaryCall(CI, Builder, Intrinsic::roundeven); case LibFunc_nearbyint: return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint); case LibFunc_rint: @@ -3044,7 +2977,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } } -Value *LibCallSimplifier::optimizeCall(CallInst *CI) { +Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. @@ -3053,11 +2986,13 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc Func; Function *Callee = CI->getCalledFunction(); + bool isCallingConvC = isCallingConvCCompatible(CI); SmallVector<OperandBundleDef, 2> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); - IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); - bool isCallingConvC = isCallingConvCCompatible(CI); + + IRBuilderBase::OperandBundlesGuard Guard(Builder); + Builder.setDefaultOperandBundles(OpBundles); // Command-line parameter overrides instruction attribute. // This can't be moved to optimizeFloatingPointLibCall() because it may be @@ -3097,14 +3032,20 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { } // Also try to simplify calls to fortified library functions. - if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { + if (Value *SimplifiedFortifiedCI = + FortifiedSimplifier.optimizeCall(CI, Builder)) { // Try to further simplify the result. CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { - // Use an IR Builder from SimplifiedCI if available instead of CI - // to guarantee we reach all uses we might replace later on. - IRBuilder<> TmpBuilder(SimplifiedCI); - if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { + // Ensure that SimplifiedCI's uses are complete, since some calls have + // their uses analyzed. + replaceAllUsesWith(CI, SimplifiedCI); + + // Set insertion point to SimplifiedCI to guarantee we reach all uses + // we might replace later on. + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(SimplifiedCI); + if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { // If we were able to further simplify, remove the now redundant call. 
substituteInParent(SimplifiedCI, V); return V; @@ -3158,16 +3099,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeFPrintF(CI, Builder); case LibFunc_fwrite: return optimizeFWrite(CI, Builder); - case LibFunc_fread: - return optimizeFRead(CI, Builder); case LibFunc_fputs: return optimizeFPuts(CI, Builder); - case LibFunc_fgets: - return optimizeFGets(CI, Builder); - case LibFunc_fputc: - return optimizeFPutc(CI, Builder); - case LibFunc_fgetc: - return optimizeFGetc(CI, Builder); case LibFunc_puts: return optimizePuts(CI, Builder); case LibFunc_perror: @@ -3280,11 +3213,11 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), - CI->getArgOperand(1), Align::None(), - CI->getArgOperand(2)); + CallInst *NewCI = + B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1), + Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -3292,11 +3225,11 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), - CI->getArgOperand(1), Align::None(), - CI->getArgOperand(2)); + CallInst *NewCI = + B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1), + Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -3304,13 +3237,13 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { // TODO: Try foldMallocMemset() here. 
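The TODO above refers to foldMallocMemset, which fuses a malloc that is immediately cleared with memset into a single calloc. At the C level both sides produce identically zeroed storage; calloc's extra overflow check on the element count is harmless with a count of 1:

    #include <cstdlib>
    #include <cstring>

    // Before the fold: allocate, then clear.
    void *zeroedBufferA(std::size_t N) {
      void *P = std::malloc(N);
      if (P)
        std::memset(P, 0, N);
      return P;
    }

    // After the fold: a single calloc(1, N).
    void *zeroedBufferB(std::size_t N) {
      return std::calloc(1, N);
    }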
if (isFortifiedCallFoldable(CI, 3, 2)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, - CI->getArgOperand(2), Align::None()); + CI->getArgOperand(2), Align(1)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -3318,7 +3251,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, - IRBuilder<> &B, + IRBuilderBase &B, LibFunc Func) { const DataLayout &DL = CI->getModule()->getDataLayout(); Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1), @@ -3362,8 +3295,16 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, return Ret; } +Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI, + IRBuilderBase &B) { + if (isFortifiedCallFoldable(CI, 1, None, 0)) + return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(), + TLI); + return nullptr; +} + Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, - IRBuilder<> &B, + IRBuilderBase &B, LibFunc Func) { if (isFortifiedCallFoldable(CI, 3, 2)) { if (Func == LibFunc_strncpy_chk) @@ -3378,7 +3319,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 4, 3)) return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3), B, TLI); @@ -3387,7 +3328,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) { SmallVector<Value *, 8> VariadicArgs(CI->arg_begin() + 5, CI->arg_end()); return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), @@ -3398,7 +3339,7 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) { SmallVector<Value *, 8> VariadicArgs(CI->arg_begin() + 4, CI->arg_end()); return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs, @@ -3409,7 +3350,7 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2)) return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI); @@ -3417,7 +3358,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, TLI); @@ -3426,7 +3367,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, TLI); @@ -3435,7 +3376,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if 
(isFortifiedCallFoldable(CI, 3)) return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, TLI); @@ -3444,7 +3385,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4), CI->getArgOperand(5), B, TLI); @@ -3453,7 +3394,7 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, } Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI, - IRBuilder<> &B) { + IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), CI->getArgOperand(4), B, TLI); @@ -3461,7 +3402,8 @@ Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI, return nullptr; } -Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { +Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI, + IRBuilderBase &Builder) { // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here. // Some clang users checked for _chk libcall availability using: // __has_builtin(__builtin___memcpy_chk) @@ -3477,11 +3419,13 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc Func; Function *Callee = CI->getCalledFunction(); + bool isCallingConvC = isCallingConvCCompatible(CI); SmallVector<OperandBundleDef, 2> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); - IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); - bool isCallingConvC = isCallingConvCCompatible(CI); + + IRBuilderBase::OperandBundlesGuard Guard(Builder); + Builder.setDefaultOperandBundles(OpBundles); // First, check that this is a known library functions and that the prototype // is correct. 
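Both simplifiers now receive the caller's IRBuilderBase and scope their temporary state with RAII guards instead of constructing a fresh IRBuilder<> per call, as the optimizeCall hunks above show. A minimal caller-side sketch of that pattern (the helper name is hypothetical; the guard classes and setters are the ones used above):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative sketch only: emit replacement IR at CI through a builder owned
// by the caller, restoring the builder's state when the guards go out of scope.
static Value *simplifyAt(CallInst *CI, IRBuilderBase &Builder) {
  // Restores the previous insertion point and debug location on scope exit.
  IRBuilderBase::InsertPointGuard IPGuard(Builder);
  Builder.SetInsertPoint(CI);

  // Propagate CI's operand bundles to any calls created here, restoring the
  // previous default bundles afterwards.
  SmallVector<OperandBundleDef, 2> OpBundles;
  CI->getOperandBundlesAsDefs(OpBundles);
  IRBuilderBase::OperandBundlesGuard OBGuard(Builder);
  Builder.setDefaultOperandBundles(OpBundles);

  // ... build the simplified sequence with Builder here ...
  return nullptr; // nullptr means "no simplification", as in the code above
}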
@@ -3502,6 +3446,8 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { case LibFunc_stpcpy_chk: case LibFunc_strcpy_chk: return optimizeStrpCpyChk(CI, Builder, Func); + case LibFunc_strlen_chk: + return optimizeStrLenChk(CI, Builder); case LibFunc_stpncpy_chk: case LibFunc_strncpy_chk: return optimizeStrpNCpyChk(CI, Builder, Func); diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp index d2a400027d4b7..e257c5a015f51 100644 --- a/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -24,10 +24,25 @@ cl::opt<bool> PGSOLargeWorkingSetSizeOnly( "if the working set size is large (except for cold code.)")); cl::opt<bool> PGSOColdCodeOnly( - "pgso-cold-code-only", cl::Hidden, cl::init(true), + "pgso-cold-code-only", cl::Hidden, cl::init(false), cl::desc("Apply the profile guided size optimizations only " "to cold code.")); +cl::opt<bool> PGSOColdCodeOnlyForInstrPGO( + "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only " + "to cold code under instrumentation PGO.")); + +cl::opt<bool> PGSOColdCodeOnlyForSamplePGO( + "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only " + "to cold code under sample PGO.")); + +cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO( + "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only " + "to cold code under partial-profile sample PGO.")); + cl::opt<bool> PGSOIRPassOrTestOnly( "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false), cl::desc("Apply the profile guided size optimizations only" @@ -38,12 +53,12 @@ cl::opt<bool> ForcePGSO( cl::desc("Force the (profiled-guided) size optimizations. 
")); cl::opt<int> PgsoCutoffInstrProf( - "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore, + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore, cl::desc("The profile guided size optimization profile summary cutoff " "for instrumentation profile.")); cl::opt<int> PgsoCutoffSampleProf( - "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore, + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore, cl::desc("The profile guided size optimization profile summary cutoff " "for sample profile.")); @@ -60,6 +75,12 @@ struct BasicBlockBFIAdapter { BlockFrequencyInfo &BFI) { return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI); } + static bool isFunctionColdInCallGraphNthPercentile(int CutOff, + const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI); + } static bool isColdBlock(const BasicBlock *BB, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { @@ -71,6 +92,11 @@ struct BasicBlockBFIAdapter { BlockFrequencyInfo *BFI) { return PSI->isHotBlockNthPercentile(CutOff, BB, BFI); } + static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isColdBlockNthPercentile(CutOff, BB, BFI); + } }; } // end anonymous namespace @@ -84,6 +110,7 @@ bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, PGSOQueryType QueryType) { + assert(BB); return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI, QueryType); } diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp index 7880ea1c6c479..b559811d120bc 100644 --- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp @@ -48,7 +48,7 @@ bool StripGCRelocates::runOnFunction(Function &F) { // i.e. not bound to a single statepoint token. for (Instruction &I : instructions(F)) { if (auto *GCR = dyn_cast<GCRelocateInst>(&I)) - if (isStatepoint(GCR->getOperand(0))) + if (isa<GCStatepointInst>(GCR->getOperand(0))) GCRelocates.push_back(GCR); } // All gc.relocates are bound to a single statepoint token. The order of diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index aacf81d835193..ec4ea848a5d4a 100644 --- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -117,8 +117,9 @@ public: const std::string Target; ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked) - : RewriteDescriptor(DT), Source(Naked ? StringRef("\01" + S.str()) : S), - Target(T) {} + : RewriteDescriptor(DT), + Source(std::string(Naked ? 
StringRef("\01" + S.str()) : S)), + Target(std::string(T)) {} bool performOnModule(Module &M) override; @@ -159,7 +160,8 @@ public: const std::string Transform; PatternRewriteDescriptor(StringRef P, StringRef T) - : RewriteDescriptor(DT), Pattern(P), Transform(T) { } + : RewriteDescriptor(DT), Pattern(std::string(P)), + Transform(std::string(T)) {} bool performOnModule(Module &M) override; @@ -189,7 +191,7 @@ performOnModule(Module &M) { continue; if (GlobalObject *GO = dyn_cast<GlobalObject>(&C)) - rewriteComdat(M, GO, C.getName(), Name); + rewriteComdat(M, GO, std::string(C.getName()), Name); if (Value *V = (M.*Get)(Name)) C.setValueName(V->getValueName()); @@ -352,19 +354,19 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, if (KeyValue.equals("source")) { std::string Error; - Source = Value->getValue(ValueStorage); + Source = std::string(Value->getValue(ValueStorage)); if (!Regex(Source).isValid(Error)) { YS.printError(Field.getKey(), "invalid regex: " + Error); return false; } } else if (KeyValue.equals("target")) { - Target = Value->getValue(ValueStorage); + Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue.equals("transform")) { - Transform = Value->getValue(ValueStorage); + Transform = std::string(Value->getValue(ValueStorage)); } else if (KeyValue.equals("naked")) { std::string Undecorated; - Undecorated = Value->getValue(ValueStorage); + Undecorated = std::string(Value->getValue(ValueStorage)); Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1"; } else { YS.printError(Field.getKey(), "unknown key for function"); @@ -421,15 +423,15 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, if (KeyValue.equals("source")) { std::string Error; - Source = Value->getValue(ValueStorage); + Source = std::string(Value->getValue(ValueStorage)); if (!Regex(Source).isValid(Error)) { YS.printError(Field.getKey(), "invalid regex: " + Error); return false; } } else if (KeyValue.equals("target")) { - Target = Value->getValue(ValueStorage); + Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue.equals("transform")) { - Transform = Value->getValue(ValueStorage); + Transform = std::string(Value->getValue(ValueStorage)); } else { YS.printError(Field.getKey(), "unknown Key for Global Variable"); return false; @@ -484,15 +486,15 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, if (KeyValue.equals("source")) { std::string Error; - Source = Value->getValue(ValueStorage); + Source = std::string(Value->getValue(ValueStorage)); if (!Regex(Source).isValid(Error)) { YS.printError(Field.getKey(), "invalid regex: " + Error); return false; } } else if (KeyValue.equals("target")) { - Target = Value->getValue(ValueStorage); + Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue.equals("transform")) { - Transform = Value->getValue(ValueStorage); + Transform = std::string(Value->getValue(ValueStorage)); } else { YS.printError(Field.getKey(), "unknown key for Global Alias"); return false; diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp new file mode 100644 index 0000000000000..b10deee3907c7 --- /dev/null +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -0,0 +1,220 @@ +//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// For each natural loop with multiple exit blocks, this pass creates a new +// block N such that all exiting blocks now branch to N, and then control flow +// is redistributed to all the original exit blocks. +// +// Limitation: This assumes that all terminators in the CFG are direct branches +// (the "br" instruction). The presence of any other control flow +// such as indirectbr, switch or callbr will cause an assert. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "unify-loop-exits" + +using namespace llvm; + +namespace { +struct UnifyLoopExits : public FunctionPass { + static char ID; + UnifyLoopExits() : FunctionPass(ID) { + initializeUnifyLoopExitsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(LowerSwitchID); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreservedID(LowerSwitchID); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char UnifyLoopExits::ID = 0; + +FunctionPass *llvm::createUnifyLoopExitsPass() { return new UnifyLoopExits(); } + +INITIALIZE_PASS_BEGIN(UnifyLoopExits, "unify-loop-exits", + "Fixup each natural loop to have a single exit block", + false /* Only looks at CFG */, false /* Analysis Pass */) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(UnifyLoopExits, "unify-loop-exits", + "Fixup each natural loop to have a single exit block", + false /* Only looks at CFG */, false /* Analysis Pass */) + +// The current transform introduces new control flow paths which may break the +// SSA requirement that every def must dominate all its uses. For example, +// consider a value D defined inside the loop that is used by some instruction +// U outside the loop. It follows that D dominates U, since the original +// program has valid SSA form. After merging the exits, all paths from D to U +// now flow through the unified exit block. In addition, there may be other +// paths that do not pass through D, but now reach the unified exit +// block. Thus, D no longer dominates U. +// +// Restore the dominance by creating a phi for each such D at the new unified +// loop exit. But when doing this, ignore any uses U that are in the new unified +// loop exit, since those were introduced specially when the block was created. +// +// The use of SSAUpdater seems like overkill for this operation. The location +// for creating the new PHI is well-known, and also the set of incoming blocks +// to the new PHI. 
+static void restoreSSA(const DominatorTree &DT,
+                       const Loop *L,
+                       const SetVector<BasicBlock *> &Incoming,
+                       BasicBlock *LoopExitBlock) {
+  using InstVector = SmallVector<Instruction *, 8>;
+  using IIMap = DenseMap<Instruction *, InstVector>;
+  IIMap ExternalUsers;
+  for (auto BB : L->blocks()) {
+    for (auto &I : *BB) {
+      for (auto &U : I.uses()) {
+        auto UserInst = cast<Instruction>(U.getUser());
+        auto UserBlock = UserInst->getParent();
+        if (UserBlock == LoopExitBlock)
+          continue;
+        if (L->contains(UserBlock))
+          continue;
+        LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
+                          << BB->getName() << ")"
+                          << ": " << UserInst->getName() << "("
+                          << UserBlock->getName() << ")"
+                          << "\n");
+        ExternalUsers[&I].push_back(UserInst);
+      }
+    }
+  }
+
+  for (auto II : ExternalUsers) {
+    // For each Def used outside the loop, create NewPhi in
+    // LoopExitBlock. NewPhi receives Def only along exiting blocks that
+    // dominate it, while the remaining values are undefined since those paths
+    // didn't exist in the original CFG.
+    auto Def = II.first;
+    LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
+    auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
+                                  Def->getName() + ".moved",
+                                  LoopExitBlock->getTerminator());
+    for (auto In : Incoming) {
+      LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
+      if (Def->getParent() == In || DT.dominates(Def, In)) {
+        LLVM_DEBUG(dbgs() << "dominated\n");
+        NewPhi->addIncoming(Def, In);
+      } else {
+        LLVM_DEBUG(dbgs() << "not dominated\n");
+        NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "external users:");
+    for (auto U : II.second) {
+      LLVM_DEBUG(dbgs() << " " << U->getName());
+      U->replaceUsesOfWith(Def, NewPhi);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+  }
+}
+
+static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
+  // To unify the loop exits, we need a list of the exiting blocks as
+  // well as exit blocks. The functions for locating these lists both
+  // traverse the entire loop body. It is more efficient to first
+  // locate the exiting blocks and then examine their successors to
+  // locate the exit blocks.
+  SetVector<BasicBlock *> ExitingBlocks;
+  SetVector<BasicBlock *> Exits;
+
+  // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
+  SmallVector<BasicBlock *, 8> Temp;
+  L->getExitingBlocks(Temp);
+  for (auto BB : Temp) {
+    ExitingBlocks.insert(BB);
+    for (auto S : successors(BB)) {
+      auto SL = LI.getLoopFor(S);
+      // A successor is not an exit if it is directly or indirectly in the
+      // current loop.
+ if (SL == L || L->contains(SL)) + continue; + Exits.insert(S); + } + } + + LLVM_DEBUG( + dbgs() << "Found exit blocks:"; + for (auto Exit : Exits) { + dbgs() << " " << Exit->getName(); + } + dbgs() << "\n"; + + dbgs() << "Found exiting blocks:"; + for (auto EB : ExitingBlocks) { + dbgs() << " " << EB->getName(); + } + dbgs() << "\n";); + + if (Exits.size() <= 1) { + LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n"); + return false; + } + + SmallVector<BasicBlock *, 8> GuardBlocks; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks, + Exits, "loop.exit"); + + restoreSSA(DT, L, ExitingBlocks, LoopExitBlock); + +#if defined(EXPENSIVE_CHECKS) + assert(DT.verify(DominatorTree::VerificationLevel::Full)); +#else + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); +#endif // EXPENSIVE_CHECKS + L->verifyLoop(); + + // The guard blocks were created outside the loop, so they need to become + // members of the parent loop. + if (auto ParentLoop = L->getParentLoop()) { + for (auto G : GuardBlocks) { + ParentLoop->addBasicBlockToLoop(G, LI); + } + ParentLoop->verifyLoop(); + } + +#if defined(EXPENSIVE_CHECKS) + LI.verify(DT); +#endif // EXPENSIVE_CHECKS + + return true; +} + +bool UnifyLoopExits::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName() + << "\n"); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + bool Changed = false; + auto Loops = LI.getLoopsInPreorder(); + for (auto L : Loops) { + LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: " + << LI.getLoopDepth(L->getHeader()) << ")\n"); + Changed |= unifyLoopExits(DT, LI, L); + } + return Changed; +} diff --git a/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp b/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp new file mode 100644 index 0000000000000..5b58548e54dc1 --- /dev/null +++ b/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp @@ -0,0 +1,97 @@ +//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements unique naming of internal linkage symbols with option +// -funique-internal-linkage-symbols. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/MD5.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +static bool uniqueifyInternalLinkageNames(Module &M) { + llvm::MD5 Md5; + Md5.update(M.getSourceFileName()); + llvm::MD5::MD5Result R; + Md5.final(R); + SmallString<32> Str; + llvm::MD5::stringifyResult(R, Str); + std::string ModuleNameHash = (Twine(".") + Twine(Str)).str(); + bool Changed = false; + + // Append the module hash to all internal linkage functions. + for (auto &F : M) { + if (F.hasInternalLinkage()) { + F.setName(F.getName() + ModuleNameHash); + Changed = true; + } + } + + // Append the module hash to all internal linkage globals. 
+ for (auto &GV : M.globals()) { + if (GV.hasInternalLinkage()) { + GV.setName(GV.getName() + ModuleNameHash); + Changed = true; + } + } + return Changed; +} + +namespace { + +// Legacy pass that provides a name to every anon globals. +class UniqueInternalLinkageNamesLegacyPass : public ModulePass { + +public: + /// Pass identification, replacement for typeid + static char ID; + + /// Specify pass name for debug output + StringRef getPassName() const override { + return "Unique Internal Linkage Names"; + } + + explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) { + initializeUniqueInternalLinkageNamesLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + return uniqueifyInternalLinkageNames(M); + } +}; + +char UniqueInternalLinkageNamesLegacyPass::ID = 0; +} // anonymous namespace + +PreservedAnalyses +UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) { + if (!uniqueifyInternalLinkageNames(M)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass, + "unique-internal-linkage-names", + "Uniqueify internal linkage names", false, false) +INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass, + "unique-internal-linkage-names", + "Uniqueify Internal linkage names", false, false) + +namespace llvm { +ModulePass *createUniqueInternalLinkageNamesPass() { + return new UniqueInternalLinkageNamesLegacyPass(); +} +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp index 7769c7493cdab..ce98a739bea88 100644 --- a/llvm/lib/Transforms/Utils/Utils.cpp +++ b/llvm/lib/Transforms/Utils/Utils.cpp @@ -24,8 +24,11 @@ using namespace llvm; /// library. void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeAddDiscriminatorsLegacyPassPass(Registry); + initializeAssumeSimplifyPassLegacyPassPass(Registry); + initializeAssumeBuilderPassLegacyPassPass(Registry); initializeBreakCriticalEdgesPass(Registry); initializeCanonicalizeAliasesLegacyPassPass(Registry); + initializeCanonicalizeFreezeInLoopsPass(Registry); initializeInstNamerPass(Registry); initializeLCSSAWrapperPassPass(Registry); initializeLibCallsShrinkWrapLegacyPassPass(Registry); @@ -40,6 +43,9 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeStripGCRelocatesPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); initializeInjectTLIMappingsLegacyPass(Registry); + initializeFixIrreduciblePass(Registry); + initializeUnifyLoopExitsPass(Registry); + initializeUniqueInternalLinkageNamesLegacyPassPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. 
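The newly registered UnifyLoopExits transform is an ordinary legacy FunctionPass, so it can be driven directly; a minimal sketch, assuming createUnifyLoopExitsPass() is declared in llvm/Transforms/Utils.h alongside the other utility pass creators (the driver function itself is hypothetical). The same transform is reachable from opt as -unify-loop-exits, per the registration string above.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils.h"

using namespace llvm;

// Hypothetical driver, not part of this commit: funnel the exits of every
// multi-exit natural loop in M through a single block. The pass's LowerSwitch
// requirement (addRequiredID(LowerSwitchID) above) is resolved by the legacy
// pass manager's usual required-pass scheduling.
static void runUnifyLoopExits(Module &M) {
  legacy::PassManager PM;
  PM.add(createUnifyLoopExitsPass());
  PM.run(M);
}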
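For UniqueInternalLinkageNames, the uniquing suffix is nothing more than an MD5 digest of the module's source file name. A standalone sketch of that computation, using the same llvm::MD5 calls as uniqueifyInternalLinkageNames() above (the helper itself is hypothetical):

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/MD5.h"
#include <string>

// Hypothetical helper mirroring the pass: hash the source file name once and
// append ".<hex digest>" to an internal-linkage symbol name.
static std::string withModuleHash(llvm::StringRef Symbol,
                                  llvm::StringRef SourceFileName) {
  llvm::MD5 Hash;
  Hash.update(SourceFileName);
  llvm::MD5::MD5Result Result;
  Hash.final(Result);
  llvm::SmallString<32> Digest;
  llvm::MD5::stringifyResult(Result, Digest);
  return (llvm::Twine(Symbol) + "." + Digest).str();
}

// Two internal-linkage functions both named "helper" in different translation
// units end up with distinct names, e.g. "helper.<hash of a.cpp>" vs
// "helper.<hash of b.cpp>".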
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp index 591e1fd2dbee1..6ff08cd287124 100644 --- a/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -1,16 +1,18 @@ #include "llvm/Transforms/Utils/VNCoercion.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "vncoerce" + namespace llvm { namespace VNCoercion { +static bool isFirstClassAggregateOrScalableType(Type *Ty) { + return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty); +} + /// Return true if coerceAvailableValueToLoadType will succeed. bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, const DataLayout &DL) { @@ -18,20 +20,20 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, if (StoredTy == LoadTy) return true; - // If the loaded or stored value is an first class array or struct, don't try - // to transform them. We need to be able to bitcast to integer. - if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() || - StoredTy->isArrayTy()) + // If the loaded/stored value is a first class array/struct, or scalable type, + // don't try to transform them. We need to be able to bitcast to integer. + if (isFirstClassAggregateOrScalableType(LoadTy) || + isFirstClassAggregateOrScalableType(StoredTy)) return false; - uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy); + uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize(); // The store size must be byte-aligned to support future type casts. if (llvm::alignTo(StoreSize, 8) != StoreSize) return false; // The store has to be at least as big as the load. - if (StoreSize < DL.getTypeSizeInBits(LoadTy)) + if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize()) return false; // Don't coerce non-integral pointers to integers or vice versa. @@ -55,14 +57,13 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) && "precondition violation - materialization can't fail"); if (auto *C = dyn_cast<Constant>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; + StoredVal = ConstantFoldConstant(C, DL); // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy); - uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy); + uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize(); + uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize(); // If the store and reload are the same size, we can always reuse it. if (StoredValSize == LoadedValSize) { @@ -89,8 +90,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, } if (auto *C = dyn_cast<ConstantExpr>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; + StoredVal = ConstantFoldConstant(C, DL); return StoredVal; } @@ -115,8 +115,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. 
if (DL.isBigEndian()) { - uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) - - DL.getTypeStoreSizeInBits(LoadedTy); + uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() - + DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize(); StoredVal = Helper.CreateLShr( StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt)); } @@ -135,8 +135,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, } if (auto *C = dyn_cast<Constant>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; + StoredVal = ConstantFoldConstant(C, DL); return StoredVal; } @@ -148,7 +147,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy, /// /// If we can't do it, return null. Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, - IRBuilder<> &IRB, const DataLayout &DL) { + IRBuilderBase &IRB, + const DataLayout &DL) { return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL); } @@ -164,9 +164,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, const DataLayout &DL) { - // If the loaded or stored value is a first class array or struct, don't try - // to transform them. We need to be able to bitcast to integer. - if (LoadTy->isStructTy() || LoadTy->isArrayTy()) + // If the loaded/stored value is a first class array/struct, or scalable type, + // don't try to transform them. We need to be able to bitcast to integer. + if (isFirstClassAggregateOrScalableType(LoadTy)) return -1; int64_t StoreOffset = 0, LoadOffset = 0; @@ -184,7 +184,7 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA // must have gotten confused. - uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize(); if ((WriteSizeInBits & 7) | (LoadSize & 7)) return -1; @@ -218,10 +218,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, const DataLayout &DL) { auto *StoredVal = DepSI->getValueOperand(); - - // Cannot handle reading from store of first-class aggregate yet. - if (StoredVal->getType()->isStructTy() || - StoredVal->getType()->isArrayTy()) + + // Cannot handle reading from store of first-class aggregate or scalable type. + if (isFirstClassAggregateOrScalableType(StoredVal->getType())) return -1; // Don't coerce non-integral pointers to integers or vice versa. @@ -235,11 +234,96 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, Value *StorePtr = DepSI->getPointerOperand(); uint64_t StoreSize = - DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); + DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize(); return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize, DL); } +/// Looks at a memory location for a load (specified by MemLocBase, Offs, and +/// Size) and compares it against a load. +/// +/// If the specified load could be safely widened to a larger integer load +/// that is 1) still efficient, 2) safe for the target, and 3) would provide +/// the specified memory location value, then this function returns the size +/// in bytes of the load width to use. If not, this returns zero. 
+static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase, + int64_t MemLocOffs, + unsigned MemLocSize, + const LoadInst *LI) { + // We can only extend simple integer loads. + if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) + return 0; + + // Load widening is hostile to ThreadSanitizer: it may cause false positives + // or make the reports more cryptic (access sizes are wrong). + if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread)) + return 0; + + const DataLayout &DL = LI->getModule()->getDataLayout(); + + // Get the base of this load. + int64_t LIOffs = 0; + const Value *LIBase = + GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL); + + // If the two pointers are not based on the same pointer, we can't tell that + // they are related. + if (LIBase != MemLocBase) + return 0; + + // Okay, the two values are based on the same pointer, but returned as + // no-alias. This happens when we have things like two byte loads at "P+1" + // and "P+3". Check to see if increasing the size of the "LI" load up to its + // alignment (or the largest native integer type) will allow us to load all + // the bits required by MemLoc. + + // If MemLoc is before LI, then no widening of LI will help us out. + if (MemLocOffs < LIOffs) + return 0; + + // Get the alignment of the load in bytes. We assume that it is safe to load + // any legal integer up to this size without a problem. For example, if we're + // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can + // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it + // to i16. + unsigned LoadAlign = LI->getAlignment(); + + int64_t MemLocEnd = MemLocOffs + MemLocSize; + + // If no amount of rounding up will let MemLoc fit into LI, then bail out. + if (LIOffs + LoadAlign < MemLocEnd) + return 0; + + // This is the size of the load to try. Start with the next larger power of + // two. + unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U; + NewLoadByteSize = NextPowerOf2(NewLoadByteSize); + + while (true) { + // If this load size is bigger than our known alignment or would not fit + // into a native integer register, then we fail. + if (NewLoadByteSize > LoadAlign || + !DL.fitsInLegalInteger(NewLoadByteSize * 8)) + return 0; + + if (LIOffs + NewLoadByteSize > MemLocEnd && + (LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeAddress) || + LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeHWAddress))) + // We will be reading past the location accessed by the original program. + // While this is safe in a regular build, Address Safety analysis tools + // may start reporting false warnings. So, don't do widening. + return 0; + + // If a load of this width would include all of MemLoc, then we succeed. + if (LIOffs + NewLoadByteSize >= MemLocEnd) + return NewLoadByteSize; + + NewLoadByteSize <<= 1; + } +} + /// This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. 
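The widening rule in getLoadLoadClobberFullWidthSize() above reduces to power-of-two arithmetic over offsets and alignment. A stripped-down sketch of just that arithmetic, with the IR plumbing and sanitizer checks omitted (function and parameter names are illustrative):

#include <cstdint>

// Returns the number of bytes to widen an existing load to so that it also
// covers [MemLocOffs, MemLocOffs + MemLocSize), or 0 if no acceptable width
// exists. LIOffs/LoadBytes/LoadAlign describe the existing load;
// MaxLegalBytes stands in for the DL.fitsInLegalInteger() check.
static unsigned widenedLoadBytes(int64_t LIOffs, unsigned LoadBytes,
                                 unsigned LoadAlign, int64_t MemLocOffs,
                                 unsigned MemLocSize, unsigned MaxLegalBytes) {
  if (MemLocOffs < LIOffs)
    return 0; // the location starts before the load; widening cannot help
  int64_t MemLocEnd = MemLocOffs + MemLocSize;
  if (LIOffs + LoadAlign < MemLocEnd)
    return 0; // even the largest aligned widening cannot reach the location

  // Start with the next power of two above the current load size.
  unsigned NewSize = 1;
  while (NewSize <= LoadBytes)
    NewSize <<= 1;

  for (;;) {
    if (NewSize > LoadAlign || NewSize > MaxLegalBytes)
      return 0; // exceeds the known alignment or a legal integer width
    if (LIOffs + NewSize >= MemLocEnd)
      return NewSize; // this width covers the clobbering location
    NewSize <<= 1;
  }
}

// Example: an i8 load at offset 0 that is known 4-byte aligned can be widened
// to a 4-byte load to also provide a 2-byte location at offsets 2..3.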
@@ -255,7 +339,7 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI, return -1; Value *DepPtr = DepLI->getPointerOperand(); - uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()); + uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize(); int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL); if (R != -1) return R; @@ -265,10 +349,10 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI, int64_t LoadOffs = 0; const Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize(); - unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize( - LoadBase, LoadOffs, LoadSize, DepLI); + unsigned Size = + getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI); if (Size == 0) return -1; @@ -319,21 +403,17 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (Offset == -1) return Offset; - // Don't coerce non-integral pointers to integers or vice versa, and the - // memtransfer is implicitly a raw byte code - if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) - // TODO: Can allow nullptrs from constant zeros - return -1; - unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. - Src = - ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS)); - Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, - OffsetCst); + if (Offset) { + Src = ConstantExpr::getBitCast(Src, + Type::getInt8PtrTy(Src->getContext(), AS)); + Constant *OffsetCst = + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), + Src, OffsetCst); + } Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL)) return Offset; @@ -355,8 +435,9 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, return SrcVal; } - uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; - uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; + uint64_t StoreSize = + (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8; + uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8; // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPtrOrPtrVectorTy()) @@ -408,8 +489,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, const DataLayout &DL) { // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. 
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType()); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy); + unsigned SrcValStoreSize = + DL.getTypeStoreSize(SrcVal->getType()).getFixedSize(); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize(); if (Offset + LoadSize > SrcValStoreSize) { assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); @@ -431,7 +513,7 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal); NewLoad->takeName(SrcVal); - NewLoad->setAlignment(MaybeAlign(SrcVal->getAlignment())); + NewLoad->setAlignment(SrcVal->getAlign()); LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); @@ -452,8 +534,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset, Type *LoadTy, const DataLayout &DL) { - unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType()); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy); + unsigned SrcValStoreSize = + DL.getTypeStoreSize(SrcVal->getType()).getFixedSize(); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize(); if (Offset + LoadSize > SrcValStoreSize) return nullptr; return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL); @@ -464,7 +547,7 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, HelperClass &Helper, const DataLayout &DL) { LLVMContext &Ctx = LoadTy->getContext(); - uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8; + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8; // We know that this method is only called when the mem transfer fully // provides the bits for the load. @@ -500,16 +583,18 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset, // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); - unsigned AS = Src->getType()->getPointerAddressSpace(); + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. 
- Src = - ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS)); - Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, - OffsetCst); + if (Offset) { + Src = ConstantExpr::getBitCast(Src, + Type::getInt8PtrTy(Src->getContext(), AS)); + Constant *OffsetCst = + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), + Src, OffsetCst); + } Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL); } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index da68d3713b404..f1b3fe8e2fa9a 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -21,7 +21,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -369,7 +368,8 @@ Value *Mapper::mapValue(const Value *V) { if (NewTy != IA->getFunctionType()) V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(), - IA->hasSideEffects(), IA->isAlignStack()); + IA->hasSideEffects(), IA->isAlignStack(), + IA->getDialect()); } return getVM()[V] = const_cast<Value *>(V); @@ -888,17 +888,17 @@ void Mapper::remapInstruction(Instruction *I) { return; // If the instruction's type is being remapped, do so now. - if (auto CS = CallSite(I)) { + if (auto *CB = dyn_cast<CallBase>(I)) { SmallVector<Type *, 3> Tys; - FunctionType *FTy = CS.getFunctionType(); + FunctionType *FTy = CB->getFunctionType(); Tys.reserve(FTy->getNumParams()); for (Type *Ty : FTy->params()) Tys.push_back(TypeMapper->remapType(Ty)); - CS.mutateFunctionType(FunctionType::get( + CB->mutateFunctionType(FunctionType::get( TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg())); - LLVMContext &C = CS->getContext(); - AttributeList Attrs = CS.getAttributes(); + LLVMContext &C = CB->getContext(); + AttributeList Attrs = CB->getAttributes(); for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) { if (Attrs.hasAttribute(i, Attribute::ByVal)) { Type *Ty = Attrs.getAttribute(i, Attribute::ByVal).getValueAsType(); @@ -910,7 +910,7 @@ void Mapper::remapInstruction(Instruction *I) { C, i, Attribute::getWithByValType(C, TypeMapper->remapType(Ty))); } } - CS.setAttributes(Attrs); + CB->setAttributes(Attrs); return; } if (auto *AI = dyn_cast<AllocaInst>(I)) |
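The ValueMapper hunk above is part of the CallSite-to-CallBase migration: code that previously wrapped an Instruction in a CallSite now dyn_casts to CallBase, which covers call, invoke, and callbr uniformly. A hedged sketch of the new idiom (the visitor function is hypothetical):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical visitor showing the replacement for the old
// `if (auto CS = CallSite(I))` pattern used by remapInstruction().
static void visitIfCall(Instruction *I) {
  if (auto *CB = dyn_cast<CallBase>(I)) {
    FunctionType *FTy = CB->getFunctionType(); // was CS.getFunctionType()
    AttributeList Attrs = CB->getAttributes(); // was CS.getAttributes()
    (void)FTy;
    (void)Attrs;
    // ... inspect or mutate the call through CB ...
  }
}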