author     Dimitry Andric <dim@FreeBSD.org>   2017-06-16 21:03:24 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2017-06-16 21:03:24 +0000
commit     7c7aba6e5fef47a01a136be655b0a92cfd7090f6 (patch)
tree       99ec531924f6078534b100ab9d7696abce848099 /lib/Transforms
parent     7ab83427af0f77b59941ceba41d509d7d097b065 (diff)
Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/IPO/CrossDSOCFI.cpp                        |  11
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp                            |  53
-rw-r--r--  lib/Transforms/IPO/LowerTypeTests.cpp                     | 166
-rw-r--r--  lib/Transforms/IPO/PartialInlining.cpp                    | 414
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp                 |   6
-rw-r--r--  lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp               |  50
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp        | 113
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp           | 121
-rw-r--r--  lib/Transforms/InstCombine/InstCombineInternal.h          |   9
-rw-r--r--  lib/Transforms/InstCombine/InstCombineShifts.cpp          |  14
-rw-r--r--  lib/Transforms/Instrumentation/CMakeLists.txt             |   1
-rw-r--r--  lib/Transforms/Instrumentation/IndirectCallPromotion.cpp  | 353
-rw-r--r--  lib/Transforms/Instrumentation/InstrProfiling.cpp         |  12
-rw-r--r--  lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp        | 419
-rw-r--r--  lib/Transforms/Instrumentation/SanitizerCoverage.cpp      |   2
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp      |   2
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp                        |   5
-rw-r--r--  lib/Transforms/Scalar/GVNSink.cpp                         |  11
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp              |  24
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp                          |   4
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp         |  71
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp                            |  11
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp                    | 219
-rw-r--r--  lib/Transforms/Utils/PredicateInfo.cpp                    |   3
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp                   |  14
25 files changed, 1342 insertions(+), 766 deletions(-)
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index 1b111de061576..d94aa5da85601 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -95,6 +95,17 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
}
}
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto Func : CfiFunctionsMD->operands()) {
+ assert(Func->getNumOperands() >= 2);
+ for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+ if (ConstantInt *TypeId =
+ extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
LLVMContext &Ctx = M.getContext();
Constant *C = M.getOrInsertFunction(
"__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index c0dfeede05c5a..ad89e40661c67 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -523,40 +523,47 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
if (!Callee || Callee->isDeclaration())
continue;
- // If this call site is dead and it is to a readonly function, we should
- // just delete the call instead of trying to inline it, regardless of
- // size. This happens because IPSCCP propagates the result out of the
- // call and then we're left with the dead call.
- if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {
- DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction()
- << "\n");
- // Update the call graph by deleting the edge from Callee to Caller.
- CG[Caller]->removeCallEdgeFor(CS);
- CS.getInstruction()->eraseFromParent();
- ++NumCallsDeleted;
- } else {
+ Instruction *Instr = CS.getInstruction();
+
+ bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI);
+
+ int InlineHistoryID;
+ if (!IsTriviallyDead) {
// If this call site was obtained by inlining another function, verify
// that the include path for the function did not include the callee
// itself. If so, we'd be recursively inlining the same function,
// which would provide the same callsites, which would cause us to
// infinitely inline.
- int InlineHistoryID = CallSites[CSi].second;
+ InlineHistoryID = CallSites[CSi].second;
if (InlineHistoryID != -1 &&
InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
continue;
+ }
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
+
+      // Bail out unless the inline policy approves this call site; even a
+      // trivially dead call is only deleted below once the policy agrees.
+ if (!shouldInline(CS, GetInlineCost, ORE))
+ continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (IsTriviallyDead) {
+ DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ CG[Caller]->removeCallEdgeFor(CS);
+ Instr->eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
// Get DebugLoc to report. CS will be invalid after Inliner.
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ DebugLoc DLoc = Instr->getDebugLoc();
BasicBlock *Block = CS.getParent();
- // FIXME for new PM: because of the old PM we currently generate ORE and
- // in turn BFI on demand. With the new PM, the ORE dependency should
- // just become a regular analysis dependency.
- OptimizationRemarkEmitter ORE(Caller);
-
- // If the policy determines that we should inline this function,
- // try to do so.
- if (!shouldInline(CS, GetInlineCost, ORE))
- continue;
// Attempt to inline the function.
using namespace ore;
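
The "dead call to a readonly function" situation the reordered comments describe typically comes from IPSCCP: it proves the return value constant, rewrites every use, and leaves the side-effect-free call behind. A minimal sketch of that pattern (plain C++, hypothetical names):

    // A readonly callee whose result IPSCCP can prove to be constant.
    __attribute__((const)) static int answer() { return 42; }

    int caller() {
      int x = answer(); // IPSCCP replaces all uses of x with 42, leaving
      return x + 1;     // answer() as a trivially dead call; after this
    }                   // patch it is deleted only if shouldInline() agrees.
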
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 90896d285f5af..b406c22c69d7a 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -206,17 +207,26 @@ struct ByteArrayInfo {
class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
GlobalObject *GO;
size_t NTypes;
+ // For functions: true if this is a definition (either in the merged module or
+ // in one of the thinlto modules).
+ bool IsDefinition;
+ // For functions: true if this function is either defined or used in a thinlto
+ // module and its jumptable entry needs to be exported to thinlto backends.
+ bool IsExported;
friend TrailingObjects;
size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
public:
static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ bool IsDefinition, bool IsExported,
ArrayRef<MDNode *> Types) {
auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
GTM->GO = GO;
GTM->NTypes = Types.size();
+ GTM->IsDefinition = IsDefinition;
+ GTM->IsExported = IsExported;
std::uninitialized_copy(Types.begin(), Types.end(),
GTM->getTrailingObjects<MDNode *>());
return GTM;
@@ -224,6 +234,12 @@ public:
GlobalObject *getGlobal() const {
return GO;
}
+ bool isDefinition() const {
+ return IsDefinition;
+ }
+ bool isExported() const {
+ return IsExported;
+ }
ArrayRef<MDNode *> types() const {
return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
}
@@ -294,6 +310,7 @@ class LowerTypeTestsModule {
void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
TypeIdLowering importTypeId(StringRef TypeId);
void importTypeTest(CallInst *CI);
+ void importFunction(Function *F, bool isDefinition);
BitSetInfo
buildBitSet(Metadata *TypeId,
@@ -820,6 +837,41 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
CI->eraseFromParent();
}
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isDefinition describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ // Declaration of a local function - nothing to do.
+ if (F->isDeclarationForLinker() && isDefinition)
+ return;
+
+ GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+ std::string Name = F->getName();
+ Function *FDecl;
+
+ if (F->isDeclarationForLinker() && !isDefinition) {
+ // Declaration of an external function.
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name + ".cfi_jt", &M);
+ FDecl->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ // Definition.
+ assert(isDefinition);
+ F->setName(Name + ".cfi");
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ F->setVisibility(GlobalValue::HiddenVisibility);
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name, &M);
+ FDecl->setVisibility(Visibility);
+ }
+
+ if (F->isWeakForLinker())
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+ else
+ F->replaceAllUsesWith(FDecl);
+}
+
void LowerTypeTestsModule::lowerTypeTestCalls(
ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
@@ -1143,7 +1195,6 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
// arithmetic that we normally use for globals.
// FIXME: find a better way to represent the jumptable in the IR.
-
assert(!Functions.empty());
// Build a simple layout based on the regular layout of jump tables.
@@ -1167,6 +1218,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
// references to the original functions with references to the aliases.
for (unsigned I = 0; I != Functions.size(); ++I) {
Function *F = cast<Function>(Functions[I]->getGlobal());
+ bool IsDefinition = Functions[I]->isDefinition();
Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
ConstantExpr::getInBoundsGetElementPtr(
@@ -1174,7 +1226,18 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
ConstantInt::get(IntPtrTy, I)}),
F->getType());
- if (F->isDeclarationForLinker()) {
+ if (Functions[I]->isExported()) {
+ if (IsDefinition) {
+ ExportSummary->cfiFunctionDefs().insert(F->getName());
+ } else {
+ GlobalAlias *JtAlias = GlobalAlias::create(
+ F->getValueType(), 0, GlobalValue::ExternalLinkage,
+ F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+ JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ ExportSummary->cfiFunctionDecls().insert(F->getName());
+ }
+ }
+ if (!IsDefinition) {
if (F->isWeakForLinker())
replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
else
@@ -1182,9 +1245,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
} else {
assert(F->getType()->getAddressSpace() == 0);
- GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0,
- F->getLinkage(), "",
- CombinedGlobalElemPtr, &M);
+ GlobalAlias *FAlias = GlobalAlias::create(
+ F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
FAlias->setVisibility(F->getVisibility());
FAlias->takeName(F);
if (FAlias->hasName())
@@ -1353,15 +1415,37 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary)
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
+ !ImportSummary)
return false;
if (ImportSummary) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- importTypeTest(CI);
+ if (TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ }
+
+ SmallVector<Function *, 8> Defs;
+ SmallVector<Function *, 8> Decls;
+ for (auto &F : M) {
+      // CFI functions are either external or promoted. A local function may
+ // have the same name, but it's not the one we are looking for.
+ if (F.hasLocalLinkage())
+ continue;
+ if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+ Defs.push_back(&F);
+ else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+ Decls.push_back(&F);
}
+
+ for (auto F : Defs)
+ importFunction(F, /*isDefinition*/ true);
+ for (auto F : Decls)
+ importFunction(F, /*isDefinition*/ false);
+
return true;
}
@@ -1387,6 +1471,58 @@ bool LowerTypeTestsModule::lower() {
llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
unsigned I = 0;
SmallVector<MDNode *, 2> Types;
+
+ struct ExportedFunctionInfo {
+ CfiFunctionLinkage Linkage;
+ MDNode *FuncMD; // {name, linkage, type[, type...]}
+ };
+ DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
+ if (ExportSummary) {
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto FuncMD : CfiFunctionsMD->operands()) {
+ assert(FuncMD->getNumOperands() >= 2);
+ StringRef FunctionName =
+ cast<MDString>(FuncMD->getOperand(0))->getString();
+ if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName))))
+ continue;
+ CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+ cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+ if (!P.second && P.first->second.Linkage != CFL_Definition)
+ P.first->second = {Linkage, FuncMD};
+ }
+
+ for (const auto &P : ExportedFunctions) {
+ StringRef FunctionName = P.first;
+ CfiFunctionLinkage Linkage = P.second.Linkage;
+ MDNode *FuncMD = P.second.FuncMD;
+ Function *F = M.getFunction(FunctionName);
+ if (!F)
+ F = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalVariable::ExternalLinkage, FunctionName, &M);
+
+ if (Linkage == CFL_Definition)
+ F->eraseMetadata(LLVMContext::MD_type);
+
+ if (F->isDeclaration()) {
+ if (Linkage == CFL_WeakDeclaration)
+ F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+ SmallVector<MDNode *, 2> Types;
+ for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+ F->addMetadata(LLVMContext::MD_type,
+ *cast<MDNode>(FuncMD->getOperand(I).get()));
+ }
+ }
+ }
+ }
+
for (GlobalObject &GO : M.global_objects()) {
if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
continue;
@@ -1396,7 +1532,15 @@ bool LowerTypeTestsModule::lower() {
if (Types.empty())
continue;
- auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
+ bool IsDefinition = !GO.isDeclarationForLinker();
+ bool IsExported = false;
+ if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
+ IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ }
+
+ auto *GTM =
+ GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
for (MDNode *Type : Types) {
verifyTypeMDNode(&GO, Type);
auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
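
Taken together, the export and import sides implement a symbol scheme: in the merged (full-LTO) module the real body of an exported CFI function lives under Name.cfi, the jump table entry for an external function is exported as the hidden alias Name.cfi_jt, and each ThinLTO backend rewrites its references via importFunction(). A compact sketch of those renaming decisions; the ".cfi"/".cfi_jt" suffixes come from the patch, while the helper and struct are hypothetical, for illustration only:

    #include <string>

    struct CfiImportPlan {
      std::string BodySymbol; // where the actual function body ends up
      std::string CallSymbol; // what references get redirected to
    };

    CfiImportPlan planImport(const std::string &Name, bool IsDefinition) {
      if (IsDefinition)
        // Definition: the body is renamed Name.cfi (hidden); a declaration
        // named Name is recreated and resolves to the jump table entry.
        return {Name + ".cfi", Name};
      // External declaration: callers are pointed at the hidden jump table
      // alias Name.cfi_jt exported from the merged module.
      return {"", Name + ".cfi_jt"};
    }
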
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index ea805efc66b79..8840435af6421 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -103,6 +103,35 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
+  // This class speculatively clones the function to be partially inlined.
+ // At the end of partial inlining, the remaining callsites to the cloned
+ // function that are not partially inlined will be fixed up to reference
+ // the original function, and the cloned function will be erased.
+ struct FunctionCloner {
+ FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+ ~FunctionCloner();
+
+ // Prepare for function outlining: making sure there is only
+ // one incoming edge from the extracted/outlined region to
+ // the return block.
+ void NormalizeReturnBlock();
+
+ // Do function outlining:
+ Function *doFunctionOutlining();
+
+ Function *OrigFunc = nullptr;
+ Function *ClonedFunc = nullptr;
+ Function *OutlinedFunc = nullptr;
+ BasicBlock *OutliningCallBB = nullptr;
+ // ClonedFunc is inlined in one of its callers after function
+ // outlining.
+ bool IsFunctionInlined = false;
+ // The cost of the region to be outlined.
+ int OutlinedRegionCost = 0;
+ std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+ };
+
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -114,27 +143,18 @@ private:
// The result is no larger than 1 and is represented using BP.
// (Note that the outlined region's 'head' block can only have incoming
// edges from the guarding entry blocks).
- BranchProbability getOutliningCallBBRelativeFreq(Function *F,
- FunctionOutliningInfo *OI,
- Function *DuplicateFunction,
- BlockFrequencyInfo *BFI,
- BasicBlock *OutliningCallBB);
+ BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
// Return true if the callee of CS should be partially inlined with
// profit.
- bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
- BlockFrequencyInfo *CalleeBFI,
- BasicBlock *OutliningCallBB,
- int OutliningCallOverhead,
+ bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+ BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE);
// Try to inline DuplicateFunction (cloned from F with call to
// the OutlinedFunction into its callers. Return true
// if there is any successful inlining.
- bool tryPartialInline(Function *DuplicateFunction,
- Function *F, /*orignal function */
- FunctionOutliningInfo *OI, Function *OutlinedFunction,
- BlockFrequencyInfo *CalleeBFI);
+ bool tryPartialInline(FunctionCloner &Cloner);
  // Compute the mapping from each use site of DuplicateFunction to the enclosing
// BB's profile count.
@@ -146,7 +166,7 @@ private:
NumPartialInlining >= MaxNumPartialInlining);
}
- CallSite getCallSite(User *U) {
+ static CallSite getCallSite(User *U) {
CallSite CS;
if (CallInst *CI = dyn_cast<CallInst>(U))
CS = CallSite(CI);
@@ -157,7 +177,7 @@ private:
return CS;
}
- CallSite getOneCallSiteTo(Function *F) {
+ static CallSite getOneCallSiteTo(Function *F) {
User *User = *F->user_begin();
return getCallSite(User);
}
@@ -171,20 +191,15 @@ private:
// Returns the costs associated with function outlining:
// - The first value is the non-weighted runtime cost for making the call
- // to the outlined function 'OutlinedFunction', including the addtional
- // setup cost in the outlined function itself;
+  // to the outlined function, including the additional setup cost in the
+ // outlined function itself;
// - The second value is the estimated size of the new call sequence in
- // basic block 'OutliningCallBB';
- // - The third value is the estimated size of the original code from
- // function 'F' that is extracted into the outlined function.
- std::tuple<int, int, int>
- computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
- Function *OutlinedFunction,
- BasicBlock *OutliningCallBB);
+ // basic block Cloner.OutliningCallBB;
+ std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
// Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
// approximate both the size and runtime cost (Note that in the current
// inline cost analysis, there is no clear distinction there either).
- int computeBBInlineCost(BasicBlock *BB);
+ static int computeBBInlineCost(BasicBlock *BB);
std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
@@ -396,19 +411,19 @@ static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
return false;
}
-BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
- Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
- BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+BranchProbability
+PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
auto EntryFreq =
- BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
- auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+ Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+ auto OutliningCallFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
auto OutlineRegionRelFreq =
BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
EntryFreq.getFrequency());
- if (hasProfileData(F, OI))
+ if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
return OutlineRegionRelFreq;
// When profile data is not available, we need to be conservative in
@@ -433,15 +448,17 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
}
bool PartialInlinerImpl::shouldPartialInline(
- CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
- BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
- int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
+ CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE) {
+
using namespace ore;
if (SkipCostAnalysis)
return true;
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
+ assert(Callee == Cloner.ClonedFunc);
+
Function *Caller = CS.getCaller();
auto &CalleeTTI = (*GetTTI)(*Callee);
InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
@@ -449,14 +466,14 @@ bool PartialInlinerImpl::shouldPartialInline(
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", F)
+ << NV("Callee", Cloner.OrigFunc)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", F) << " not partially inlined into "
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
@@ -464,29 +481,25 @@ bool PartialInlinerImpl::shouldPartialInline(
if (!IC) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", F) << " not partially inlined into "
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
const DataLayout &DL = Caller->getParent()->getDataLayout();
+
// The savings of eliminating the call:
int NonWeightedSavings = getCallsiteCost(CS, DL);
BlockFrequency NormWeightedSavings(NonWeightedSavings);
- auto RelativeFreq =
- getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
- auto NormWeightedRcost =
- BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
-
// Weighted saving is smaller than weighted cost, return false
- if (NormWeightedSavings < NormWeightedRcost) {
+ if (NormWeightedSavings < WeightedOutliningRcost) {
ORE.emit(
OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
- << NV("Callee", F) << " not partially inlined into "
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller) << " runtime overhead (overhead="
- << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
<< ", savings="
<< NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
<< " of making the outlined call is too high");
@@ -495,7 +508,7 @@ bool PartialInlinerImpl::shouldPartialInline(
}
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", F) << " can be partially inlined into "
+ << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
@@ -551,50 +564,32 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
return InlineCost;
}
-std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
- Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
- BasicBlock *OutliningCallBB) {
- // First compute the cost of the outlined region 'OI' in the original
- // function 'F'.
- // FIXME: The code extractor (outliner) can now do code sinking/hoisting
- // to reduce outlining cost. The hoisted/sunk code currently do not
- // incur any runtime cost so it is still OK to compare the outlined
- // function cost with the outlined region in the original function.
- // If this ever changes, we will need to introduce new extractor api
- // to pass the information.
- int OutlinedRegionCost = 0;
- for (BasicBlock &BB : *F) {
- if (&BB != OI->ReturnBlock &&
- // Assuming Entry set is small -- do a linear search here:
- std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
- OI->Entries.end()) {
- OutlinedRegionCost += computeBBInlineCost(&BB);
- }
- }
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
// Now compute the cost of the call sequence to the outlined function
// 'OutlinedFunction' in BB 'OutliningCallBB':
- int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+ int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
// Now compute the cost of the extracted/outlined function itself:
int OutlinedFunctionCost = 0;
- for (BasicBlock &BB : *OutlinedFunction) {
+ for (BasicBlock &BB : *Cloner.OutlinedFunc) {
OutlinedFunctionCost += computeBBInlineCost(&BB);
}
- assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
"Outlined function cost should be no less than the outlined region");
// The code extractor introduces a new root and exit stub blocks with
// additional unconditional branches. Those branches will be eliminated
// later with bb layout. The cost should be adjusted accordingly:
OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
- int OutliningRuntimeOverhead = OutliningFuncCallCost +
- (OutlinedFunctionCost - OutlinedRegionCost) +
- ExtraOutliningPenalty;
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
- return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
- OutlinedRegionCost);
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}
// Create the callsite to profile count map which is
@@ -641,42 +636,30 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
}
}
-Function *PartialInlinerImpl::unswitchFunction(Function *F) {
-
- if (F->hasAddressTaken())
- return nullptr;
-
- // Let inliner handle it
- if (F->hasFnAttribute(Attribute::AlwaysInline))
- return nullptr;
-
- if (F->hasFnAttribute(Attribute::NoInline))
- return nullptr;
-
- if (PSI->isFunctionEntryCold(F))
- return nullptr;
-
- if (F->user_begin() == F->user_end())
- return nullptr;
-
- std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
-
- if (!OI)
- return nullptr;
+PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
+ FunctionOutliningInfo *OI)
+ : OrigFunc(F) {
+ ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
- Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
- BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
- DenseSet<BasicBlock *> NewEntries;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
for (BasicBlock *BB : OI->Entries) {
- NewEntries.insert(cast<BasicBlock>(VMap[BB]));
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+ }
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
}
-
// Go ahead and update all uses to the duplicate, so that we can just
// use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(DuplicateFunction);
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
auto getFirstPHI = [](BasicBlock *BB) {
BasicBlock::iterator I = BB->begin();
@@ -692,14 +675,19 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
}
return FirstPhi;
};
+
// Special hackery is needed with PHI nodes that have inputs from more than
// one extracted block. For simplicity, just split the PHIs into a two-level
// sequence of PHIs, some of which will go in the extracted region, and some
// of which will go outside.
- BasicBlock *PreReturn = NewReturnBlock;
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
auto IsTrivialPhi = [](PHINode *PN) -> Value * {
Value *CommonValue = PN->getIncomingValue(0);
if (all_of(PN->incoming_values(),
@@ -708,143 +696,185 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
return nullptr;
};
- if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
-
- NewReturnBlock = NewReturnBlock->splitBasicBlock(
- NewReturnBlock->getFirstNonPHI()->getIterator());
- BasicBlock::iterator I = PreReturn->begin();
- Instruction *Ins = &NewReturnBlock->front();
- SmallVector<Instruction *, 4> DeadPhis;
- while (I != PreReturn->end()) {
- PHINode *OldPhi = dyn_cast<PHINode>(I);
- if (!OldPhi)
- break;
-
- PHINode *RetPhi =
- PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
- OldPhi->replaceAllUsesWith(RetPhi);
- Ins = NewReturnBlock->getFirstNonPHI();
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
- RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OI->ReturnBlockPreds) {
- BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
- RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
- OldPhi->removeIncomingValue(NewE);
- }
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
- // After incoming values splitting, the old phi may become trivial.
- // Keeping the trivial phi can introduce definition inside the outline
- // region which is live-out, causing necessary overhead (load, store
- // arg passing etc).
- if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
- OldPhi->replaceAllUsesWith(OldPhiVal);
- DeadPhis.push_back(OldPhi);
- }
-
- ++I;
+ RetPhi->addIncoming(&*I, PreReturn);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
}
+ // After incoming values splitting, the old phi may become trivial.
+ // Keeping the trivial phi can introduce definition inside the outline
+ // region which is live-out, causing necessary overhead (load, store
+ // arg passing etc).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
+ ++I;
+ }
for (auto *DP : DeadPhis)
DP->eraseFromParent();
- for (auto E : OI->ReturnBlockPreds) {
- BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
- NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
+ for (auto E : ClonedOI->ReturnBlockPreds) {
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}
- }
+}
+Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
// Returns true if the block is to be partial inlined into the caller
// (i.e. not to be extracted to the out of line function)
- auto ToBeInlined = [&](BasicBlock *BB) {
- return BB == NewReturnBlock || NewEntries.count(BB);
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
+ (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+ ClonedOI->Entries.end());
};
+
// Gather up the blocks that we're going to extract.
std::vector<BasicBlock *> ToExtract;
- ToExtract.push_back(NewNonReturnBlock);
- for (BasicBlock &BB : *DuplicateFunction)
- if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
+ OutlinedRegionCost +=
+ PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function which may make the outlining
+ // overhead (the difference of the outlined function cost
+      // and OutlinedRegionCost) look larger.
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
// The CodeExtractor needs a dominator tree.
DominatorTree DT;
- DT.recalculate(*DuplicateFunction);
+ DT.recalculate(*ClonedFunc);
// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*DuplicateFunction, LI);
- BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
// Extract the body of the if.
- Function *OutlinedFunction =
- CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
- .extractCodeRegion();
+ OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI)
+ .extractCodeRegion();
+
+ if (OutlinedFunc) {
+ OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+ .getInstruction()
+ ->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ }
- bool AnyInline =
- tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+ return OutlinedFunc;
+}
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
// Ditch the duplicate, since we're done with it, and rewrite all remaining
// users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove the function that is speculatively created if there is no
+ // reference.
+ if (OutlinedFunc)
+ OutlinedFunc->eraseFromParent();
+ }
+}
+
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+
+ if (F->hasAddressTaken())
+ return nullptr;
+
+ // Let inliner handle it
+ if (F->hasFnAttribute(Attribute::AlwaysInline))
+ return nullptr;
+
+ if (F->hasFnAttribute(Attribute::NoInline))
+ return nullptr;
+
+ if (PSI->isFunctionEntryCold(F))
+ return nullptr;
+
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+ if (!OI)
+ return nullptr;
+
+ FunctionCloner Cloner(F, OI.get());
+ Cloner.NormalizeReturnBlock();
+ Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+ bool AnyInline = tryPartialInline(Cloner);
if (AnyInline)
return OutlinedFunction;
- // Remove the function that is speculatively created:
- if (OutlinedFunction)
- OutlinedFunction->eraseFromParent();
-
return nullptr;
}
-bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
- Function *F,
- FunctionOutliningInfo *OI,
- Function *OutlinedFunction,
- BlockFrequencyInfo *CalleeBFI) {
- if (OutlinedFunction == nullptr)
- return false;
-
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
int NonWeightedRcost;
int SizeCost;
- int OutlinedRegionSizeCost;
- auto OutliningCallBB =
- getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+ if (Cloner.OutlinedFunc == nullptr)
+ return false;
+
+ std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
- std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
- computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+ auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+ auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
  // If the call sequence to the outlined function is larger than the original
  // outlined region size, it does not increase the chances of inlining
- // 'F' with outlining (The inliner usies the size increase to model the
- // the cost of inlining a callee).
- if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
- OptimizationRemarkEmitter ORE(F);
+  // the function with outlining (the inliner uses the size increase to
+ // model the cost of inlining a callee).
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
DebugLoc DLoc;
BasicBlock *Block;
- std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
DLoc, Block)
- << ore::NV("Function", F)
+ << ore::NV("Function", Cloner.OrigFunc)
<< " not partially inlined into callers (Original Size = "
- << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
<< ", Size of call sequence to outlined function = "
<< ore::NV("NewSize", SizeCost) << ")");
return false;
}
- assert(F->user_begin() == F->user_end() &&
+ assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
"F's users should all be replaced!");
- std::vector<User *> Users(DuplicateFunction->user_begin(),
- DuplicateFunction->user_end());
+
+ std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+ Cloner.ClonedFunc->user_end());
DenseMap<User *, uint64_t> CallSiteToProfCountMap;
- if (F->getEntryCount())
- computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+ if (Cloner.OrigFunc->getEntryCount())
+ computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
- auto CalleeEntryCount = F->getEntryCount();
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
bool AnyInline = false;
for (User *User : Users) {
CallSite CS = getCallSite(User);
@@ -854,13 +884,12 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
- NonWeightedRcost, ORE))
+ if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
continue;
ORE.emit(
OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
- << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
<< ore::NV("Caller", CS.getCaller()));
InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
@@ -878,8 +907,11 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
NumPartialInlined++;
}
- if (AnyInline && CalleeEntryCount)
- F->setEntryCount(CalleeEntryCountV);
+ if (AnyInline) {
+ Cloner.IsFunctionInlined = true;
+ if (CalleeEntryCount)
+ Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+ }
return AnyInline;
}
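
At the source level, the transformation the FunctionCloner pipeline performs (clone, NormalizeReturnBlock, doFunctionOutlining, tryPartialInline) looks roughly like the following sketch; the function names are hypothetical:

    // The cold body; stands in for a region too large to inline whole.
    static int expensive(int x) { return x * 37; }

    // Before partial inlining: a cheap hot guard in front of the cold region.
    int foo(int x) {
      if (x <= 0)
        return 0;          // hot early return
      return expensive(x); // cold region
    }

    // After doFunctionOutlining(), conceptually the cold region is extracted:
    //   int foo_outlined(int x) { return expensive(x); }
    // so each profitable call site inlines only the guard:
    //   r = (x <= 0) ? 0 : foo_outlined(x);
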
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 16fba32e98056..4bc64ab698ff9 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -141,6 +141,10 @@ static cl::opt<int> PreInlineThreshold(
cl::desc("Control the amount of inlining in pre-instrumentation inliner "
"(default = 75)"));
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+ "enable-earlycse-memssa", cl::init(false), cl::Hidden,
+ cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)"));
+
static cl::opt<bool> EnableGVNHoist(
"enable-gvn-hoist", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN hoisting pass (default = off)"));
@@ -308,7 +312,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Start of function pass.
// Break up aggregate allocas, using SSAUpdater.
MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies
if (EnableGVNHoist)
MPM.add(createGVNHoistPass());
if (EnableGVNSink) {
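
Because EnableEarlyCSEMemSSA is a cl::opt, the new pipeline behavior can be toggled without a rebuild; assuming the usual cl::opt plumbing, something like "opt -O2 -enable-earlycse-memssa input.ll -S" (or "clang -O2 -mllvm -enable-earlycse-memssa file.c") should exercise the MemorySSA-backed EarlyCSE.
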
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index a7bcc7cc55325..802f470ffe1fb 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -32,7 +32,8 @@ namespace {
// Promote each local-linkage entity defined by ExportM and used by ImportM by
// changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+ SetVector<GlobalValue *> &PromoteExtra) {
DenseMap<const Comdat *, Comdat *> RenamedComdats;
for (auto &ExportGV : ExportM.global_values()) {
if (!ExportGV.hasLocalLinkage())
@@ -40,7 +41,7 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
auto Name = ExportGV.getName();
GlobalValue *ImportGV = ImportM.getNamedValue(Name);
- if (!ImportGV || ImportGV->use_empty())
+ if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
continue;
std::string NewName = (Name + ModuleId).str();
@@ -53,8 +54,10 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
ExportGV.setLinkage(GlobalValue::ExternalLinkage);
ExportGV.setVisibility(GlobalValue::HiddenVisibility);
- ImportGV->setName(NewName);
- ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ if (ImportGV) {
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ }
}
if (!RenamedComdats.empty())
@@ -296,6 +299,11 @@ void splitAndWriteThinLTOBitcode(
F.setComdat(nullptr);
}
+ SetVector<GlobalValue *> CfiFunctions;
+ for (auto &F : M)
+ if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+ CfiFunctions.insert(&F);
+
// Remove all globals with type metadata, globals with comdats that live in
// MergedM, and aliases pointing to such globals from the thin LTO module.
filterModule(&M, [&](const GlobalValue *GV) {
@@ -308,11 +316,39 @@ void splitAndWriteThinLTOBitcode(
return true;
});
- promoteInternals(*MergedM, M, ModuleId);
- promoteInternals(M, *MergedM, ModuleId);
+ promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+ promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+ SmallVector<MDNode *, 8> CfiFunctionMDs;
+ for (auto V : CfiFunctions) {
+ Function &F = *cast<Function>(V);
+ SmallVector<MDNode *, 2> Types;
+ F.getMetadata(LLVMContext::MD_type, Types);
+
+ auto &Ctx = MergedM->getContext();
+ SmallVector<Metadata *, 4> Elts;
+ Elts.push_back(MDString::get(Ctx, F.getName()));
+ CfiFunctionLinkage Linkage;
+ if (!F.isDeclarationForLinker())
+ Linkage = CFL_Definition;
+ else if (F.isWeakForLinker())
+ Linkage = CFL_WeakDeclaration;
+ else
+ Linkage = CFL_Declaration;
+ Elts.push_back(ConstantAsMetadata::get(
+ llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+ for (auto Type : Types)
+ Elts.push_back(Type);
+ CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+ }
- simplifyExternals(*MergedM);
+  if (!CfiFunctionMDs.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+ for (auto MD : CfiFunctionMDs)
+ NMD->addOperand(MD);
+ }
+ simplifyExternals(*MergedM);
// FIXME: Try to re-use BSI and PFI from the original module here.
ProfileSummaryInfo PSI(M);
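
Each "cfi.functions" operand is a tuple {name, linkage, type...}, with the linkage byte chosen exactly as in the hunk above. A self-contained restatement of that classification; the enumerator names come from the patch, while their order here and the standalone bool parameters (stand-ins for isDeclarationForLinker()/isWeakForLinker()) are illustrative assumptions:

    enum CfiFunctionLinkage { CFL_Definition, CFL_Declaration, CFL_WeakDeclaration };

    CfiFunctionLinkage classify(bool IsDeclarationForLinker, bool IsWeakForLinker) {
      if (!IsDeclarationForLinker)
        return CFL_Definition;      // body lives in this module
      if (IsWeakForLinker)
        return CFL_WeakDeclaration; // weak external, may be replaced at link time
      return CFL_Declaration;       // plain external declaration
    }
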
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4fe3225a21722..a881bda5ba98d 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -763,8 +763,54 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
return nullptr;
}
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ Instruction &CxtI) {
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // TODO support vector splats
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero())
+ return nullptr;
+
+ Value *A, *B, *C, *D;
+ if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+ match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+ if (A == D || B == D)
+ std::swap(C, D);
+ if (B == C)
+ std::swap(A, B);
+
+ if (A == C &&
+ isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+ Value *Mask = Builder->CreateOr(B, D);
+ Value *Masked = Builder->CreateAnd(A, Mask);
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ return Builder->CreateICmp(NewPred, Masked, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
/// Fold (icmp)&(icmp) if possible.
-Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+ Instruction &CxtI) {
+ // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // if K1 and K2 are a one-bit mask.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+ return V;
+
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
// (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
@@ -1127,8 +1173,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
if (ICmp0 && ICmp1) {
- Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1)
- : foldOrOfICmps(ICmp0, ICmp1, &I);
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+ : foldOrOfICmps(ICmp0, ICmp1, I);
if (Res)
return CastInst::Create(CastOpcode, Res, DestTy);
return nullptr;
@@ -1426,7 +1472,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = foldAndOfICmps(LHS, RHS))
+ if (Value *Res = foldAndOfICmps(LHS, RHS, I))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -1434,18 +1480,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(LHS, Cmp))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(LHS, Cmp))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(Cmp, RHS))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(Cmp, RHS))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
}
@@ -1591,41 +1637,16 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
/// Fold (icmp)|(icmp) if possible.
Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
- Instruction *CxtI) {
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
+ Instruction &CxtI) {
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
// if K1 and K2 are a one-bit mask.
- ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+ return V;
- if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero() &&
- RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
-
- BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
- BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
- if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
- LAnd->getOpcode() == Instruction::And &&
- RAnd->getOpcode() == Instruction::And) {
-
- Value *Mask = nullptr;
- Value *Masked = nullptr;
- if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) {
- Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
- Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
- } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) {
- Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
- Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
- }
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- if (Masked)
- return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
- }
- }
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
// Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
// --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
@@ -2117,12 +2138,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
}
// (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+ // FIXME: The two hasOneUse calls here are the same call, maybe we were
+ // supposed to check Op1->operand(0)?
if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
return BinaryOperator::CreateOr(Op0, C);
// ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+ // FIXME: The two hasOneUse calls here are the same call, maybe we were
+ // supposed to check Op0->operand(0)?
if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
@@ -2194,7 +2219,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = foldOrOfICmps(LHS, RHS, &I))
+ if (Value *Res = foldOrOfICmps(LHS, RHS, I))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -2202,18 +2227,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
}
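
The new fold rests on an identity over one-bit (power-of-two) masks; it can be sanity-checked exhaustively with a small standalone program (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // For one-bit masks k1, k2 and m = k1 | k2:
    //   ((a & k1) == 0 || (a & k2) == 0)  <=>  (a & m) != m
    //   ((a & k1) != 0 && (a & k2) != 0)  <=>  (a & m) == m
    int main() {
      for (uint32_t a = 0; a < 256; ++a)
        for (int i = 0; i < 8; ++i)
          for (int j = 0; j < 8; ++j) {
            uint32_t k1 = 1u << i, k2 = 1u << j, m = k1 | k2;
            assert((((a & k1) == 0) || ((a & k2) == 0)) == ((a & m) != m));
            assert((((a & k1) != 0) && ((a & k2) != 0)) == ((a & m) == m));
          }
      return 0;
    }
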
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d29ed49eca0b0..c0830a5d21124 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -94,75 +94,80 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
-Instruction *
-InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
+ ElementUnorderedAtomicMemCpyInst *AMI) {
  // Try to unfold this intrinsic into a sequence of explicit atomic loads
  // and stores.
  // First check that the number of elements is a compile-time constant.
- auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
- if (!NumElementsCI)
+ auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
+ if (!LengthCI)
return nullptr;
// Check that there are not too many elements.
- uint64_t NumElements = NumElementsCI->getZExtValue();
+ uint64_t LengthInBytes = LengthCI->getZExtValue();
+ uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
+ uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
return nullptr;
- // Don't unfold into illegal integers
- uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
- if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
- return nullptr;
+ // Only expand if there are elements to copy.
+ if (NumElements > 0) {
+ // Don't unfold into illegal integers
+ uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
+ if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+ return nullptr;
- // Cast source and destination to the correct type. Intrinsic input arguments
- // are usually represented as i8*.
- // Often operands will be explicitly casted to i8* and we can just strip
- // those casts instead of inserting new ones. However it's easier to rely on
- // other InstCombine rules which will cover trivial cases anyway.
- Value *Src = AMI->getRawSource();
- Value *Dst = AMI->getRawDest();
- Type *ElementPointerType = Type::getIntNPtrTy(
- AMI->getContext(), ElementSizeInBytes, Src->getType()->getPointerAddressSpace());
-
- Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
- "memcpy_unfold.src_casted");
- Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
- "memcpy_unfold.dst_casted");
-
- for (uint64_t i = 0; i < NumElements; ++i) {
- // Get current element addresses
- ConstantInt *ElementIdxCI =
- ConstantInt::get(AMI->getContext(), APInt(64, i));
- Value *SrcElementAddr =
- Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
- Value *DstElementAddr =
- Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
- // Load from the source. Transfer alignment information and mark load as
- // unordered atomic.
- LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
- Load->setOrdering(AtomicOrdering::Unordered);
- // We know alignment of the first element. It is also guaranteed by the
- // verifier that element size is less or equal than first element alignment
- // and both of this values are powers of two.
- // This means that all subsequent accesses are at least element size
- // aligned.
- // TODO: We can infer better alignment but there is no evidence that this
- // will matter.
- Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
- : AMI->getElementSizeInBytes());
- Load->setDebugLoc(AMI->getDebugLoc());
-
- // Store loaded value via unordered atomic store.
- StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
- Store->setOrdering(AtomicOrdering::Unordered);
- Store->setAlignment(i == 0 ? AMI->getDstAlignment()
- : AMI->getElementSizeInBytes());
- Store->setDebugLoc(AMI->getDebugLoc());
+ // Cast source and destination to the correct type. Intrinsic input
+ // arguments are usually represented as i8*. Often operands will be
+    // explicitly cast to i8*, and we can just strip those casts instead of
+ // inserting new ones. However it's easier to rely on other InstCombine
+ // rules which will cover trivial cases anyway.
+ Value *Src = AMI->getRawSource();
+ Value *Dst = AMI->getRawDest();
+ Type *ElementPointerType =
+ Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+ Src->getType()->getPointerAddressSpace());
+
+ Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+ "memcpy_unfold.src_casted");
+ Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+ "memcpy_unfold.dst_casted");
+
+ for (uint64_t i = 0; i < NumElements; ++i) {
+ // Get current element addresses
+ ConstantInt *ElementIdxCI =
+ ConstantInt::get(AMI->getContext(), APInt(64, i));
+ Value *SrcElementAddr =
+ Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+ Value *DstElementAddr =
+ Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+ // Load from the source. Transfer alignment information and mark load as
+ // unordered atomic.
+ LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+ Load->setOrdering(AtomicOrdering::Unordered);
+      // We know the alignment of the first element. The verifier also
+      // guarantees that the element size is less than or equal to the first
+      // element's alignment and that both of these values are powers of two.
+      // This means that all subsequent accesses are at least element-size
+      // aligned.
+ // TODO: We can infer better alignment but there is no evidence that this
+ // will matter.
+ Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
+ : ElementSizeInBytes);
+ Load->setDebugLoc(AMI->getDebugLoc());
+
+ // Store loaded value via unordered atomic store.
+ StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+ Store->setOrdering(AtomicOrdering::Unordered);
+ Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
+ : ElementSizeInBytes);
+ Store->setDebugLoc(AMI->getDebugLoc());
+ }
}
   // Set the length of the copy to 0; the instruction will be deleted on the
   // next iteration.
- AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+ AMI->setLength(Constant::getNullValue(LengthCI->getType()));
return AMI;
}
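
To make the unfolding above concrete: a rough C++ model of what the rewrite emits for a 16-byte copy with a 4-byte element size (so NumElements = 16 / 4 = 4). This is an illustration only, not the pass's output; the function name is made up, and the Clang/GCC __atomic_* builtins stand in for the unordered atomic loads and stores created through the IRBuilder:

    #include <cstdint>

    // Hypothetical model: each element becomes one relaxed ("unordered")
    // atomic load paired with one relaxed atomic store, exactly one element
    // wide. The first access may use the intrinsic's pointer alignment; the
    // rest are at least element-size aligned.
    void copyElementWise(uint32_t *Dst, const uint32_t *Src) {
      for (uint64_t I = 0; I < 4; ++I) {                    // NumElements = 4
        uint32_t V = __atomic_load_n(&Src[I], __ATOMIC_RELAXED);
        __atomic_store_n(&Dst[I], V, __ATOMIC_RELAXED);
      }
    }
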
@@ -1888,12 +1893,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
- if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
- if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+ if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
+ if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
if (C->isNullValue())
return eraseInstFromFunction(*AMI);
- if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+ if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
return I;
}
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index fd0a64a5bbb56..1a7db146df426 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -447,12 +447,14 @@ private:
Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2);
- Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
- Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+ Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd, Instruction &CxtI);
public:
/// \brief Inserts an instruction \p New before instruction \p Old
///
@@ -724,7 +726,8 @@ private:
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
- Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
+ Instruction *
+ SimplifyElementUnorderedAtomicMemCpy(ElementUnorderedAtomicMemCpyInst *AMI);
Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
Instruction *SimplifyMemSet(MemSetInst *MI);
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 3f2ddcacce2b8..8cec865c6422a 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -682,11 +682,11 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- if (match(Op0, m_SExt(m_Value(X)))) {
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
// Are we moving the sign bit to the low bit and widening with high zeros?
unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt == BitWidth - 1 &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ if (ShAmt == BitWidth - 1) {
// lshr (sext i1 X to iN), N-1 --> zext X to iN
if (SrcTyBitWidth == 1)
return new ZExtInst(X, Ty);
@@ -698,7 +698,13 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
}
}
- // TODO: Convert to ashr+zext if the shift equals the extension amount.
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
+ if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
+      // The new shift amount can't exceed the narrow source type's width.
+ unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
+ Value *AShr = Builder->CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
+ }
}
if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
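
The new fold is easy to sanity-check on a concrete pair of types. Below is a small self-contained program, assuming M = 8 and N = 32, so ShAmt = 24 and the clamped shift is min(24, 7) = 7; `before` and `after` are hypothetical names for the two shapes:

    #include <cassert>
    #include <cstdint>

    // lshr (sext i8 X to i32), 24  -->  zext (ashr i8 X, 7) to i32
    uint32_t before(int8_t X) { return uint32_t(int32_t(X)) >> 24; }
    uint32_t after(int8_t X)  { return uint32_t(uint8_t(X >> 7)); }

    int main() {
      for (int V = -128; V <= 127; ++V)       // exhaustive over i8
        assert(before(int8_t(V)) == after(int8_t(V)));
    }
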
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 7ff69b9eb7f42..f2806e278e6e1 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMInstrumentation
Instrumentation.cpp
InstrProfiling.cpp
PGOInstrumentation.cpp
+ PGOMemOPSizeOpt.cpp
SanitizerCoverage.cpp
ThreadSanitizer.cpp
EfficiencySanitizer.cpp
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 96027bc3d0a91..0d308810009d5 100644
--- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -56,8 +56,6 @@ using namespace llvm;
STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
-STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
-STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
// Command line option to disable indirect-call promotion with the default as
// false. This is for debug purpose.
@@ -111,44 +109,6 @@ static cl::opt<bool>
ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
cl::desc("Dump IR after transformation happens"));
-// The minimum call count to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
- cl::init(1000),
- cl::desc("The minimum count to optimize memory "
- "intrinsic calls"));
-
-// Command line option to disable memory intrinsic optimization. The default is
-// false. This is for debug purpose.
-static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
- cl::Hidden, cl::desc("Disable optimize"));
-
-// The percent threshold to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("The percentage threshold for the "
- "memory intrinsic calls optimization"));
-
-// Maximum number of versions for optimizing memory intrinsic call.
-static cl::opt<unsigned>
- MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("The max version for the optimized memory "
- " intrinsic calls"));
-
-// Scale the counts from the annotation using the BB count value.
-static cl::opt<bool>
- MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
- cl::desc("Scale the memop size counts using the basic "
- " block count value"));
-
-// This option sets the rangge of precise profile memop sizes.
-extern cl::opt<std::string> MemOPSizeRange;
-
-// This option sets the value that groups large memop sizes
-extern cl::opt<unsigned> MemOPSizeLarge;
-
namespace {
class PGOIndirectCallPromotionLegacyPass : public ModulePass {
public:
@@ -173,24 +133,6 @@ private:
// the promoted direct call.
bool SamplePGO;
};
-
-class PGOMemOPSizeOptLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
- initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOMemOPSize"; }
-
-private:
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
} // end anonymous namespace
char PGOIndirectCallPromotionLegacyPass::ID = 0;
@@ -204,19 +146,6 @@ ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
}
-char PGOMemOPSizeOptLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-
-FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
- return new PGOMemOPSizeOptLegacyPass();
-}
-
namespace {
// The class for main data structure to promote indirect calls to conditional
// direct calls.
@@ -749,285 +678,3 @@ PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
return PreservedAnalyses::none();
}
-
-namespace {
-class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
-public:
- MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
- : Func(Func), BFI(BFI), Changed(false) {
- ValueDataArray =
- llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
- // Get the MemOPSize range information from option MemOPSizeRange,
- getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
- PreciseRangeLast);
- }
- bool isChanged() const { return Changed; }
- void perform() {
- WorkList.clear();
- visit(Func);
-
- for (auto &MI : WorkList) {
- ++NumOfPGOMemOPAnnotate;
- if (perform(MI)) {
- Changed = true;
- ++NumOfPGOMemOPOpt;
- DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
- << "is Transformed.\n");
- }
- }
- }
-
- void visitMemIntrinsic(MemIntrinsic &MI) {
- Value *Length = MI.getLength();
- // Not perform on constant length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
- WorkList.push_back(&MI);
- }
-
-private:
- Function &Func;
- BlockFrequencyInfo &BFI;
- bool Changed;
- std::vector<MemIntrinsic *> WorkList;
- // Start of the previse range.
- int64_t PreciseRangeStart;
- // Last value of the previse range.
- int64_t PreciseRangeLast;
- // The space to read the profile annotation.
- std::unique_ptr<InstrProfValueData[]> ValueDataArray;
- bool perform(MemIntrinsic *MI);
-
- // This kind shows which group the value falls in. For PreciseValue, we have
- // the profile count for that value. LargeGroup groups the values that are in
- // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
- enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
-
- MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
- if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
- return LargeGroup;
- if (Value == PreciseRangeLast + 1)
- return NonLargeGroup;
- return PreciseValue;
- }
-};
-
-static const char *getMIName(const MemIntrinsic *MI) {
- switch (MI->getIntrinsicID()) {
- case Intrinsic::memcpy:
- return "memcpy";
- case Intrinsic::memmove:
- return "memmove";
- case Intrinsic::memset:
- return "memset";
- default:
- return "unknown";
- }
-}
-
-static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
- assert(Count <= TotalCount);
- if (Count < MemOPCountThreshold)
- return false;
- if (Count < TotalCount * MemOPPercentThreshold / 100)
- return false;
- return true;
-}
-
-static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
- uint64_t Denom) {
- if (!MemOPScaleCount)
- return Count;
- bool Overflowed;
- uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
- return ScaleCount / Denom;
-}
-
-bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
- assert(MI);
- if (MI->getIntrinsicID() == Intrinsic::memmove)
- return false;
-
- uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
- uint64_t TotalCount;
- if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
- ValueDataArray.get(), NumVals, TotalCount))
- return false;
-
- uint64_t ActualCount = TotalCount;
- uint64_t SavedTotalCount = TotalCount;
- if (MemOPScaleCount) {
- auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
- if (!BBEdgeCount)
- return false;
- ActualCount = *BBEdgeCount;
- }
-
- ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
- DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
- << "\n");
- DEBUG(
- for (auto &VD
- : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
-
- if (ActualCount < MemOPCountThreshold)
- return false;
- // Skip if the total value profiled count is 0, in which case we can't
- // scale up the counts properly (and there is no profitable transformation).
- if (TotalCount == 0)
- return false;
-
- TotalCount = ActualCount;
- if (MemOPScaleCount)
- DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
- << " denominator = " << SavedTotalCount << "\n");
-
- // Keeping track of the count of the default case:
- uint64_t RemainCount = TotalCount;
- SmallVector<uint64_t, 16> SizeIds;
- SmallVector<uint64_t, 16> CaseCounts;
- uint64_t MaxCount = 0;
- unsigned Version = 0;
- // Default case is in the front -- save the slot here.
- CaseCounts.push_back(0);
- for (auto &VD : VDs) {
- int64_t V = VD.Value;
- uint64_t C = VD.Count;
- if (MemOPScaleCount)
- C = getScaledCount(C, ActualCount, SavedTotalCount);
-
- // Only care precise value here.
- if (getMemOPSizeKind(V) != PreciseValue)
- continue;
-
- // ValueCounts are sorted on the count. Break at the first un-profitable
- // value.
- if (!isProfitable(C, RemainCount))
- break;
-
- SizeIds.push_back(V);
- CaseCounts.push_back(C);
- if (C > MaxCount)
- MaxCount = C;
-
- assert(RemainCount >= C);
- RemainCount -= C;
-
- if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
- break;
- }
-
- if (Version == 0)
- return false;
-
- CaseCounts[0] = RemainCount;
- if (RemainCount > MaxCount)
- MaxCount = RemainCount;
-
- uint64_t SumForOpt = TotalCount - RemainCount;
-
- DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
- << " Versions (covering " << SumForOpt << " out of "
- << TotalCount << ")\n");
-
- // mem_op(..., size)
- // ==>
- // switch (size) {
- // case s1:
- // mem_op(..., s1);
- // goto merge_bb;
- // case s2:
- // mem_op(..., s2);
- // goto merge_bb;
- // ...
- // default:
- // mem_op(..., size);
- // goto merge_bb;
- // }
- // merge_bb:
-
- BasicBlock *BB = MI->getParent();
- DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
- DEBUG(dbgs() << *BB << "\n");
- auto OrigBBFreq = BFI.getBlockFreq(BB);
-
- BasicBlock *DefaultBB = SplitBlock(BB, MI);
- BasicBlock::iterator It(*MI);
- ++It;
- assert(It != DefaultBB->end());
- BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
- MergeBB->setName("MemOP.Merge");
- BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
- DefaultBB->setName("MemOP.Default");
-
- auto &Ctx = Func.getContext();
- IRBuilder<> IRB(BB);
- BB->getTerminator()->eraseFromParent();
- Value *SizeVar = MI->getLength();
- SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
-
- // Clear the value profile data.
- MI->setMetadata(LLVMContext::MD_prof, nullptr);
-
- DEBUG(dbgs() << "\n\n== Basic Block After==\n");
-
- for (uint64_t SizeId : SizeIds) {
- ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
- BasicBlock *CaseBB = BasicBlock::Create(
- Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
- Instruction *NewInst = MI->clone();
- // Fix the argument.
- dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
- CaseBB->getInstList().push_back(NewInst);
- IRBuilder<> IRBCase(CaseBB);
- IRBCase.CreateBr(MergeBB);
- SI->addCase(CaseSizeId, CaseBB);
- DEBUG(dbgs() << *CaseBB << "\n");
- }
- setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
-
- DEBUG(dbgs() << *BB << "\n");
- DEBUG(dbgs() << *DefaultBB << "\n");
- DEBUG(dbgs() << *MergeBB << "\n");
-
- emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
- MI->getDebugLoc(),
- Twine("optimize ") + getMIName(MI) + " with count " +
- Twine(SumForOpt) + " out of " + Twine(TotalCount) +
- " for " + Twine(Version) + " versions");
-
- return true;
-}
-} // namespace
-
-static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
- if (DisableMemOPOPT)
- return false;
-
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
- return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI);
- MemOPSizeOpt.perform();
- return MemOPSizeOpt.isChanged();
-}
-
-bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- return PGOMemOPSizeOptImpl(F, BFI);
-}
-
-namespace llvm {
-char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
-
-PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
- FunctionAnalysisManager &FAM) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI);
- if (!Changed)
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f83c930ca61be..37f88d5f95f18 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -343,14 +343,24 @@ static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
static inline bool shouldRecordFunctionAddr(Function *F) {
// Check the linkage
+ bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
- !F->hasAvailableExternallyLinkage())
+ !HasAvailableExternallyLinkage)
return true;
+
+ // A function marked 'alwaysinline' with available_externally linkage can't
+ // have its address taken. Doing so would create an undefined external ref to
+ // the function, which would fail to link.
+ if (HasAvailableExternallyLinkage &&
+ F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
// Prohibit function address recording if the function is both internal and
// COMDAT. This avoids the profile data variable referencing internal symbols
// in COMDAT.
if (F->hasLocalLinkage() && F->hasComdat())
return false;
+
// Check uses of this function for other than direct calls or invokes to it.
  // Inline virtual functions have linkonce_odr linkage. When a key method
// exists, the vtable will only be emitted in the TU where the key method
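
A condensed sketch of the decision order this hunk establishes, in hypothetical free-function form, with the trailing use-based checks from the rest of the function omitted:

    bool shouldRecordAddr(bool LinkOnce, bool Local, bool AvailExternally,
                          bool AlwaysInline, bool HasComdat) {
      if (!LinkOnce && !Local && !AvailExternally)
        return true;   // strong definitions are always safe to reference
      if (AvailExternally && AlwaysInline)
        return false;  // the body is never emitted; its address can't be taken
      if (Local && HasComdat)
        return false;  // avoid internal references from COMDAT profile data
      return true;     // remaining use-based checks not modeled here
    }
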
diff --git a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
new file mode 100644
index 0000000000000..0bc9ddfbe4d33
--- /dev/null
+++ b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -0,0 +1,419 @@
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default
+// is false. This is for debugging purposes.
+static cl::opt<bool>
+    DisableMemOPOPT("disable-memop-opt", cl::init(false), cl::Hidden,
+                    cl::desc("Disable the memory intrinsic optimization"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The max number of versions for the optimized "
+                             "memory intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             "block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+ : Func(Func), BFI(BFI), Changed(false) {
+ ValueDataArray =
+ llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from the MemOPSizeRange option.
+ getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+ PreciseRangeLast);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MI : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MI)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+    // Do not perform the transformation on constant-length calls.
+    if (isa<ConstantInt>(Length))
+ return;
+ WorkList.push_back(&MI);
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ bool Changed;
+ std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+ int64_t PreciseRangeLast;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemIntrinsic *MI);
+
+ // This kind shows which group the value falls in. For PreciseValue, we have
+ // the profile count for that value. LargeGroup groups the values that are in
+  // range [LargeValue, +inf). NonLargeGroup groups the rest of the values.
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+ MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+ if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+ return LargeGroup;
+ if (Value == PreciseRangeLast + 1)
+ return NonLargeGroup;
+ return PreciseValue;
+ }
+};
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
+
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
+
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+ assert(MI);
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
+ << "\n");
+  DEBUG(for (auto &VD : VDs) {
+    dbgs() << " (" << VD.Value << "," << VD.Count << ")\n";
+  });
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+ // Skip if the total value profiled count is 0, in which case we can't
+ // scale up the counts properly (and there is no profitable transformation).
+ if (TotalCount == 0)
+ return false;
+
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+ // Keeping track of the count of the default case:
+ uint64_t RemainCount = TotalCount;
+ uint64_t SavedRemainCount = SavedTotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+    // We only care about precise values here.
+ if (getMemOPSizeKind(V) != PreciseValue)
+ continue;
+
+    // ValueCounts are sorted on the count. Break at the first unprofitable
+ // value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+ assert(SavedRemainCount >= VD.Count);
+ SavedRemainCount -= VD.Count;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+
+ DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
+
+ BasicBlock *BB = MI->getParent();
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+ auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock::iterator It(*MI);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ MergeBB->setName("MemOP.Merge");
+ BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+ DefaultBB->setName("MemOP.Default");
+
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MI->getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+ // Clear the value profile data.
+ MI->setMetadata(LLVMContext::MD_prof, nullptr);
+  // If all the records were promoted, we don't need the !prof metadata.
+  if (SavedRemainCount > 0 || Version != NumVals)
+    // Otherwise we need to annotate the remaining (un-promoted) records back.
+ annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+ SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+ DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ for (uint64_t SizeId : SizeIds) {
+ ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ Instruction *NewInst = MI->clone();
+ // Fix the argument.
+    cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewInst);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ DEBUG(dbgs() << *BB << "\n");
+ DEBUG(dbgs() << *DefaultBB << "\n");
+ DEBUG(dbgs() << *MergeBB << "\n");
+
+ emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
+ MI->getDebugLoc(),
+ Twine("optimize ") + getMIName(MI) + " with count " +
+ Twine(SumForOpt) + " out of " + Twine(TotalCount) +
+ " for " + Twine(Version) + " versions");
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ return PGOMemOPSizeOptImpl(F, BFI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
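
At the source level, the transformation described by the mem_op pseudocode above behaves roughly like the following, where the sizes 8 and 16 stand in for whatever the value profile reports as hot (illustrative names, not pass output):

    #include <cstddef>
    #include <cstring>

    void copyBefore(void *D, const void *S, std::size_t N) {
      std::memcpy(D, S, N);               // size known only at run time
    }

    void copyAfter(void *D, const void *S, std::size_t N) {
      switch (N) {                        // guarded specializations
      case 8:  std::memcpy(D, S, 8);  break;  // constant size: inlined later
      case 16: std::memcpy(D, S, 16); break;  // constant size: inlined later
      default: std::memcpy(D, S, N);  break;  // cold fallback keeps the call
      }
    }
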
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 8aa40d1759de6..e3c36c98ab0db 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -61,7 +61,7 @@ static const char *const SanCov8bitCountersInitName =
"__sanitizer_cov_8bit_counters_init";
static const char *const SanCovGuardsSectionName = "sancov_guards";
-static const char *const SanCovCountersSectionName = "sancov_counters";
+static const char *const SanCovCountersSectionName = "sancov_cntrs";
static cl::opt<int> ClCoverageLevel(
"sanitizer-coverage-level",
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 7b625b9b136ec..2a4c9526dfcd9 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -551,7 +551,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
BBChanged = true;
}
}
- };
+ }
FnChanged |= BBChanged;
}
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index c4f450949e6de..0f92760a874b5 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -506,7 +507,7 @@ private:
if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
// Optimize MemoryPhi nodes that may become redundant by having all the
// same input values once MA is removed.
- SmallVector<MemoryPhi *, 4> PhisToCheck;
+ SmallSetVector<MemoryPhi *, 4> PhisToCheck;
SmallVector<MemoryAccess *, 8> WorkQueue;
WorkQueue.push_back(MA);
// Process MemoryPhi nodes in FIFO order using a ever-growing vector since
@@ -517,7 +518,7 @@ private:
for (auto *U : WI->users())
if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
- PhisToCheck.push_back(MP);
+ PhisToCheck.insert(MP);
MSSAUpdater->removeMemoryAccess(WI);
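
The point of swapping SmallVector for SmallSetVector here is deduplication: a MemoryPhi reachable through several users must be queued only once. A minimal check of that property, assuming LLVM's ADT headers are on the include path:

    #include <llvm/ADT/SmallSetVector.h>
    #include <cassert>

    int main() {
      llvm::SmallSetVector<int, 4> PhisToCheck;
      PhisToCheck.insert(1);
      PhisToCheck.insert(2);
      bool Inserted = PhisToCheck.insert(1);   // duplicate is rejected
      assert(!Inserted && PhisToCheck.size() == 2);
    }
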
diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp
index 8634816e702fb..5fd2dfc118b4b 100644
--- a/lib/Transforms/Scalar/GVNSink.cpp
+++ b/lib/Transforms/Scalar/GVNSink.cpp
@@ -64,6 +64,17 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
namespace {
static bool isMemoryInst(const Instruction *I) {
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index b706152f30c81..8b435050ac769 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -983,21 +983,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *NumBytesS =
SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+ if (StoreSize != 1)
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+ SCEV::FlagNUW);
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+
unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
   // If the load or the store is atomic, then it must necessarily be unordered
// by previous checks.
- if (!SI->isAtomic() && !LI->isAtomic()) {
- if (StoreSize != 1)
- NumBytesS = SE->getMulExpr(
- NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW);
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
-
+ if (!SI->isAtomic() && !LI->isAtomic())
NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
- } else {
+ else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
if (Align < StoreSize)
@@ -1010,11 +1010,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return false;
- Value *NumElements =
- Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
- NewCall = Builder.CreateElementAtomicMemCpy(StoreBasePtr, LoadBasePtr,
- NumElements, StoreSize);
// Propagate alignment info onto the pointer args. Note that unordered
// atomic loads/stores are *required* by the spec to have an alignment
// but non-atomic loads/stores may not.
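
The reshuffle above has one functional consequence: the byte count is now computed once, before the branch, and the element-unordered-atomic path receives a byte length rather than an element count. The size expression itself reduces to ordinary arithmetic; below is a scalar model of the SCEV computation, under the assumption that BECount is the loop's backedge-taken count:

    #include <cstdint>

    // NumBytesS = (BECount + 1) * StoreSize, built via getAddExpr/getMulExpr.
    uint64_t numBytes(uint64_t BECount, uint64_t StoreSize) {
      uint64_t N = BECount + 1;   // trip count: backedge count plus one
      if (StoreSize != 1)
        N *= StoreSize;           // scale element trips into bytes
      return N;
    }
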
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 6926aae37963f..cbbd55512c9f5 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -2195,7 +2195,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// For a given expression, mark the phi of ops instructions that could have
// changed as a result.
void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, E);
+ touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
}
// Perform congruence finding on a given value numbering expression.
@@ -3561,7 +3561,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
// TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
// the block and subtract the pred count, but it's more complicated.
if (ReachablePredCount.lookup(BB) !=
- std::distance(pred_begin(BB), pred_end(BB))) {
+ unsigned(std::distance(pred_begin(BB), pred_end(BB)))) {
for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
auto &PHI = cast<PHINode>(*II);
ReplaceUnreachablePHIArgs(PHI, BB);
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index bae7911d222c9..a52739bb76f71 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass {
Changed |= runOnFunction(F);
if (Changed) {
- // stripNonValidAttributes asserts that shouldRewriteStatepointsIn
+ // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
// returns true for at least one function in the module. Since at least
// one function changed, we know that the precondition is satisfied.
- stripNonValidAttributes(M);
+ stripNonValidAttributesAndMetadata(M);
}
return Changed;
@@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
- /// The IR fed into RewriteStatepointsForGC may have had attributes implying
- /// dereferenceability that are no longer valid/correct after
- /// RewriteStatepointsForGC has run. This is because semantically, after
+ /// The IR fed into RewriteStatepointsForGC may have had attributes and
+ /// metadata implying dereferenceability that are no longer valid/correct after
+ /// RewriteStatepointsForGC has run. This is because semantically, after
/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributes (conservatively) restores correctness
- /// by erasing all attributes in the module that externally imply
- /// dereferenceability.
- /// Similar reasoning also applies to the noalias attributes. gc.statepoint
- /// can touch the entire heap including noalias objects.
- void stripNonValidAttributes(Module &M);
-
- // Helpers for stripNonValidAttributes
- void stripNonValidAttributesFromBody(Function &F);
+ /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// correctness by erasing all attributes in the module that externally imply
+ /// dereferenceability. Similar reasoning also applies to the noalias
+ /// attributes and metadata. gc.statepoint can touch the entire heap including
+ /// noalias objects.
+ void stripNonValidAttributesAndMetadata(Module &M);
+
+ // Helpers for stripNonValidAttributesAndMetadata
+ void stripNonValidAttributesAndMetadataFromBody(Function &F);
void stripNonValidAttributesFromPrototype(Function &F);
+ // Certain metadata on instructions are invalid after running RS4GC.
+ // Optimizations that run after RS4GC can incorrectly use this metadata to
+ // optimize functions. We drop such metadata on the instruction.
+ void stripInvalidMetadataFromInstruction(Instruction &I);
};
} // namespace
@@ -2306,13 +2310,44 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
}
-void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
+void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(
+    Instruction &I) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+  // These are the metadata kinds that remain valid on loads and stores
+  // after RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+  // touch the entire heap, including noalias objects. Note: the reasoning is
+  // the same as for stripping the dereferenceability and noalias attributes,
+  // which are the attribute counterparts of this metadata.
+  // We also drop the invariant.load metadata on the load because that metadata
+  // implies the load's address operand points to memory that is never changed
+  // once it becomes dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+}
+
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(
+    Function &F) {
if (F.empty())
return;
LLVMContext &Ctx = F.getContext();
MDBuilder Builder(Ctx);
+
for (Instruction &I : instructions(F)) {
if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
@@ -2333,6 +2368,8 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
+ stripInvalidMetadataFromInstruction(I);
+
if (CallSite CS = CallSite(&I)) {
for (int i = 0, e = CS.arg_size(); i != e; i++)
if (isa<PointerType>(CS.getArgument(i)->getType()))
@@ -2357,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
return false;
}
-void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
#ifndef NDEBUG
assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
@@ -2366,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
stripNonValidAttributesFromPrototype(F);
for (Function &F : M)
- stripNonValidAttributesFromBody(F);
+ stripNonValidAttributesAndMetadataFromBody(F);
}
bool RewriteStatepointsForGC::runOnFunction(Function &F) {
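
A condensed sketch of the whitelist-based stripping the new helper performs, assuming LLVM headers; the kind list here is abbreviated, see the hunk above for the full set:

    #include <llvm/IR/Instruction.h>
    #include <llvm/IR/Instructions.h>
    #include <llvm/IR/LLVMContext.h>

    void stripAfterRS4GC(llvm::Instruction &I) {
      if (!llvm::isa<llvm::LoadInst>(I) && !llvm::isa<llvm::StoreInst>(I))
        return;
      // Anything not listed (e.g. !invariant.load, !noalias) is dropped,
      // because a statepoint may conceptually free and reuse the whole heap.
      unsigned Valid[] = {llvm::LLVMContext::MD_tbaa,
                          llvm::LLVMContext::MD_range,
                          llvm::LLVMContext::MD_nonnull};
      I.dropUnknownNonDebugMetadata(Valid);
    }
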
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 815492ac354c0..c6929c33b3e9e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -515,10 +515,6 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
- void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); }
- void visitFuncletPadInst(FuncletPadInst &FPI) {
- markOverdefined(&FPI);
- }
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
markOverdefined(&CPI);
visitTerminatorInst(CPI);
@@ -539,13 +535,6 @@ private:
void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- markOverdefined(&I);
- }
- void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
- void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
- void visitVAArgInst (Instruction &I) { markOverdefined(&I); }
-
void visitInstruction(Instruction &I) {
// If a new instruction is added to LLVM that we don't handle.
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 24d28a6c28311..5d57ed9718fb6 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -142,8 +142,139 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
return false;
}
-void CodeExtractor::findAllocas(ValueSet &SinkCands) const {
+static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
+ BasicBlock *CommonExitBlock = nullptr;
+ auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
+ for (auto *Succ : successors(Block)) {
+ // Internal edges, ok.
+ if (Blocks.count(Succ))
+ continue;
+ if (!CommonExitBlock) {
+ CommonExitBlock = Succ;
+ continue;
+ }
+ if (CommonExitBlock == Succ)
+ continue;
+
+ return true;
+ }
+ return false;
+ };
+
+ if (any_of(Blocks, hasNonCommonExitSucc))
+ return nullptr;
+
+ return CommonExitBlock;
+}
+
+bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
+ Instruction *Addr) const {
+ AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
+ Function *Func = (*Blocks.begin())->getParent();
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ for (Instruction &II : BB) {
+
+ if (isa<DbgInfoIntrinsic>(II))
+ continue;
+
+ unsigned Opcode = II.getOpcode();
+ Value *MemAddr = nullptr;
+ switch (Opcode) {
+ case Instruction::Store:
+ case Instruction::Load: {
+ if (Opcode == Instruction::Store) {
+ StoreInst *SI = cast<StoreInst>(&II);
+ MemAddr = SI->getPointerOperand();
+ } else {
+ LoadInst *LI = cast<LoadInst>(&II);
+ MemAddr = LI->getPointerOperand();
+ }
+        // A global variable cannot alias locals.
+        if (isa<Constant>(MemAddr))
+          break;
+        Value *Base = MemAddr->stripInBoundsConstantOffsets();
+        if (!isa<AllocaInst>(Base) || Base == AI)
+ return false;
+ break;
+ }
+ default: {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ break;
+ return false;
+ }
+        // Treat all other cases conservatively: bail if the instruction
+        // may have side effects.
+ if (II.mayHaveSideEffects())
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+BasicBlock *
+CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
+ BasicBlock *SinglePredFromOutlineRegion = nullptr;
+ assert(!Blocks.count(CommonExitBlock) &&
+ "Expect a block outside the region!");
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (!Blocks.count(Pred))
+ continue;
+ if (!SinglePredFromOutlineRegion) {
+ SinglePredFromOutlineRegion = Pred;
+ } else if (SinglePredFromOutlineRegion != Pred) {
+ SinglePredFromOutlineRegion = nullptr;
+ break;
+ }
+ }
+
+ if (SinglePredFromOutlineRegion)
+ return SinglePredFromOutlineRegion;
+
+#ifndef NDEBUG
+  auto getFirstPHI = [](BasicBlock *BB) {
+    // A PHI, if present, is always the first instruction of a block.
+    return dyn_cast<PHINode>(&BB->front());
+  };
+  // If there are any phi nodes, the single pred either exists or has already
+  // been created before code extraction.
+ assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
+#endif
+
+ BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
+ CommonExitBlock->getFirstNonPHI()->getIterator());
+
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (Blocks.count(Pred))
+ continue;
+ Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
+ }
+ // Now add the old exit block to the outline region.
+ Blocks.insert(CommonExitBlock);
+ return CommonExitBlock;
+}
+
+void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands,
+ BasicBlock *&ExitBlock) const {
Function *Func = (*Blocks.begin())->getParent();
+ ExitBlock = getCommonExitBlock(Blocks);
+
for (BasicBlock &BB : *Func) {
if (Blocks.count(&BB))
continue;
@@ -152,49 +283,96 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands) const {
if (!AI)
continue;
- // Returns true if matching life time markers are found within
- // the outlined region.
- auto GetLifeTimeMarkers = [&](Instruction *Addr) {
+    // Find the pair of lifetime markers for address 'Addr' that are either
+    // defined inside the outline region or can legally be shrinkwrapped into
+    // the outline region. If there are no other untracked uses of the
+    // address, return the pair of markers if found; otherwise return a pair
+    // of nullptrs.
+ auto GetLifeTimeMarkers =
+ [&](Instruction *Addr, bool &SinkLifeStart,
+ bool &HoistLifeEnd) -> std::pair<Instruction *, Instruction *> {
Instruction *LifeStart = nullptr, *LifeEnd = nullptr;
- for (User *U : Addr->users()) {
- if (!definedInRegion(Blocks, U))
- return false;
+ for (User *U : Addr->users()) {
IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
if (IntrInst) {
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start)
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
+ // Do not handle the case where AI has multiple start markers.
+ if (LifeStart)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
LifeStart = IntrInst;
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ }
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
+ if (LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
LifeEnd = IntrInst;
+ }
+ continue;
}
+      // Found an untracked use of the address; bail.
+ if (!definedInRegion(Blocks, U))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
}
- return LifeStart && LifeEnd;
+
+ if (!LifeStart || !LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ SinkLifeStart = !definedInRegion(Blocks, LifeStart);
+ HoistLifeEnd = !definedInRegion(Blocks, LifeEnd);
+    // Do the legality check.
+ if ((SinkLifeStart || HoistLifeEnd) &&
+ !isLegalToShrinkwrapLifetimeMarkers(Addr))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+    // Check to see if we have a place to do hoisting; if not, bail.
+ if (HoistLifeEnd && !ExitBlock)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ return std::make_pair(LifeStart, LifeEnd);
};
- if (GetLifeTimeMarkers(AI)) {
+ bool SinkLifeStart = false, HoistLifeEnd = false;
+ auto Markers = GetLifeTimeMarkers(AI, SinkLifeStart, HoistLifeEnd);
+
+ if (Markers.first) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
continue;
}
- // Follow the bitcast:
+ // Follow the bitcast.
Instruction *MarkerAddr = nullptr;
for (User *U : AI->users()) {
- if (U->stripPointerCasts() == AI) {
+
+ if (U->stripInBoundsConstantOffsets() == AI) {
+ SinkLifeStart = false;
+ HoistLifeEnd = false;
Instruction *Bitcast = cast<Instruction>(U);
- if (GetLifeTimeMarkers(Bitcast)) {
+ Markers = GetLifeTimeMarkers(Bitcast, SinkLifeStart, HoistLifeEnd);
+ if (Markers.first) {
MarkerAddr = Bitcast;
continue;
}
}
+
+      // Found an unknown use of AI.
if (!definedInRegion(Blocks, U)) {
MarkerAddr = nullptr;
break;
}
}
+
if (MarkerAddr) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
if (!definedInRegion(Blocks, MarkerAddr))
SinkCands.insert(MarkerAddr);
SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
}
}
}
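
Source-level intuition for the sinking and hoisting above, hedged as a loose analogy: when the region to extract contains every real use of a stack slot but its lifetime markers sit outside the region, the markers can be shrinkwrapped to the region's entry and common exit, which in turn lets the alloca move into the outlined function. Roughly:

    void extracted(char *Buf) { Buf[0] = 1; }   // the outlined region

    void beforeExtraction(bool C) {
      char Buf[64];        // lifetime.start fires well before the region
      if (C)
        Buf[0] = 1;        // region: the only real use of Buf
    }                      // lifetime.end fires well after the region

    void afterExtraction(bool C) {
      if (C) {
        char Buf[64];      // markers shrinkwrapped around the region,
        extracted(Buf);    // so the slot can live with the extracted code
      }
    }
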
@@ -780,7 +958,8 @@ Function *CodeExtractor::extractCodeRegion() {
if (!isEligible())
return nullptr;
- ValueSet inputs, outputs, SinkingCands;
+ ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ BasicBlock *CommonExit = nullptr;
// Assumption: this is a single-entry code region, and the header is the first
// block in the region.
@@ -819,7 +998,8 @@ Function *CodeExtractor::extractCodeRegion() {
"newFuncRoot");
newFuncRoot->getInstList().push_back(BranchInst::Create(header));
- findAllocas(SinkingCands);
+ findAllocas(SinkingCands, HoistingCands, CommonExit);
+ assert(HoistingCands.empty() || CommonExit);
// Find inputs to, outputs from the code region.
findInputsOutputs(inputs, outputs, SinkingCands);
@@ -829,6 +1009,13 @@ Function *CodeExtractor::extractCodeRegion() {
cast<Instruction>(II)->moveBefore(*newFuncRoot,
newFuncRoot->getFirstInsertionPt());
+ if (!HoistingCands.empty()) {
+ auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
+ Instruction *TI = HoistToBlock->getTerminator();
+ for (auto *II : HoistingCands)
+ cast<Instruction>(II)->moveBefore(TI);
+ }
+
// Calculate the exit blocks for the extracted region and the total exit
// weights for each of those blocks.
DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
index 9e71cba4f1b7a..1260e35e934de 100644
--- a/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -460,6 +460,9 @@ void PredicateInfo::buildPredicateInfo() {
if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
if (!BI->isConditional())
continue;
+      // Can't insert conditional information if both successors go to the
+      // same place.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
processBranch(BI, BranchBB, OpsToRename);
} else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
processSwitch(SI, BranchBB, OpsToRename);
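
The skipped shape is a conditional branch whose two edges land in the same block, e.g. br i1 %c, label %bb, label %bb; neither edge tells us anything about %c. A one-line helper mirroring the new guard, assuming LLVM headers:

    #include <llvm/IR/Instructions.h>

    bool branchCarriesPredicate(const llvm::BranchInst *BI) {
      return BI->isConditional() &&
             BI->getSuccessor(0) != BI->getSuccessor(1);
    }
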
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 02a5d3dbeadfb..faa14046b1e3c 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -352,7 +352,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
return false;
typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)(
- const SCEV *, const SCEV *, SCEV::NoWrapFlags);
+ const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned);
typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)(
const SCEV *, Type *);
@@ -406,10 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
const SCEV *A =
- (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy);
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u),
+ WideTy);
const SCEV *B =
(SE->*Operation)((SE->*Extension)(LHS, WideTy),
- (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap);
+ (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u);
if (A != B)
return false;
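
The typedef churn in this file exists because the SCEV binary operations gained a trailing depth parameter, so every member-function-pointer type and every indirect call through it must spell out the extra argument (hence the 0u at each call site). A reduced model with invented names:

    struct ScalarEvolutionModel {
      int getAddExpr(int L, int R, int /*Flags*/, unsigned /*Depth*/) {
        return L + R;
      }
    };

    typedef int (ScalarEvolutionModel::*OperationFn)(int, int, int, unsigned);

    int main() {
      ScalarEvolutionModel SE;
      OperationFn Op = &ScalarEvolutionModel::getAddExpr;
      return (SE.*Op)(2, 3, 0, 0u) == 5 ? 0 : 1;  // mirrors (SE->*Operation)(...)
    }
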
@@ -530,8 +531,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
return false;
const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *,
- SCEV::NoWrapFlags);
-
+ SCEV::NoWrapFlags, unsigned);
switch (BO->getOpcode()) {
default:
return false;
@@ -560,7 +560,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy);
const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy),
- SCEV::FlagAnyWrap);
+ SCEV::FlagAnyWrap, 0u);
if (ExtendAfterOp == OpAfterExtend) {
BO->setHasNoUnsignedWrap();
SE->forgetValue(BO);
@@ -572,7 +572,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy);
const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy),
- SCEV::FlagAnyWrap);
+ SCEV::FlagAnyWrap, 0u);
if (ExtendAfterOp == OpAfterExtend) {
BO->setHasNoSignedWrap();
SE->forgetValue(BO);