author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Transforms
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 257
-rw-r--r--  lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h | 121
-rw-r--r--  lib/Transforms/AggressiveInstCombine/CMakeLists.txt | 11
-rw-r--r--  lib/Transforms/AggressiveInstCombine/LLVMBuild.txt | 22
-rw-r--r--  lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp | 418
-rw-r--r--  lib/Transforms/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/Coroutines/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/Coroutines/CoroEarly.cpp | 48
-rw-r--r--  lib/Transforms/Coroutines/CoroElide.cpp | 51
-rw-r--r--  lib/Transforms/Coroutines/CoroFrame.cpp | 133
-rw-r--r--  lib/Transforms/Coroutines/CoroInternal.h | 1
-rw-r--r--  lib/Transforms/Coroutines/CoroSplit.cpp | 34
-rw-r--r--  lib/Transforms/Coroutines/Coroutines.cpp | 11
-rw-r--r--  lib/Transforms/IPO/AlwaysInliner.cpp | 5
-rw-r--r--  lib/Transforms/IPO/ArgumentPromotion.cpp | 54
-rw-r--r--  lib/Transforms/IPO/BarrierNoopPass.cpp | 2
-rw-r--r--  lib/Transforms/IPO/BlockExtractor.cpp | 176
-rw-r--r--  lib/Transforms/IPO/CMakeLists.txt | 3
-rw-r--r--  lib/Transforms/IPO/CalledValuePropagation.cpp | 30
-rw-r--r--  lib/Transforms/IPO/CrossDSOCFI.cpp | 3
-rw-r--r--  lib/Transforms/IPO/DeadArgumentElimination.cpp | 102
-rw-r--r--  lib/Transforms/IPO/ExtractGV.cpp | 2
-rw-r--r--  lib/Transforms/IPO/ForceFunctionAttrs.cpp | 7
-rw-r--r--  lib/Transforms/IPO/FunctionAttrs.cpp | 248
-rw-r--r--  lib/Transforms/IPO/FunctionImport.cpp | 456
-rw-r--r--  lib/Transforms/IPO/GlobalOpt.cpp | 422
-rw-r--r--  lib/Transforms/IPO/IPO.cpp | 3
-rw-r--r--  lib/Transforms/IPO/InlineSimple.cpp | 2
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp | 95
-rw-r--r--  lib/Transforms/IPO/Internalize.cpp | 6
-rw-r--r--  lib/Transforms/IPO/LLVMBuild.txt | 2
-rw-r--r--  lib/Transforms/IPO/LoopExtractor.cpp | 153
-rw-r--r--  lib/Transforms/IPO/LowerTypeTests.cpp | 433
-rw-r--r--  lib/Transforms/IPO/MergeFunctions.cpp | 252
-rw-r--r--  lib/Transforms/IPO/PartialInlining.cpp | 82
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp | 52
-rw-r--r--  lib/Transforms/IPO/PruneEH.cpp | 2
-rw-r--r--  lib/Transforms/IPO/SCCP.cpp | 58
-rw-r--r--  lib/Transforms/IPO/SampleProfile.cpp | 327
-rw-r--r--  lib/Transforms/IPO/StripSymbols.cpp | 2
-rw-r--r--  lib/Transforms/IPO/SyntheticCountsPropagation.cpp | 140
-rw-r--r--  lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 139
-rw-r--r--  lib/Transforms/IPO/WholeProgramDevirt.cpp | 274
-rw-r--r--  lib/Transforms/InstCombine/CMakeLists.txt | 4
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAddSub.cpp | 428
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 534
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 836
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCasts.cpp | 422
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp | 341
-rw-r--r--  lib/Transforms/InstCombine/InstCombineInternal.h | 165
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 48
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 1229
-rw-r--r--  lib/Transforms/InstCombine/InstCombinePHI.cpp | 23
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSelect.cpp | 607
-rw-r--r--  lib/Transforms/InstCombine/InstCombineShifts.cpp | 85
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 339
-rw-r--r--  lib/Transforms/InstCombine/InstCombineTables.td | 11
-rw-r--r--  lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 213
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp | 611
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp | 139
-rw-r--r--  lib/Transforms/Instrumentation/BoundsChecking.cpp | 44
-rw-r--r--  lib/Transforms/Instrumentation/CFGMST.h | 16
-rw-r--r--  lib/Transforms/Instrumentation/CGProfile.cpp | 100
-rw-r--r--  lib/Transforms/Instrumentation/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 137
-rw-r--r--  lib/Transforms/Instrumentation/EfficiencySanitizer.cpp | 4
-rw-r--r--  lib/Transforms/Instrumentation/GCOVProfiling.cpp | 303
-rw-r--r--  lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 511
-rw-r--r--  lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 30
-rw-r--r--  lib/Transforms/Instrumentation/InstrProfiling.cpp | 46
-rw-r--r--  lib/Transforms/Instrumentation/MemorySanitizer.cpp | 851
-rw-r--r--  lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 83
-rw-r--r--  lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 74
-rw-r--r--  lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 23
-rw-r--r--  lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 8
-rw-r--r--  lib/Transforms/LLVMBuild.txt | 2
-rw-r--r--  lib/Transforms/ObjCARC/BlotMapVector.h | 2
-rw-r--r--  lib/Transforms/ObjCARC/DependencyAnalysis.h | 2
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARC.h | 24
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 12
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCContract.cpp | 177
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCExpand.cpp | 15
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 202
-rw-r--r--  lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp | 15
-rw-r--r--  lib/Transforms/ObjCARC/ProvenanceAnalysis.h | 6
-rw-r--r--  lib/Transforms/ObjCARC/PtrState.cpp | 45
-rw-r--r--  lib/Transforms/ObjCARC/PtrState.h | 6
-rw-r--r--  lib/Transforms/Scalar/ADCE.cpp | 29
-rw-r--r--  lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 90
-rw-r--r--  lib/Transforms/Scalar/BDCE.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/Scalar/CallSiteSplitting.cpp | 358
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp | 117
-rw-r--r--  lib/Transforms/Scalar/ConstantProp.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 173
-rw-r--r--  lib/Transforms/Scalar/DCE.cpp | 5
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp | 280
-rw-r--r--  lib/Transforms/Scalar/DivRemPairs.cpp | 11
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 335
-rw-r--r--  lib/Transforms/Scalar/FlattenCFGPass.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/Float2Int.cpp | 13
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 133
-rw-r--r--  lib/Transforms/Scalar/GVNHoist.cpp | 35
-rw-r--r--  lib/Transforms/Scalar/GVNSink.cpp | 43
-rw-r--r--  lib/Transforms/Scalar/GuardWidening.cpp | 207
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 74
-rw-r--r--  lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 643
-rw-r--r--  lib/Transforms/Scalar/InferAddressSpaces.cpp | 36
-rw-r--r--  lib/Transforms/Scalar/InstSimplifyPass.cpp (renamed from lib/Transforms/Utils/SimplifyInstructions.cpp) | 96
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp | 546
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp | 237
-rw-r--r--  lib/Transforms/Scalar/LLVMBuild.txt | 2
-rw-r--r--  lib/Transforms/Scalar/LoopDataPrefetch.cpp | 14
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp | 42
-rw-r--r--  lib/Transforms/Scalar/LoopDistribute.cpp | 127
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 133
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp | 208
-rw-r--r--  lib/Transforms/Scalar/LoopInterchange.cpp | 574
-rw-r--r--  lib/Transforms/Scalar/LoopLoadElimination.cpp | 82
-rw-r--r--  lib/Transforms/Scalar/LoopPredication.cpp | 211
-rw-r--r--  lib/Transforms/Scalar/LoopRerollPass.cpp | 360
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp | 619
-rw-r--r--  lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 19
-rw-r--r--  lib/Transforms/Scalar/LoopSink.cpp | 24
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 421
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 447
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp | 149
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp | 97
-rw-r--r--  lib/Transforms/Scalar/LoopVersioningLICM.cpp | 148
-rw-r--r--  lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 101
-rw-r--r--  lib/Transforms/Scalar/MergeICmps.cpp | 295
-rw-r--r--  lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 84
-rw-r--r--  lib/Transforms/Scalar/NaryReassociate.cpp | 18
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp | 483
-rw-r--r--  lib/Transforms/Scalar/PlaceSafepoints.cpp | 12
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp | 109
-rw-r--r--  lib/Transforms/Scalar/Reg2Mem.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 90
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp | 556
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp | 509
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 30
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 47
-rw-r--r--  lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 1517
-rw-r--r--  lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp | 12
-rw-r--r--  lib/Transforms/Scalar/SpeculateAroundPHIs.cpp | 61
-rw-r--r--  lib/Transforms/Scalar/SpeculativeExecution.cpp | 10
-rw-r--r--  lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp | 173
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp | 8
-rw-r--r--  lib/Transforms/Utils/AddDiscriminators.cpp | 10
-rw-r--r--  lib/Transforms/Utils/BasicBlockUtils.cpp | 120
-rw-r--r--  lib/Transforms/Utils/BreakCriticalEdges.cpp | 13
-rw-r--r--  lib/Transforms/Utils/BuildLibCalls.cpp | 191
-rw-r--r--  lib/Transforms/Utils/BypassSlowDivision.cpp | 4
-rw-r--r--  lib/Transforms/Utils/CMakeLists.txt | 4
-rw-r--r--  lib/Transforms/Utils/CallPromotionUtils.cpp | 34
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp | 81
-rw-r--r--  lib/Transforms/Utils/CloneModule.cpp | 36
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp | 227
-rw-r--r--  lib/Transforms/Utils/CtorUtils.cpp | 2
-rw-r--r--  lib/Transforms/Utils/DemoteRegToStack.cpp | 2
-rw-r--r--  lib/Transforms/Utils/EntryExitInstrumenter.cpp | 23
-rw-r--r--  lib/Transforms/Utils/EscapeEnumerator.cpp | 6
-rw-r--r--  lib/Transforms/Utils/Evaluator.cpp | 258
-rw-r--r--  lib/Transforms/Utils/FlattenCFG.cpp | 12
-rw-r--r--  lib/Transforms/Utils/FunctionComparator.cpp | 7
-rw-r--r--  lib/Transforms/Utils/FunctionImportUtils.cpp | 13
-rw-r--r--  lib/Transforms/Utils/GlobalStatus.cpp | 33
-rw-r--r--  lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp | 7
-rw-r--r--  lib/Transforms/Utils/InlineFunction.cpp | 99
-rw-r--r--  lib/Transforms/Utils/InstructionNamer.cpp | 2
-rw-r--r--  lib/Transforms/Utils/IntegerDivision.cpp | 12
-rw-r--r--  lib/Transforms/Utils/LCSSA.cpp | 26
-rw-r--r--  lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 27
-rw-r--r--  lib/Transforms/Utils/Local.cpp | 866
-rw-r--r--  lib/Transforms/Utils/LoopRotationUtils.cpp | 645
-rw-r--r--  lib/Transforms/Utils/LoopSimplify.cpp | 57
-rw-r--r--  lib/Transforms/Utils/LoopUnroll.cpp | 215
-rw-r--r--  lib/Transforms/Utils/LoopUnrollAndJam.cpp | 785
-rw-r--r--  lib/Transforms/Utils/LoopUnrollPeel.cpp | 182
-rw-r--r--  lib/Transforms/Utils/LoopUnrollRuntime.cpp | 93
-rw-r--r--  lib/Transforms/Utils/LoopUtils.cpp | 391
-rw-r--r--  lib/Transforms/Utils/LoopVersioning.cpp | 11
-rw-r--r--  lib/Transforms/Utils/LowerInvoke.cpp | 8
-rw-r--r--  lib/Transforms/Utils/LowerMemIntrinsics.cpp | 14
-rw-r--r--  lib/Transforms/Utils/LowerSwitch.cpp | 82
-rw-r--r--  lib/Transforms/Utils/Mem2Reg.cpp | 2
-rw-r--r--  lib/Transforms/Utils/MetaRenamer.cpp | 2
-rw-r--r--  lib/Transforms/Utils/OrderedInstructions.cpp | 33
-rw-r--r--  lib/Transforms/Utils/PredicateInfo.cpp | 90
-rw-r--r--  lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 78
-rw-r--r--  lib/Transforms/Utils/SSAUpdater.cpp | 10
-rw-r--r--  lib/Transforms/Utils/SSAUpdaterBulk.cpp | 191
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp | 440
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp | 166
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp | 380
-rw-r--r--  lib/Transforms/Utils/SplitModule.cpp | 28
-rw-r--r--  lib/Transforms/Utils/StripGCRelocates.cpp | 4
-rw-r--r--  lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp | 2
-rw-r--r--  lib/Transforms/Utils/UnifyFunctionExitNodes.cpp | 2
-rw-r--r--  lib/Transforms/Utils/Utils.cpp | 13
-rw-r--r--  lib/Transforms/Utils/VNCoercion.cpp | 12
-rw-r--r--  lib/Transforms/Utils/ValueMapper.cpp | 19
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt | 4
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 298
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 1072
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 282
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 2806
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 1926
-rw-r--r--  lib/Transforms/Vectorize/VPRecipeBuilder.h | 131
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp | 37
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h | 201
-rw-r--r--  lib/Transforms/Vectorize/VPlanBuilder.h | 61
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 336
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGBuilder.h | 55
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp | 73
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.h | 36
-rw-r--r--  lib/Transforms/Vectorize/VPlanValue.h | 35
-rw-r--r--  lib/Transforms/Vectorize/VPlanVerifier.cpp | 133
-rw-r--r--  lib/Transforms/Vectorize/VPlanVerifier.h | 44
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp | 4
223 files changed, 24900 insertions, 13392 deletions
diff --git a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
new file mode 100644
index 000000000000..b622d018478a
--- /dev/null
+++ b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -0,0 +1,257 @@
+//===- AggressiveInstCombine.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the aggressive expression pattern combiner classes.
+// Currently, it handles expression patterns for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "AggressiveInstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+namespace {
+/// Contains expression pattern combiner logic.
+/// This class provides the logic to match expression patterns and combine
+/// them. It differs from the InstCombiner class in that each pattern combiner
+/// runs only once, as opposed to InstCombine's multi-iteration scheme, which
+/// allows a pattern combiner to have higher complexity than the O(1) required
+/// by the instruction combiner.
+class AggressiveInstCombinerLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
+ initializeAggressiveInstCombinerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Run all expression pattern optimizations on the given function \p F.
+ ///
+ /// \param F function to optimize.
+ /// \returns true if the IR is changed.
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
+/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
+/// of 'and' ops, then we also need to capture the fact that we saw an
+/// "and X, 1", so that's an extra return value for that case.
+struct MaskOps {
+ Value *Root;
+ APInt Mask;
+ bool MatchAndChain;
+ bool FoundAnd1;
+
+ MaskOps(unsigned BitWidth, bool MatchAnds) :
+ Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
+ MatchAndChain(MatchAnds), FoundAnd1(false) {}
+};
+
+/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
+/// chain of 'and' or 'or' instructions looking for shift ops of a common source
+/// value. Examples:
+/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
+/// returns { X, 0x129 }
+/// and (and (X >> 1), 1), (X >> 4)
+/// returns { X, 0x12 }
+static bool matchAndOrChain(Value *V, MaskOps &MOps) {
+ Value *Op0, *Op1;
+ if (MOps.MatchAndChain) {
+ // Recurse through a chain of 'and' operands. This requires an extra check
+ // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
+ // in the chain to know that all of the high bits are cleared.
+ if (match(V, m_And(m_Value(Op0), m_One()))) {
+ MOps.FoundAnd1 = true;
+ return matchAndOrChain(Op0, MOps);
+ }
+ if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ } else {
+ // Recurse through a chain of 'or' operands.
+ if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ }
+
+ // We need a shift-right or a bare value representing a compare of bit 0 of
+ // the original source operand.
+ Value *Candidate;
+ uint64_t BitIndex = 0;
+ if (!match(V, m_LShr(m_Value(Candidate), m_ConstantInt(BitIndex))))
+ Candidate = V;
+
+ // Initialize result source operand.
+ if (!MOps.Root)
+ MOps.Root = Candidate;
+
+  // If the shift constant is out-of-range, this code hasn't been simplified.
+ if (BitIndex >= MOps.Mask.getBitWidth())
+ return false;
+
+ // Fill in the mask bit derived from the shift constant.
+ MOps.Mask.setBit(BitIndex);
+ return MOps.Root == Candidate;
+}
+
+/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
+/// These will include a chain of 'or' or 'and'-shifted bits from a
+/// common source value:
+/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
+/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
+/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
+/// that differ only by a final 'not' of the result. We expect that final
+/// 'not' to be folded with the compare that we create here (invert predicate).
+static bool foldAnyOrAllBitsSet(Instruction &I) {
+  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
+  // "and X, 1" instruction must be the final op in the sequence.
+ bool MatchAllBitsSet;
+ if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
+ MatchAllBitsSet = true;
+ else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+ MatchAllBitsSet = false;
+ else
+ return false;
+
+ MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
+ if (MatchAllBitsSet) {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
+ return false;
+ } else {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
+ return false;
+ }
+
+ // The pattern was found. Create a masked compare that replaces all of the
+ // shift and logic ops.
+ IRBuilder<> Builder(&I);
+ Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
+ Value *And = Builder.CreateAnd(MOps.Root, Mask);
+ Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask) :
+ Builder.CreateIsNotNull(And);
+ Value *Zext = Builder.CreateZExt(Cmp, I.getType());
+ I.replaceAllUsesWith(Zext);
+ return true;
+}
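+
+// A minimal IR sketch (hypothetical input, not part of this commit) of the
+// 'any-bits-set' fold above, where the chain of shifts and 'or's collapses to
+// a single masked compare:
+//
+//   define i32 @anyset(i32 %x) {
+//     %s3 = lshr i32 %x, 3
+//     %o1 = or i32 %x, %s3
+//     %s5 = lshr i32 %x, 5
+//     %o2 = or i32 %o1, %s5
+//     %r  = and i32 %o2, 1
+//     ret i32 %r
+//   }
+//
+// becomes (mask 41 = 0b101001, i.e. bits 0, 3 and 5):
+//
+//   %and = and i32 %x, 41
+//   %cmp = icmp ne i32 %and, 0
+//   %r   = zext i1 %cmp to i32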
+
+/// This is the entry point for folds that could be implemented in regular
+/// InstCombine, but they are separated because they are not expected to
+/// occur frequently and/or have more than a constant-length pattern match.
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+    // Do not delete instructions in this loop; that would invalidate the
+    // iterator.
+ // Walk the block backwards for efficiency. We're matching a chain of
+ // use->defs, so we're more likely to succeed by starting from the bottom.
+ // Also, we want to avoid matching partial patterns.
+ // TODO: It would be more efficient if we removed dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : make_range(BB.rbegin(), BB.rend()))
+ MadeChange |= foldAnyOrAllBitsSet(I);
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
+ bool MadeChange = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TruncInstCombine TIC(TLI, DL, DT);
+ MadeChange |= TIC.run(F);
+ MadeChange |= foldUnusualPatterns(F, DT);
+ return MadeChange;
+}
+
+void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return runImpl(F, TLI, DT);
+}
+
+PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT)) {
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+ }
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char AggressiveInstCombinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
+ "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
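+
+// With the registration above, the pass can be exercised in isolation through
+// opt (a usage note, assuming a standard opt build of this tree):
+//
+//   opt -aggressive-instcombine -S input.ll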
+
+// Initialization Routines
+void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
+ initializeAggressiveInstCombinerLegacyPassPass(Registry);
+}
+
+void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
+ initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createAggressiveInstCombinerPass() {
+ return new AggressiveInstCombinerLegacyPass();
+}
+
+void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveInstCombinerPass());
+}
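+
+// A minimal C API usage sketch (hypothetical driver code, not part of this
+// commit) wiring the binding above into a legacy function pass manager:
+//
+//   LLVMPassManagerRef PM = LLVMCreateFunctionPassManagerForModule(Mod);
+//   LLVMAddAggressiveInstCombinerPass(PM);
+//   LLVMInitializeFunctionPassManager(PM);
+//   LLVMRunFunctionPassManager(PM, Fn);
+//   LLVMFinalizeFunctionPassManager(PM);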
diff --git a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
new file mode 100644
index 000000000000..199374cdabf3
--- /dev/null
+++ b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -0,0 +1,121 @@
+//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the instruction pattern combiner classes.
+// Currently, it handles pattern expressions for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TruncInstCombine - looks for expression dags dominated by trunc instructions
+// and, for each eligible dag, creates a reduced bit-width expression, replaces
+// the old expression with this new one, and removes the old one.
+// An eligible expression dag is one that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
+// 4. All instructions in the dag must not have users outside the dag.
+//    The only exception is for {ZExt, SExt}Inst with operand type equal to
+//    the new reduced type chosen in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+ class DataLayout;
+ class DominatorTree;
+ class TargetLibraryInfo;
+
+class TruncInstCombine {
+ TargetLibraryInfo &TLI;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+
+ /// List of all TruncInst instructions to be processed.
+ SmallVector<TruncInst *, 4> Worklist;
+
+ /// Current processed TruncInst instruction.
+ TruncInst *CurrentTruncInst;
+
+ /// Information per each instruction in the expression dag.
+ struct Info {
+ /// Number of LSBs that are needed to generate a valid expression.
+ unsigned ValidBitWidth = 0;
+ /// Minimum number of LSBs needed to generate the ValidBitWidth.
+ unsigned MinBitWidth = 0;
+ /// The reduced value generated to replace the old instruction.
+ Value *NewValue = nullptr;
+ };
+  /// An ordered map representing the expression dag post-dominated by the
+  /// currently processed TruncInst. It maps each instruction in the dag to its
+  /// Info structure. The map is ordered such that each instruction appears
+  /// before all other instructions in the dag that use it.
+ MapVector<Instruction *, Info> InstInfoMap;
+
+public:
+ TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
+ const DominatorTree &DT)
+ : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
+
+ /// Perform TruncInst pattern optimization on given function.
+ bool run(Function &F);
+
+private:
+  /// Build the expression dag dominated by \p CurrentTruncInst and append it
+  /// to the InstInfoMap container.
+  ///
+  /// \return true only if we succeeded in generating an eligible
+  /// sub-expression dag.
+ bool buildTruncExpressionDag();
+
+ /// Calculate the minimal allowed bit-width of the chain ending with the
+ /// currently visited truncate's operand.
+ ///
+  /// \return the minimum number of bits to which the chain ending with the
+  /// truncate's operand can be shrunk.
+ unsigned getMinBitWidth();
+
+  /// Build an expression dag dominated by the currently processed TruncInst
+  /// and check whether it is eligible to be reduced to a smaller type.
+ ///
+ /// \return the scalar version of the new type to be used for the reduced
+ /// expression dag, or nullptr if the expression dag is not eligible
+ /// to be reduced.
+ Type *getBestTruncatedType();
+
+  /// Given a \p V value and a \p SclTy scalar type, return the generated
+  /// reduced
+ /// value of \p V based on the type \p SclTy.
+ ///
+ /// \param V value to be reduced.
+ /// \param SclTy scalar version of new type to reduce to.
+ /// \return the new reduced value.
+ Value *getReducedOperand(Value *V, Type *SclTy);
+
+  /// Create a new expression dag using the reduced \p SclTy type and replace
+ /// the old expression dag with it. Also erase all instructions in the old
+ /// dag, except those that are still needed outside the dag.
+ ///
+ /// \param SclTy scalar version of new type to reduce expression dag into.
+ void ReduceExpressionDag(Type *SclTy);
+};
+} // end namespace llvm.
diff --git a/lib/Transforms/AggressiveInstCombine/CMakeLists.txt b/lib/Transforms/AggressiveInstCombine/CMakeLists.txt
new file mode 100644
index 000000000000..386314801e38
--- /dev/null
+++ b/lib/Transforms/AggressiveInstCombine/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_library(LLVMAggressiveInstCombine
+ AggressiveInstCombine.cpp
+ TruncInstCombine.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/AggressiveInstCombine
+
+ DEPENDS
+ intrinsics_gen
+ )
diff --git a/lib/Transforms/AggressiveInstCombine/LLVMBuild.txt b/lib/Transforms/AggressiveInstCombine/LLVMBuild.txt
new file mode 100644
index 000000000000..c05844f33de9
--- /dev/null
+++ b/lib/Transforms/AggressiveInstCombine/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/AggressiveInstCombine/LLVMBuild.txt -----*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AggressiveInstCombine
+parent = Transforms
+required_libraries = Analysis Core Support TransformUtils
diff --git a/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
new file mode 100644
index 000000000000..8289b2d68f8a
--- /dev/null
+++ b/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -0,0 +1,418 @@
+//===- TruncInstCombine.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst
+// and, for each eligible dag, creates a reduced bit-width expression, replaces
+// the old expression with the new one, and removes the old expression.
+// An eligible expression dag is one that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width.
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to
+// the new reduced type evaluated in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//
+//===----------------------------------------------------------------------===//
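+
+// For illustration (hypothetical IR, not from this commit), a dag such as:
+//
+//   %z = zext i8 %a to i32
+//   %s = add i32 %z, 7
+//   %t = trunc i32 %s to i16
+//
+// can be evaluated directly in the narrower type, assuming i16 is legal for
+// the target:
+//
+//   %z = zext i8 %a to i16
+//   %t = add i16 %z, 7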
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+/// Given an instruction and a container, fill the container with those
+/// operands of the instruction that are relevant to the Trunc expression dag
+/// optimization.
+static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+    // These CastInsts are considered leaves of the evaluated expression, so
+    // their operands are not relevant.
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ Ops.push_back(I->getOperand(0));
+ Ops.push_back(I->getOperand(1));
+ break;
+ default:
+ llvm_unreachable("Unreachable!");
+ }
+}
+
+bool TruncInstCombine::buildTruncExpressionDag() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
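+  // Worklist and Stack implement an iterative post-order DFS: an instruction
+  // is pushed onto Stack before its operands are visited, and is added to
+  // InstInfoMap only when it is reached again with all operands handled.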
+ // Clear old expression dag.
+ InstInfoMap.clear();
+
+ Worklist.push_back(CurrentTruncInst->getOperand(0));
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ auto *I = dyn_cast<Instruction>(Curr);
+ if (!I)
+ return false;
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and add it to the instruction info map.
+ Worklist.pop_back();
+ Stack.pop_back();
+ // Insert I to the Info map.
+ InstInfoMap.insert(std::make_pair(I, Info()));
+ continue;
+ }
+
+ if (InstInfoMap.count(I)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+    // Add the instruction to the stack before we start handling its operands.
+ Stack.push_back(I);
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(trunc(x)) -> trunc(x)
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
+ // dest
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+ for (Value *Operand : Operands)
+ Worklist.push_back(Operand);
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here:
+ // 1. select, shufflevector, extractelement, insertelement
+ // 2. udiv, urem
+ // 3. shl, lshr, ashr
+ // 4. phi node(and loop handling)
+ // ...
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned TruncInstCombine::getMinBitWidth() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+
+ Value *Src = CurrentTruncInst->getOperand(0);
+ Type *DstTy = CurrentTruncInst->getType();
+ unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ if (isa<Constant>(Src))
+ return TruncBitWidth;
+
+ Worklist.push_back(Src);
+ InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Otherwise, it must be an instruction.
+ auto *I = cast<Instruction>(Curr);
+
+ auto &Info = InstInfoMap[I];
+
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+
+ if (!Stack.empty() && Stack.back() == I) {
+      // Already handled all instruction operands, can remove it from both the
+      // Worklist and the Stack, and update MinBitWidth.
+ Worklist.pop_back();
+ Stack.pop_back();
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand))
+ Info.MinBitWidth =
+ std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
+ continue;
+ }
+
+    // Add the instruction to the stack before we start handling its operands.
+ Stack.push_back(I);
+ unsigned ValidBitWidth = Info.ValidBitWidth;
+
+ // Update minimum bit-width before handling its operands. This is required
+ // when the instruction is part of a loop.
+ Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
+
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand)) {
+ // If we already calculated the minimum bit-width for this valid
+ // bit-width, or for a smaller valid bit-width, then just keep the
+ // answer we already calculated.
+ unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
+ if (IOpBitwidth >= ValidBitWidth)
+ continue;
+ InstInfoMap[IOp].ValidBitWidth = std::max(ValidBitWidth, IOpBitwidth);
+ Worklist.push_back(IOp);
+ }
+ }
+ unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
+ assert(MinBitWidth >= TruncBitWidth);
+
+ if (MinBitWidth > TruncBitWidth) {
+ // In this case reducing expression with vector type might generate a new
+ // vector type, which is not preferable as it might result in generating
+ // sub-optimal code.
+ if (DstTy->isVectorTy())
+ return OrigBitWidth;
+ // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
+ Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
+    // Update the minimum bit-width with the new destination type's bit-width
+    // if we succeeded in finding one; otherwise, use the original bit-width.
+ MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
+ } else { // MinBitWidth == TruncBitWidth
+ // In this case the expression can be evaluated with the trunc instruction
+ // destination type, and trunc instruction can be omitted. However, we
+ // should not perform the evaluation if the original type is a legal scalar
+ // type and the target type is illegal.
+ bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
+ bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
+ if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
+ return OrigBitWidth;
+ }
+ return MinBitWidth;
+}
+
+Type *TruncInstCombine::getBestTruncatedType() {
+ if (!buildTruncExpressionDag())
+ return nullptr;
+
+ // We don't want to duplicate instructions, which isn't profitable. Thus, we
+ // can't shrink something that has multiple users, unless all users are
+ // post-dominated by the trunc instruction, i.e., were visited during the
+ // expression evaluation.
+ unsigned DesiredBitWidth = 0;
+ for (auto Itr : InstInfoMap) {
+ Instruction *I = Itr.first;
+ if (I->hasOneUse())
+ continue;
+ bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
+ for (auto *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
+ if (!IsExtInst)
+ return nullptr;
+ // If this is an extension from the dest type, we can eliminate it,
+          // even if it has multiple users. Thus, update the DesiredBitWidth
+          // and validate that all extension instructions agree on the same
+          // DesiredBitWidth.
+ unsigned ExtInstBitWidth =
+ I->getOperand(0)->getType()->getScalarSizeInBits();
+ if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
+ return nullptr;
+ DesiredBitWidth = ExtInstBitWidth;
+ }
+ }
+
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+  // Calculate the minimum bit-width allowed for shrinking the currently
+  // visited truncate's operand.
+ unsigned MinBitWidth = getMinBitWidth();
+
+  // Check that we can shrink to a smaller bit-width than the original one and
+  // that it matches the DesiredBitWidth, if such a width exists.
+ if (MinBitWidth >= OrigBitWidth ||
+ (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
+ return nullptr;
+
+ return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
+}
+
+/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
+/// for \p V: if \p V has a vector type, return the vector version of \p Ty;
+/// otherwise, return \p Ty itself.
+static Type *getReducedType(Value *V, Type *Ty) {
+ assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
+ if (auto *VTy = dyn_cast<VectorType>(V->getType()))
+ return VectorType::get(Ty, VTy->getNumElements());
+ return Ty;
+}
+
+Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
+ Type *Ty = getReducedType(V, SclTy);
+ if (auto *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, false);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
+ C = FoldedC;
+ return C;
+ }
+
+ auto *I = cast<Instruction>(V);
+ Info Entry = InstInfoMap.lookup(I);
+ assert(Entry.NewValue);
+ return Entry.NewValue;
+}
+
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+ for (auto &Itr : InstInfoMap) { // Forward
+ Instruction *I = Itr.first;
+ TruncInstCombine::Info &NodeInfo = Itr.second;
+
+ assert(!NodeInfo.NewValue && "Instruction has been evaluated");
+
+ IRBuilder<> Builder(I);
+ Value *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt: {
+ Type *Ty = getReducedType(I, SclTy);
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty) {
+ assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
+ NodeInfo.NewValue = I->getOperand(0);
+ continue;
+ }
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = Builder.CreateIntCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+
+ // Update Worklist entries with new value if needed.
+ // There are three possible changes to the Worklist:
+ // 1. Update Old-TruncInst -> New-TruncInst.
+ // 2. Remove Old-TruncInst (if New node is not TruncInst).
+ // 3. Add New-TruncInst (if Old node was not TruncInst).
+ auto Entry = find(Worklist, I);
+ if (Entry != Worklist.end()) {
+ if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ *Entry = NewCI;
+ else
+ Worklist.erase(Entry);
+ } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ Worklist.push_back(NewCI);
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
+ Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled instruction");
+ }
+
+ NodeInfo.NewValue = Res;
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(I);
+ }
+
+ Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
+ Type *DstTy = CurrentTruncInst->getType();
+ if (Res->getType() != DstTy) {
+ IRBuilder<> Builder(CurrentTruncInst);
+ Res = Builder.CreateIntCast(Res, DstTy, false);
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(CurrentTruncInst);
+ }
+ CurrentTruncInst->replaceAllUsesWith(Res);
+
+  // Erase the old expression dag, which was replaced by the reduced expression
+  // dag. We iterate backward, which means we visit an instruction before we
+  // visit any of its operands; this way, when we get to an operand, we have
+  // already removed the instructions (from the expression dag) that use it.
+ CurrentTruncInst->eraseFromParent();
+ for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+    // We still need to check that the instruction has no users before we erase
+    // it, because a {SExt, ZExt}Inst instruction might have other users that
+    // were not reduced; in such a case, we need to keep that instruction.
+ if (I->first->use_empty())
+ I->first->eraseFromParent();
+ }
+}
+
+bool TruncInstCombine::run(Function &F) {
+ bool MadeIRChange = false;
+
+  // Collect all TruncInsts in the function into the Worklist for evaluation.
+ for (auto &BB : F) {
+    // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ for (auto &I : BB)
+ if (auto *CI = dyn_cast<TruncInst>(&I))
+ Worklist.push_back(CI);
+ }
+
+ // Process all TruncInst in the Worklist, for each instruction:
+ // 1. Check if it dominates an eligible expression dag to be reduced.
+ // 2. Create a reduced expression dag and replace the old one with it.
+ while (!Worklist.empty()) {
+ CurrentTruncInst = Worklist.pop_back_val();
+
+ if (Type *NewDstSclTy = getBestTruncatedType()) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
+ "dominated by: "
+ << CurrentTruncInst << '\n');
+ ReduceExpressionDag(NewDstSclTy);
+ MadeIRChange = true;
+ }
+ }
+
+ return MadeIRChange;
+}
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
index 67bdeb27212d..74db9e53304d 100644
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
add_subdirectory(Utils)
add_subdirectory(Instrumentation)
+add_subdirectory(AggressiveInstCombine)
add_subdirectory(InstCombine)
add_subdirectory(Scalar)
add_subdirectory(IPO)
diff --git a/lib/Transforms/Coroutines/CMakeLists.txt b/lib/Transforms/Coroutines/CMakeLists.txt
index 1c635bd9db08..80a052a2d45d 100644
--- a/lib/Transforms/Coroutines/CMakeLists.txt
+++ b/lib/Transforms/Coroutines/CMakeLists.txt
@@ -4,7 +4,7 @@ add_llvm_library(LLVMCoroutines
CoroEarly.cpp
CoroElide.cpp
CoroFrame.cpp
- CoroSplit.cpp
+ CoroSplit.cpp
DEPENDS
intrinsics_gen
diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp
index ba05896af150..ac47a06281a5 100644
--- a/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -27,10 +27,12 @@ namespace {
class Lowerer : public coro::LowererBase {
IRBuilder<> Builder;
PointerType *const AnyResumeFnPtrTy;
+ Constant *NoopCoro = nullptr;
void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind);
void lowerCoroPromise(CoroPromiseInst *Intrin);
void lowerCoroDone(IntrinsicInst *II);
+ void lowerCoroNoop(IntrinsicInst *II);
public:
Lowerer(Module &M)
@@ -103,6 +105,41 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) {
II->eraseFromParent();
}
+void Lowerer::lowerCoroNoop(IntrinsicInst *II) {
+ if (!NoopCoro) {
+ LLVMContext &C = Builder.getContext();
+ Module &M = *II->getModule();
+
+ // Create a noop.frame struct type.
+ StructType *FrameTy = StructType::create(C, "NoopCoro.Frame");
+ auto *FramePtrTy = FrameTy->getPointerTo();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
+ /*IsVarArgs=*/false);
+ auto *FnPtrTy = FnTy->getPointerTo();
+ FrameTy->setBody({FnPtrTy, FnPtrTy});
+
+ // Create a Noop function that does nothing.
+ Function *NoopFn =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::PrivateLinkage,
+ "NoopCoro.ResumeDestroy", &M);
+ NoopFn->setCallingConv(CallingConv::Fast);
+ auto *Entry = BasicBlock::Create(C, "entry", NoopFn);
+ ReturnInst::Create(C, Entry);
+
+ // Create a constant struct for the frame.
+ Constant* Values[] = {NoopFn, NoopFn};
+ Constant* NoopCoroConst = ConstantStruct::get(FrameTy, Values);
+ NoopCoro = new GlobalVariable(M, NoopCoroConst->getType(), /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, NoopCoroConst,
+ "NoopCoro.Frame.Const");
+ }
+
+ Builder.SetInsertPoint(II);
+ auto *NoopCoroVoidPtr = Builder.CreateBitCast(NoopCoro, Int8Ptr);
+ II->replaceAllUsesWith(NoopCoroVoidPtr);
+ II->eraseFromParent();
+}
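+
+// Net effect (a sketch of the lowering above): a private constant global,
+// NoopCoro.Frame.Const, holds two pointers to the empty
+// NoopCoro.ResumeDestroy function, and each llvm.coro.noop call is replaced
+// with that global bitcast to an i8 pointer.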
+
// Prior to CoroSplit, calls to coro.begin need to be marked as NoDuplicate,
// as CoroSplit assumes there is exactly one coro.begin. After CoroSplit,
// NoDuplicate attribute will be removed from coro.begin otherwise, it will
@@ -138,6 +175,9 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
if (cast<CoroEndInst>(&I)->isFallthrough())
CS.setCannotDuplicate();
break;
+ case Intrinsic::coro_noop:
+ lowerCoroNoop(cast<IntrinsicInst>(&I));
+ break;
case Intrinsic::coro_id:
// Mark a function that comes out of the frontend that has a coro.id
// with a coroutine attribute.
@@ -192,10 +232,10 @@ struct CoroEarly : public FunctionPass {
// This pass has work to do only if we find intrinsics we are going to lower
// in the module.
bool doInitialization(Module &M) override {
- if (coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.destroy",
- "llvm.coro.done", "llvm.coro.end",
- "llvm.coro.free", "llvm.coro.promise",
- "llvm.coro.resume", "llvm.coro.suspend"}))
+ if (coro::declaresIntrinsics(
+ M, {"llvm.coro.id", "llvm.coro.destroy", "llvm.coro.done",
+ "llvm.coro.end", "llvm.coro.noop", "llvm.coro.free",
+ "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.suspend"}))
L = llvm::make_unique<Lowerer>(M);
return false;
}
diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp
index 42fd6d746145..dfe05c4b2a5e 100644
--- a/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/lib/Transforms/Coroutines/CoroElide.cpp
@@ -14,6 +14,7 @@
#include "CoroInternal.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
@@ -35,8 +36,8 @@ struct Lowerer : coro::LowererBase {
Lowerer(Module &M) : LowererBase(M) {}
void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA);
- bool shouldElide() const;
- bool processCoroId(CoroIdInst *, AAResults &AA);
+ bool shouldElide(Function *F, DominatorTree &DT) const;
+ bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT);
};
} // end anonymous namespace
@@ -77,7 +78,6 @@ static bool operandReferences(CallInst *CI, AllocaInst *Frame, AAResults &AA) {
// call implies that the function does not reference anything on the stack.
static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
Function &F = *Frame->getFunction();
- MemoryLocation Mem(Frame);
for (Instruction &I : instructions(F))
if (auto *Call = dyn_cast<CallInst>(&I))
if (Call->isTailCall() && operandReferences(Call, Frame, AA)) {
@@ -142,33 +142,54 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
removeTailCallAttribute(Frame, AA);
}
-bool Lowerer::shouldElide() const {
+bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
// If no CoroAllocs, we cannot suppress allocation, so elision is not
// possible.
if (CoroAllocs.empty())
return false;
// Check that for every coro.begin there is a coro.destroy directly
- // referencing the SSA value of that coro.begin. If the value escaped, then
- // coro.destroy would have been referencing a memory location storing that
- // value and not the virtual register.
-
- SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+ // referencing the SSA value of that coro.begin along a non-exceptional path.
+ // If the value escaped, then coro.destroy would have been referencing a
+ // memory location storing that value and not the virtual register.
+
+ // First gather all of the non-exceptional terminators for the function.
+ SmallPtrSet<Instruction *, 8> Terminators;
+ for (BasicBlock &B : *F) {
+ auto *TI = B.getTerminator();
+ if (TI->getNumSuccessors() == 0 && !TI->isExceptional() &&
+ !isa<UnreachableInst>(TI))
+ Terminators.insert(TI);
+ }
+ // Filter out the coro.destroy that lie along exceptional paths.
+ SmallPtrSet<CoroSubFnInst *, 4> DAs;
for (CoroSubFnInst *DA : DestroyAddr) {
+ for (Instruction *TI : Terminators) {
+ if (DT.dominates(DA, TI)) {
+ DAs.insert(DA);
+ break;
+ }
+ }
+ }
+
+ // Find all the coro.begin referenced by coro.destroy along happy paths.
+ SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+ for (CoroSubFnInst *DA : DAs) {
if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame()))
ReferencedCoroBegins.insert(CB);
else
return false;
}
- // If size of the set is the same as total number of CoroBegins, means we
- // found a coro.free or coro.destroy mentioning a coro.begin and we can
+  // If the size of the set is the same as the total number of coro.begins, we
+  // found a coro.free or coro.destroy referencing each coro.begin, so we can
// perform heap elision.
return ReferencedCoroBegins.size() == CoroBegins.size();
}
-bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
+bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
+ DominatorTree &DT) {
CoroBegins.clear();
CoroAllocs.clear();
CoroFrees.clear();
@@ -214,7 +235,7 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
replaceWithConstant(ResumeAddrConstant, ResumeAddr);
- bool ShouldElide = shouldElide();
+ bool ShouldElide = shouldElide(CoroId->getFunction(), DT);
auto *DestroyAddrConstant = ConstantExpr::getExtractValue(
Resumers,
@@ -294,14 +315,16 @@ struct CoroElide : FunctionPass {
return Changed;
AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
for (auto *CII : L->CoroIds)
- Changed |= L->processCoroId(CII, AA);
+ Changed |= L->processCoroId(CII, AA, DT);
return Changed;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
}
StringRef getPassName() const override { return "Coroutine Elision"; }
};
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 6334256bf03a..cf63b678b618 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -19,6 +19,8 @@
#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -27,7 +29,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -48,7 +49,7 @@ public:
BlockToIndexMapping(Function &F) {
for (BasicBlock &BB : F)
V.push_back(&BB);
- std::sort(V.begin(), V.end());
+ llvm::sort(V.begin(), V.end());
}
size_t blockToIndex(BasicBlock *BB) const {
@@ -105,8 +106,8 @@ struct SuspendCrossingInfo {
assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def");
bool const Result = Block[UseIndex].Kills[DefIndex];
- DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
- << " answer is " << Result << "\n");
+ LLVM_DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
+ << " answer is " << Result << "\n");
return Result;
}
@@ -194,8 +195,8 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
bool Changed;
do {
- DEBUG(dbgs() << "iteration " << ++Iteration);
- DEBUG(dbgs() << "==============\n");
+ LLVM_DEBUG(dbgs() << "iteration " << ++Iteration);
+ LLVM_DEBUG(dbgs() << "==============\n");
Changed = false;
for (size_t I = 0; I < N; ++I) {
@@ -239,20 +240,20 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
if (S.Kills != SavedKills) {
- DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
- << "\n");
- DEBUG(dump("S.Kills", S.Kills));
- DEBUG(dump("SavedKills", SavedKills));
+ LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
+ << "\n");
+ LLVM_DEBUG(dump("S.Kills", S.Kills));
+ LLVM_DEBUG(dump("SavedKills", SavedKills));
}
if (S.Consumes != SavedConsumes) {
- DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
- DEBUG(dump("S.Consume", S.Consumes));
- DEBUG(dump("SavedCons", SavedConsumes));
+ LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
+ LLVM_DEBUG(dump("S.Consume", S.Consumes));
+ LLVM_DEBUG(dump("SavedCons", SavedConsumes));
}
}
}
} while (Changed);
- DEBUG(dump());
+ LLVM_DEBUG(dump());
}
#undef DEBUG_TYPE // "coro-suspend-crossing"
@@ -263,8 +264,9 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
namespace {
class Spill {
- Value *Def;
- Instruction *User;
+ Value *Def = nullptr;
+ Instruction *User = nullptr;
+ unsigned FieldNo = 0;
public:
Spill(Value *Def, llvm::User *U) : Def(Def), User(cast<Instruction>(U)) {}
@@ -272,6 +274,20 @@ public:
Value *def() const { return Def; }
Instruction *user() const { return User; }
BasicBlock *userBlock() const { return User->getParent(); }
+
+ // Note that field index is stored in the first SpillEntry for a particular
+  // definition. Subsequent mentions of a definition do not have fieldNo
+ // assigned. This works out fine as the users of Spills capture the info about
+ // the definition the first time they encounter it. Consider refactoring
+ // SpillInfo into two arrays to normalize the spill representation.
+ unsigned fieldIndex() const {
+ assert(FieldNo && "Accessing unassigned field");
+ return FieldNo;
+ }
+ void setFieldIndex(unsigned FieldNumber) {
+ assert(!FieldNo && "Reassigning field number");
+ FieldNo = FieldNumber;
+ }
};
} // namespace
@@ -294,6 +310,57 @@ static void dump(StringRef Title, SpillInfo const &Spills) {
}
#endif
+namespace {
+// We cannot rely solely on natural alignment of a type when building a
+// coroutine frame: if the alignment specified on the Alloca instruction
+// differs from the natural alignment of the alloca type, we will need to
+// insert padding.
+struct PaddingCalculator {
+ const DataLayout &DL;
+ LLVMContext &Context;
+ unsigned StructSize = 0;
+
+ PaddingCalculator(LLVMContext &Context, DataLayout const &DL)
+ : DL(DL), Context(Context) {}
+
+ // Replicate the logic from IR/DataLayout.cpp to match field offset
+ // computation for LLVM structs.
+ void addType(Type *Ty) {
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ if ((StructSize & (TyAlign - 1)) != 0)
+ StructSize = alignTo(StructSize, TyAlign);
+
+ StructSize += DL.getTypeAllocSize(Ty); // Consume space for this data item.
+ }
+
+ void addTypes(SmallVectorImpl<Type *> const &Types) {
+ for (auto *Ty : Types)
+ addType(Ty);
+ }
+
+ unsigned computePadding(Type *Ty, unsigned ForcedAlignment) {
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ auto Natural = alignTo(StructSize, TyAlign);
+ auto Forced = alignTo(StructSize, ForcedAlignment);
+
+ // Return how many bytes of padding we need to insert.
+ if (Natural != Forced)
+ return std::max(Natural, Forced) - StructSize;
+
+ // Rely on natural alignment.
+ return 0;
+ }
+
+ // If padding is required, return the padding field type to insert.
+ ArrayType *getPaddingType(Type *Ty, unsigned ForcedAlignment) {
+ if (auto Padding = computePadding(Ty, ForcedAlignment))
+ return ArrayType::get(Type::getInt8Ty(Context), Padding);
+
+ return nullptr;
+ }
+};
+} // namespace
+
// Build a struct that will keep state for an active coroutine.
// struct f.frame {
// ResumeFnTy ResumeFnAddr;
@@ -305,6 +372,8 @@ static void dump(StringRef Title, SpillInfo const &Spills) {
static StructType *buildFrameType(Function &F, coro::Shape &Shape,
SpillInfo &Spills) {
LLVMContext &C = F.getContext();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PaddingCalculator Padder(C, DL);
SmallString<32> Name(F.getName());
Name.append(".Frame");
StructType *FrameTy = StructType::create(C, Name);
@@ -322,8 +391,10 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
Type::getIntNTy(C, IndexBits)};
Value *CurrentDef = nullptr;
+ Padder.addTypes(Types);
+
// Create an entry for every spilled value.
- for (auto const &S : Spills) {
+ for (auto &S : Spills) {
if (CurrentDef == S.def())
continue;
@@ -333,12 +404,22 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
continue;
Type *Ty = nullptr;
- if (auto *AI = dyn_cast<AllocaInst>(CurrentDef))
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentDef)) {
Ty = AI->getAllocatedType();
- else
+ if (unsigned AllocaAlignment = AI->getAlignment()) {
+ // If alignment is specified in alloca, see if we need to insert extra
+ // padding.
+ if (auto PaddingTy = Padder.getPaddingType(Ty, AllocaAlignment)) {
+ Types.push_back(PaddingTy);
+ Padder.addType(PaddingTy);
+ }
+ }
+ } else {
Ty = CurrentDef->getType();
-
+ }
+ S.setFieldIndex(Types.size());
Types.push_back(Ty);
+ Padder.addType(Ty);
}
FrameTy->setBody(Types);
@@ -399,7 +480,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
Value *CurrentValue = nullptr;
BasicBlock *CurrentBlock = nullptr;
Value *CurrentReload = nullptr;
- unsigned Index = coro::Shape::LastKnownField;
+ unsigned Index = 0; // The proper field number is read from the field definition.
// We need to keep track of any allocas that need "spilling"
// since they will live in the coroutine frame now, all access to them
@@ -414,6 +495,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
// Create a load instruction to reload the spilled value from the coroutine
// frame.
auto CreateReload = [&](Instruction *InsertBefore) {
+ assert(Index && "accessing unassigned field number");
Builder.SetInsertPoint(InsertBefore);
auto *G = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index,
CurrentValue->getName() +
@@ -431,7 +513,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
CurrentBlock = nullptr;
CurrentReload = nullptr;
- ++Index;
+ Index = E.fieldIndex();
if (auto *AI = dyn_cast<AllocaInst>(CurrentValue)) {
// Spilled AllocaInst will be replaced with GEP from the coroutine frame
@@ -739,6 +821,8 @@ static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
for (User *U : CurrentValue->users()) {
Instruction *I = cast<Instruction>(U);
if (!DT.dominates(CoroBegin, I)) {
+ LLVM_DEBUG(dbgs() << "will move: " << *I << "\n");
+
// TODO: Make this more robust. Currently if we run into a situation
// where simple instruction move won't work we panic and
// report_fatal_error.
@@ -748,7 +832,6 @@ static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
" dominated by CoroBegin");
}
- DEBUG(dbgs() << "will move: " << *I << "\n");
NeedsMoving.push_back(I);
}
}
@@ -823,7 +906,7 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
break;
// Rewrite materializable instructions to be materialized at the use point.
- DEBUG(dump("Materializations", Spills));
+ LLVM_DEBUG(dump("Materializations", Spills));
rewriteMaterializableInstructions(Builder, Spills);
Spills.clear();
}
@@ -853,7 +936,7 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
Spills.emplace_back(&I, U);
}
}
- DEBUG(dump("Spills", Spills));
+ LLVM_DEBUG(dump("Spills", Spills));
moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
Shape.FrameTy = buildFrameType(F, Shape, Spills);
Shape.FramePtr = insertSpills(Spills, Shape);
diff --git a/lib/Transforms/Coroutines/CoroInternal.h b/lib/Transforms/Coroutines/CoroInternal.h
index 1eac88dbac3a..8e690d649cf5 100644
--- a/lib/Transforms/Coroutines/CoroInternal.h
+++ b/lib/Transforms/Coroutines/CoroInternal.h
@@ -76,7 +76,6 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
DestroyField,
PromiseField,
IndexField,
- LastKnownField = IndexField
};
StructType *FrameTy;
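
To make the PaddingCalculator arithmetic from CoroFrame.cpp above concrete, here is a minimal standalone sketch of the same padding rule, with llvm::alignTo reduced to plain integer rounding; the 20-byte header and the align-16 alloca in main are hypothetical numbers, not taken from the pass:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Round Value up to the next multiple of Align (Align > 0).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Bytes of i8 padding needed before a field whose natural ABI alignment is
// NaturalAlign but which carries an explicit ForcedAlign request.
static uint64_t computePadding(uint64_t StructSize, uint64_t NaturalAlign,
                               uint64_t ForcedAlign) {
  uint64_t Natural = alignTo(StructSize, NaturalAlign);
  uint64_t Forced = alignTo(StructSize, ForcedAlign);
  // Padding is inserted only when the forced alignment diverges from the
  // offset that natural alignment alone would produce.
  return Natural != Forced ? std::max(Natural, Forced) - StructSize : 0;
}

int main() {
  // 20 bytes of header fields, then an i32 (natural alignment 4) spilled
  // from an 'alloca ..., align 16': 12 bytes of padding are required.
  std::cout << computePadding(20, 4, 16) << "\n"; // prints 12
}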
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index 8712ca4823c6..49acc5e93a39 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -59,7 +60,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <cstddef>
@@ -250,7 +250,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType());
Function *NewF =
- Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage,
+ Function::Create(FnTy, GlobalValue::LinkageTypes::ExternalLinkage,
F.getName() + Suffix, M);
NewF->addParamAttr(0, Attribute::NonNull);
NewF->addParamAttr(0, Attribute::NoAlias);
@@ -265,6 +265,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
SmallVector<ReturnInst *, 4> Returns;
CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
+ NewF->setLinkage(GlobalValue::LinkageTypes::InternalLinkage);
// Remove old returns.
for (ReturnInst *Return : Returns)
@@ -440,16 +441,14 @@ static void
scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock,
DenseMap<Value *, Value *> &ResolvedValues) {
auto *PrevBB = Prev->getParent();
- auto *I = &*NewBlock->begin();
- while (auto PN = dyn_cast<PHINode>(I)) {
- auto V = PN->getIncomingValueForBlock(PrevBB);
+ for (PHINode &PN : NewBlock->phis()) {
+ auto V = PN.getIncomingValueForBlock(PrevBB);
// See if we already resolved it.
auto VI = ResolvedValues.find(V);
if (VI != ResolvedValues.end())
V = VI->second;
// Remember the value.
- ResolvedValues[PN] = V;
- I = I->getNextNode();
+ ResolvedValues[&PN] = V;
}
}
@@ -655,13 +654,28 @@ getNotRelocatableInstructions(CoroBeginInst *CoroBegin,
// set.
do {
Instruction *Current = Work.pop_back_val();
+ LLVM_DEBUG(dbgs() << "CoroSplit: Will not relocate: " << *Current << "\n");
DoNotRelocate.insert(Current);
for (Value *U : Current->operands()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
- if (isa<AllocaInst>(U))
+
+ if (auto *A = dyn_cast<AllocaInst>(I)) {
+ // Stores to alloca instructions that occur before the coroutine frame
+ // is allocated should not be moved; the stored values may be used by
+ // the coroutine frame allocator. The operands to those stores must also
+ // remain in place.
+ for (const auto &User : A->users())
+ if (auto *SI = dyn_cast<llvm::StoreInst>(User))
+ if (RelocBlocks.count(SI->getParent()) != 0 &&
+ DoNotRelocate.count(SI) == 0) {
+ Work.push_back(SI);
+ DoNotRelocate.insert(SI);
+ }
continue;
+ }
+
if (DoNotRelocate.count(I) == 0) {
Work.push_back(I);
DoNotRelocate.insert(I);
@@ -836,8 +850,8 @@ struct CoroSplit : public CallGraphSCCPass {
for (Function *F : Coroutines) {
Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR);
StringRef Value = Attr.getValueAsString();
- DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
- << "' state: " << Value << "\n");
+ LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
+ << "' state: " << Value << "\n");
if (Value == UNPREPARED_FOR_SPLIT) {
prepareForSplit(*F, CG);
continue;
diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp
index 10411c1bd65d..731faeb5dce4 100644
--- a/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/lib/Transforms/Coroutines/Coroutines.cpp
@@ -11,12 +11,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines.h"
#include "CoroInstr.h"
#include "CoroInternal.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -31,10 +33,8 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Coroutines.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstddef>
#include <utility>
@@ -125,9 +125,10 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
static const char *const CoroIntrinsics[] = {
"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy",
"llvm.coro.done", "llvm.coro.end", "llvm.coro.frame",
- "llvm.coro.free", "llvm.coro.id", "llvm.coro.param",
- "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.save",
- "llvm.coro.size", "llvm.coro.subfn.addr", "llvm.coro.suspend",
+ "llvm.coro.free", "llvm.coro.id", "llvm.coro.noop",
+ "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume",
+ "llvm.coro.save", "llvm.coro.size", "llvm.coro.subfn.addr",
+ "llvm.coro.suspend",
};
return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1;
}
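
The llvm.coro.noop entry above is spliced into the table in sorted position because Intrinsic::lookupLLVMIntrinsicByName performs a binary search over the name table. A rough standalone sketch of that kind of sorted-table lookup, with an abbreviated table; this is an illustration, not the actual implementation:

#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Illustrative sorted name table; the real table holds the full list.
static const std::vector<const char *> Names = {
    "llvm.coro.begin", "llvm.coro.noop", "llvm.coro.suspend"};

static int lookupSortedName(const std::string &Name) {
  auto It = std::lower_bound(Names.begin(), Names.end(), Name,
                             [](const char *Entry, const std::string &N) {
                               return std::strcmp(Entry, N.c_str()) < 0;
                             });
  if (It != Names.end() && Name == *It)
    return static_cast<int>(It - Names.begin());
  return -1; // mirrors the `!= -1` membership check in the pass
}

int main() {
  std::cout << lookupSortedName("llvm.coro.noop") << "\n"; // 1
  std::cout << lookupSortedName("llvm.coro.free") << "\n"; // -1
}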
diff --git a/lib/Transforms/IPO/AlwaysInliner.cpp b/lib/Transforms/IPO/AlwaysInliner.cpp
index a4bbc99b1f90..3b735ddd192e 100644
--- a/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -50,7 +50,8 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, ModuleAnalysisManager &) {
for (CallSite CS : Calls)
// FIXME: We really shouldn't be able to fail to inline at this point!
// We should do something to log or check the inline failures here.
- Changed |= InlineFunction(CS, IFI);
+ Changed |=
+ InlineFunction(CS, IFI, /*CalleeAAR=*/nullptr, InsertLifetime);
// Remember to try and delete this function afterward. This both avoids
// re-walking the rest of the module and avoids dealing with any iterator
@@ -129,7 +130,7 @@ Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
return new AlwaysInlinerLegacyPass(InsertLifetime);
}
-/// \brief Get the inline cost for the always-inliner.
+/// Get the inline cost for the always-inliner.
///
/// The always inliner *only* handles functions which are marked with the
/// attribute to force inlining. As such, it is dramatically simpler and avoids
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index b25cbcad3b9d..f2c2b55b1c5b 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -220,8 +220,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
NF->setSubprogram(F->getSubprogram());
F->setSubprogram(nullptr);
- DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
+ LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
// Recompute the parameter attributes list based on the new arguments for
// the function.
@@ -426,8 +426,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
I2->setName(I->getName() + ".val");
LI->replaceAllUsesWith(&*I2);
LI->eraseFromParent();
- DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
} else {
GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
IndicesVector Operands;
@@ -453,8 +453,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
NewName += ".val";
TheArg->setName(NewName);
- DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
// All of the uses must be load instructions. Replace them all with
// the argument specified by ArgNo.
@@ -688,11 +688,11 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
// to do.
if (ToPromote.find(Operands) == ToPromote.end()) {
if (MaxElements > 0 && ToPromote.size() == MaxElements) {
- DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName()
- << "' because it would require adding more "
- << "than " << MaxElements
- << " arguments to the function.\n");
+ LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
// We limit aggregate promotion to only promoting up to a fixed number
// of elements of the aggregate.
return false;
@@ -738,7 +738,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return true;
}
-/// \brief Checks if a type could have padding bytes.
+/// Checks if a type could have padding bytes.
static bool isDenselyPacked(Type *type, const DataLayout &DL) {
// There is no size information, so be conservative.
if (!type->isSized())
@@ -772,7 +772,7 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) {
return true;
}
-/// \brief Checks if the padding bytes of an argument could be accessed.
+/// Checks if the padding bytes of an argument could be accessed.
static bool canPaddingBeAccessed(Argument *arg) {
assert(arg->hasByValAttr());
@@ -817,6 +817,12 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
unsigned MaxElements,
Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
ReplaceCallSite) {
+ // Don't perform argument promotion for naked functions; otherwise we can
+ // end up removing parameters that are seemingly 'not used', even though
+ // they are referred to in the assembly.
+ if (F->hasFnAttribute(Attribute::Naked))
+ return nullptr;
+
// Make sure that it is local to this module.
if (!F->hasLocalLinkage())
return nullptr;
@@ -847,10 +853,20 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
return nullptr;
+ // Can't change the signature of a musttail callee.
+ if (CS.isMustTailCall())
+ return nullptr;
+
if (CS.getInstruction()->getParent()->getParent() == F)
isSelfRecursive = true;
}
+ // Can't change the signature of a musttail caller.
+ // FIXME: Support promoting a whole chain of musttail functions.
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return nullptr;
+
const DataLayout &DL = F->getParent()->getDataLayout();
AAResults &AAR = AARGetter(*F);
@@ -885,11 +901,11 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
if (isSafeToPromote) {
if (StructType *STy = dyn_cast<StructType>(AgTy)) {
if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName()
- << "' because it would require adding more"
- << " than " << MaxElements
- << " arguments to the function.\n");
+ LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
continue;
}
@@ -963,7 +979,7 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
return FAM.getResult<AAManager>(F);
};
- Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+ Function *NewF = promoteArguments(&OldF, AARGetter, MaxElements, None);
if (!NewF)
continue;
LocalChange = true;
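
The two musttail bail-outs added above boil down to one question: is the signature frozen by a musttail call on either side? A minimal sketch of that check, assuming a Function &F and a CallSite CS as in the surrounding pass (the helper name is ours, not part of the patch):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool signatureFrozenByMustTail(Function &F, CallSite CS) {
  // A musttail caller must pass its own arguments through unchanged, so the
  // callee's signature cannot be rewritten.
  if (CS.isMustTailCall())
    return true;
  // Likewise, if F itself ends a block in a musttail call, its signature
  // must keep matching that callee's.
  for (BasicBlock &BB : F)
    if (BB.getTerminatingMustTailCall())
      return true;
  return false;
}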
diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp
index 6af104362594..05fc3dd6950c 100644
--- a/lib/Transforms/IPO/BarrierNoopPass.cpp
+++ b/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -23,7 +23,7 @@
using namespace llvm;
namespace {
-/// \brief A nonce module pass used to place a barrier in a pass manager.
+/// A nonce module pass used to place a barrier in a pass manager.
///
/// There is no mechanism for ending a CGSCC pass manager once one is started.
/// This prevents extension points from having clear deterministic ordering
diff --git a/lib/Transforms/IPO/BlockExtractor.cpp b/lib/Transforms/IPO/BlockExtractor.cpp
new file mode 100644
index 000000000000..ff5ee817da49
--- /dev/null
+++ b/lib/Transforms/IPO/BlockExtractor.cpp
@@ -0,0 +1,176 @@
+//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts the specified basic blocks from the module into their
+// own functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "block-extractor"
+
+STATISTIC(NumExtracted, "Number of basic blocks extracted");
+
+static cl::opt<std::string> BlockExtractorFile(
+ "extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
+
+cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
+ cl::desc("Erase the existing functions"),
+ cl::Hidden);
+
+namespace {
+class BlockExtractor : public ModulePass {
+ SmallVector<BasicBlock *, 16> Blocks;
+ bool EraseFunctions;
+ SmallVector<std::pair<std::string, std::string>, 32> BlocksByName;
+
+public:
+ static char ID;
+ BlockExtractor(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
+ bool EraseFunctions)
+ : ModulePass(ID), Blocks(BlocksToExtract.begin(), BlocksToExtract.end()),
+ EraseFunctions(EraseFunctions) {
+ if (!BlockExtractorFile.empty())
+ loadFile();
+ }
+ BlockExtractor() : BlockExtractor(SmallVector<BasicBlock *, 0>(), false) {}
+ bool runOnModule(Module &M) override;
+
+private:
+ void loadFile();
+ void splitLandingPadPreds(Function &F);
+};
+} // end anonymous namespace
+
+char BlockExtractor::ID = 0;
+INITIALIZE_PASS(BlockExtractor, "extract-blocks",
+ "Extract basic blocks from module", false, false)
+
+ModulePass *llvm::createBlockExtractorPass() { return new BlockExtractor(); }
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
+ return new BlockExtractor(BlocksToExtract, EraseFunctions);
+}
+
+/// Gets all of the blocks specified in the input file.
+void BlockExtractor::loadFile() {
+ auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
+ if (ErrOrBuf.getError())
+ report_fatal_error("BlockExtractor couldn't load the file.");
+ // Read the file.
+ auto &Buf = *ErrOrBuf;
+ SmallVector<StringRef, 16> Lines;
+ Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ for (const auto &Line : Lines) {
+ auto FBPair = Line.split(' ');
+ BlocksByName.push_back({FBPair.first, FBPair.second});
+ }
+}
+
+/// Splits the landing pads to make sure each of them has only one
+/// predecessor.
+void BlockExtractor::splitLandingPadPreds(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (!isa<InvokeInst>(&I))
+ continue;
+ InvokeInst *II = cast<InvokeInst>(&I);
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (auto PredBB : predecessors(LPad)) {
+ if (PredBB->isLandingPad() && PredBB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split)
+ continue;
+
+ SmallVector<BasicBlock *, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
+ }
+ }
+}
+
+bool BlockExtractor::runOnModule(Module &M) {
+
+ bool Changed = false;
+
+ // Get all the functions.
+ SmallVector<Function *, 4> Functions;
+ for (Function &F : M) {
+ splitLandingPadPreds(F);
+ Functions.push_back(&F);
+ }
+
+ // Get all the blocks specified in the input file.
+ for (const auto &BInfo : BlocksByName) {
+ Function *F = M.getFunction(BInfo.first);
+ if (!F)
+ report_fatal_error("Invalid function name specified in the input file");
+ auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
+ return BB.getName().equals(BInfo.second);
+ });
+ if (Res == F->end())
+ report_fatal_error("Invalid block name specified in the input file");
+ Blocks.push_back(&*Res);
+ }
+
+ // Extract basic blocks.
+ for (BasicBlock *BB : Blocks) {
+ // Check if the module contains BB.
+ if (BB->getParent()->getParent() != &M)
+ report_fatal_error("Invalid basic block");
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
+ << BB->getParent()->getName() << ":" << BB->getName()
+ << "\n");
+ SmallVector<BasicBlock *, 2> BlocksToExtractVec;
+ BlocksToExtractVec.push_back(BB);
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ CodeExtractor(BlocksToExtractVec).extractCodeRegion();
+ ++NumExtracted;
+ Changed = true;
+ }
+
+ // Erase the functions.
+ if (EraseFunctions || BlockExtractorEraseFuncs) {
+ for (Function *F : Functions) {
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
+ << "\n");
+ F->deleteBody();
+ }
+ // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
+ for (Function &F : M)
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
+ }
+
+ return Changed;
+}
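
Since loadFile() above splits the buffer on newlines and then splits each line at the first space, the -extract-blocks-file input is one function/block pair per line. A hypothetical input (the names must match the IR function and basic-block names exactly, or the pass calls report_fatal_error):

    main entry
    main if.then
    compute for.body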
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 397561746f86..4772baf5976c 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMipo
AlwaysInliner.cpp
ArgumentPromotion.cpp
BarrierNoopPass.cpp
+ BlockExtractor.cpp
CalledValuePropagation.cpp
ConstantMerge.cpp
CrossDSOCFI.cpp
@@ -27,8 +28,10 @@ add_llvm_library(LLVMipo
PassManagerBuilder.cpp
PruneEH.cpp
SampleProfile.cpp
+ SCCP.cpp
StripDeadPrototypes.cpp
StripSymbols.cpp
+ SyntheticCountsPropagation.cpp
ThinLTOBitcodeWriter.cpp
WholeProgramDevirt.cpp
diff --git a/lib/Transforms/IPO/CalledValuePropagation.cpp b/lib/Transforms/IPO/CalledValuePropagation.cpp
index c5f6336aa2be..d642445b35de 100644
--- a/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -69,12 +69,15 @@ public:
CVPLatticeVal() : LatticeState(Undefined) {}
CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
- CVPLatticeVal(std::set<Function *, Compare> &&Functions)
- : LatticeState(FunctionSet), Functions(Functions) {}
+ CVPLatticeVal(std::vector<Function *> &&Functions)
+ : LatticeState(FunctionSet), Functions(std::move(Functions)) {
+ assert(std::is_sorted(this->Functions.begin(), this->Functions.end(),
+ Compare()));
+ }
/// Get a reference to the functions held by this lattice value. The number
/// of functions will be zero for states other than FunctionSet.
- const std::set<Function *, Compare> &getFunctions() const {
+ const std::vector<Function *> &getFunctions() const {
return Functions;
}
@@ -99,7 +102,8 @@ private:
/// MaxFunctionsPerValue. Since most LLVM values are expected to be in
/// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
/// small and efficiently copyable.
- std::set<Function *, Compare> Functions;
+ // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
+ std::vector<Function *> Functions;
};
/// The custom lattice function used by the generic sparse propagation solver.
@@ -150,11 +154,10 @@ public:
return getOverdefinedVal();
if (X == getUndefVal() && Y == getUndefVal())
return getUndefVal();
- std::set<Function *, CVPLatticeVal::Compare> Union;
+ std::vector<Function *> Union;
std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
Y.getFunctions().begin(), Y.getFunctions().end(),
- std::inserter(Union, Union.begin()),
- CVPLatticeVal::Compare{});
+ std::back_inserter(Union), CVPLatticeVal::Compare{});
if (Union.size() > MaxFunctionsPerValue)
return getOverdefinedVal();
return CVPLatticeVal(std::move(Union));
@@ -265,6 +268,10 @@ private:
// If we can't track the function's return values, there's nothing to do.
if (!F || !canTrackReturnsInterprocedurally(F)) {
+ // Void return: no need to create and update CVPLattice state, as no one
+ // can use it.
+ if (I->getType()->isVoidTy())
+ return;
ChangedValues[RegI] = getOverdefinedVal();
return;
}
@@ -280,6 +287,12 @@ private:
ChangedValues[RegFormal] =
MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
}
+
+ // Void return: no need to create and update CVPLattice state, as no one
+ // can use it.
+ if (I->getType()->isVoidTy())
+ return;
+
ChangedValues[RegI] =
MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
}
@@ -377,8 +390,7 @@ static bool runCVP(Module &M) {
CVPLatticeVal LV = Solver.getExistingValueState(RegI);
if (!LV.isFunctionSet() || LV.getFunctions().empty())
continue;
- MDNode *Callees = MDB.createCallees(SmallVector<Function *, 4>(
- LV.getFunctions().begin(), LV.getFunctions().end()));
+ MDNode *Callees = MDB.createCallees(LV.getFunctions());
C->setMetadata(LLVMContext::MD_callees, Callees);
Changed = true;
}
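
The switch from std::set to a sorted std::vector above relies on std::set_union preserving order when both inputs are already sorted. A standalone sketch of the capped merge, with plain ints standing in for Function pointers and an illustrative cap:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

static const size_t MaxFunctionsPerValue = 4; // illustrative cap

// Returns false when the union exceeds the cap, i.e. the lattice value
// would go overdefined, mirroring MergeValues above.
static bool mergeSorted(const std::vector<int> &X, const std::vector<int> &Y,
                        std::vector<int> &Out) {
  std::set_union(X.begin(), X.end(), Y.begin(), Y.end(),
                 std::back_inserter(Out));
  return Out.size() <= MaxFunctionsPerValue;
}

int main() {
  std::vector<int> Out;
  bool Ok = mergeSorted({1, 3, 5}, {3, 4}, Out); // Out = {1, 3, 4, 5}
  std::cout << Ok << " " << Out.size() << "\n";  // prints "1 4"
}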
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index 886029ea58d5..666f6cc37bfd 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -162,9 +162,6 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
}
bool CrossDSOCFI::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
VeryLikelyWeights =
MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 5446541550e5..31e771da3bd3 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -240,8 +240,11 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
I2->takeName(&*I);
}
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(Fn.getSubprogram());
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ Fn.getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
// Fix up any BlockAddresses that refer to the function.
Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
@@ -507,25 +510,43 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
// MaybeLive. Initialized to a list of RetCount empty lists.
RetUses MaybeLiveRetUses(RetCount);
- for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+ bool HasMustTailCalls = false;
+
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
!= F.getFunctionType()->getReturnType()) {
// We don't support old style multiple return values.
MarkLive(F);
return;
}
+ }
+
+ // If we have any returns of `musttail` results, the signature can't
+ // change.
+ if (BB->getTerminatingMustTailCall() != nullptr)
+ HasMustTailCalls = true;
+ }
+
+ if (HasMustTailCalls) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail calls\n");
+ }
if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
MarkLive(F);
return;
}
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
// Keep track of the number of live retvals, so we can skip checks once all
// of them turn out to be live.
unsigned NumLiveRetVals = 0;
+
+ bool HasMustTailCallers = false;
+
// Loop all uses of the function.
for (const Use &U : F.uses()) {
// If the function is PASSED IN as an argument, its address has been
@@ -536,6 +557,11 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
return;
}
+ // The number of arguments for a `musttail` call must match the number of
+ // arguments of the caller.
+ if (CS.isMustTailCall())
+ HasMustTailCallers = true;
+
// If this use is anything other than a call site, the function is alive.
const Instruction *TheCall = CS.getInstruction();
if (!TheCall) { // Not a direct call site?
@@ -580,12 +606,17 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
}
}
+ if (HasMustTailCallers) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail callers\n");
+ }
+
// Now we've inspected all callers, record the liveness of our return values.
for (unsigned i = 0; i != RetCount; ++i)
MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
// Now, check all of our arguments.
unsigned i = 0;
@@ -593,12 +624,19 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
for (Function::const_arg_iterator AI = F.arg_begin(),
E = F.arg_end(); AI != E; ++AI, ++i) {
Liveness Result;
- if (F.getFunctionType()->isVarArg()) {
+ if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
+ HasMustTailCalls) {
// Variadic functions will already have a va_arg function expanded inside
// them, making them potentially very sensitive to ABI changes resulting
// from removing arguments entirely, so don't. For example AArch64 handles
// register and stack HFAs very differently, and this is reflected in the
// IR which has already been generated.
+ //
+ // `musttail` calls to this function restrict argument removal attempts.
+ // The signature of the caller must match the signature of the function.
+ //
+ // `musttail` calls in this function prevent us from changing its
+ // signature.
Result = Live;
} else {
// See what the effect of this use is (recording any uses that cause
@@ -637,8 +675,8 @@ void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
/// mark any values that are used as this function's parameters or by its return
/// values (according to Uses) live as well.
void DeadArgumentEliminationPass::MarkLive(const Function &F) {
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
// Mark the function as live.
LiveFunctions.insert(&F);
// Mark all arguments as live.
@@ -659,8 +697,8 @@ void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
if (!LiveValues.insert(RA).second)
return; // We were already marked Live.
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
- << RA.getDescription() << " live\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
PropagateLiveness(RA);
}
@@ -718,9 +756,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
} else {
++NumArgumentsEliminated;
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
- << " (" << I->getName() << ") from " << F->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
+ << i << " (" << I->getName() << ") from "
+ << F->getName() << "\n");
}
}
@@ -763,8 +801,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
NewRetIdxs[i] = RetTypes.size() - 1;
} else {
++NumRetValsEliminated;
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing return value "
- << i << " from " << F->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << i << " from " << F->getName() << "\n");
}
}
if (RetTypes.size() > 1) {
@@ -803,10 +842,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+ // Strip allocsize attributes. They might refer to the deleted arguments.
+ AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
// Reconstruct the AttributesList based on the vector we constructed.
assert(ArgAttrVec.size() == Params.size());
- AttributeList NewPAL = AttributeList::get(
- F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+ AttributeList NewPAL =
+ AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -875,8 +918,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Reconstruct the AttributesList based on the vector we constructed.
assert(ArgAttrVec.size() == Args.size());
+
+ // Again, be sure to remove any allocsize attributes, since their indices
+ // may now be incorrect.
+ AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
AttributeList NewCallPAL = AttributeList::get(
- F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+ F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
@@ -1017,8 +1066,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
BB.getInstList().erase(RI);
}
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(F->getSubprogram());
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
// Now that the old function is dead, delete it.
F->eraseFromParent();
@@ -1034,7 +1086,7 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
// removed. We can do this if they never call va_start. This loop cannot be
// fused with the next loop, because deleting a function invalidates
// information computed while surveying other functions.
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
Function &F = *I++;
if (F.getFunctionType()->isVarArg())
@@ -1045,7 +1097,7 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
// We assume all arguments are dead unless proven otherwise (allowing us to
// determine that dead arguments passed into recursive functions are dead).
//
- DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
for (auto &F : M)
SurveyFunction(F);
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 042cacb70ad0..d45a88323910 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -51,7 +51,7 @@ static void makeVisible(GlobalValue &GV, bool Delete) {
}
namespace {
- /// @brief A pass to extract specific global values and their dependencies.
+ /// A pass to extract specific global values and their dependencies.
class GVExtractorPass : public ModulePass {
SetVector<GlobalValue *> Named;
bool deleteStuff;
diff --git a/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 325a5d77aadb..37273f975417 100644
--- a/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -42,8 +42,10 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) {
.Case("nonlazybind", Attribute::NonLazyBind)
.Case("noredzone", Attribute::NoRedZone)
.Case("noreturn", Attribute::NoReturn)
+ .Case("nocf_check", Attribute::NoCfCheck)
.Case("norecurse", Attribute::NoRecurse)
.Case("nounwind", Attribute::NoUnwind)
+ .Case("optforfuzzing", Attribute::OptForFuzzing)
.Case("optnone", Attribute::OptimizeNone)
.Case("optsize", Attribute::OptimizeForSize)
.Case("readnone", Attribute::ReadNone)
@@ -51,6 +53,7 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) {
.Case("argmemonly", Attribute::ArgMemOnly)
.Case("returns_twice", Attribute::ReturnsTwice)
.Case("safestack", Attribute::SafeStack)
+ .Case("shadowcallstack", Attribute::ShadowCallStack)
.Case("sanitize_address", Attribute::SanitizeAddress)
.Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
.Case("sanitize_memory", Attribute::SanitizeMemory)
@@ -72,8 +75,8 @@ static void addForcedAttributes(Function &F) {
auto Kind = parseAttrKind(KV.second);
if (Kind == Attribute::None) {
- DEBUG(dbgs() << "ForcedAttribute: " << KV.second
- << " unknown or not handled!\n");
+ LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
+ << " unknown or not handled!\n");
continue;
}
if (F.hasFnAttribute(Kind))
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 5352e32479bb..2797da6c0abd 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -74,6 +73,7 @@ STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
+STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
// FIXME: This is disabled by default to avoid exposing security vulnerabilities
// in C/C++ code compiled by clang:
@@ -83,6 +83,10 @@ static cl::opt<bool> EnableNonnullArgPropagation(
cl::desc("Try to propagate nonnull argument attributes from callsites to "
"caller functions."));
+static cl::opt<bool> DisableNoUnwindInference(
+ "disable-nounwind-inference", cl::Hidden,
+ cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
+
namespace {
using SCCNodeSet = SmallSetVector<Function *, 8>;
@@ -401,7 +405,7 @@ static Attribute::AttrKind
determinePointerReadAttrs(Argument *A,
const SmallPtrSet<Argument *, 8> &SCCNodes) {
SmallVector<Use *, 32> Worklist;
- SmallSet<Use *, 32> Visited;
+ SmallPtrSet<Use *, 32> Visited;
// inalloca arguments are always clobbered by the call.
if (A->hasInAllocaAttr())
@@ -1008,7 +1012,8 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
if (!Speculative) {
// Mark the function eagerly since we may discover a function
// which prevents us from speculating about the entire SCC
- DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
+ LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
+ << " as nonnull\n");
F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
@@ -1027,7 +1032,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
!F->getReturnType()->isPointerTy())
continue;
- DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+ LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
@@ -1037,49 +1042,214 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
return MadeChange;
}
-/// Remove the convergent attribute from all functions in the SCC if every
-/// callsite within the SCC is not convergent (except for calls to functions
-/// within the SCC). Returns true if changes were made.
-static bool removeConvergentAttrs(const SCCNodeSet &SCCNodes) {
- // For every function in SCC, ensure that either
- // * it is not convergent, or
- // * we can remove its convergent attribute.
- bool HasConvergentFn = false;
+namespace {
+
+/// Collects a set of attribute inference requests and performs them all in one
+/// go on a single SCC Node. Inference involves scanning function bodies
+/// looking for instructions that violate attribute assumptions.
+/// As soon as all the bodies are fine we are free to set the attribute.
+/// Customization of inference for individual attributes is performed by
+/// providing a handful of predicates for each attribute.
+class AttributeInferer {
+public:
+ /// Describes a request for inference of a single attribute.
+ struct InferenceDescriptor {
+
+ /// Returns true if this function does not have to be handled.
+ /// The general intent of this predicate is to provide an optimization
+ /// for functions that do not need this attribute inference at all
+ /// (say, for functions that already have the attribute).
+ std::function<bool(const Function &)> SkipFunction;
+
+ /// Returns true if this instruction violates attribute assumptions.
+ std::function<bool(Instruction &)> InstrBreaksAttribute;
+
+ /// Sets the inferred attribute for this function.
+ std::function<void(Function &)> SetAttribute;
+
+ /// Attribute we derive.
+ Attribute::AttrKind AKind;
+
+ /// If true, only "exact" definitions can be used to infer this attribute.
+ /// See GlobalValue::isDefinitionExact.
+ bool RequiresExactDefinition;
+
+ InferenceDescriptor(Attribute::AttrKind AK,
+ std::function<bool(const Function &)> SkipFunc,
+ std::function<bool(Instruction &)> InstrScan,
+ std::function<void(Function &)> SetAttr,
+ bool ReqExactDef)
+ : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
+ SetAttribute(SetAttr), AKind(AK),
+ RequiresExactDefinition(ReqExactDef) {}
+ };
+
+private:
+ SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
+
+public:
+ void registerAttrInference(InferenceDescriptor AttrInference) {
+ InferenceDescriptors.push_back(AttrInference);
+ }
+
+ bool run(const SCCNodeSet &SCCNodes);
+};
+
+/// Perform all the requested attribute inference actions according to the
+/// attribute predicates stored before.
+bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
+ SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
+ // Go through all the functions in SCC and check corresponding attribute
+ // assumptions for each of them. Attributes that are invalid for this SCC
+ // will be removed from InferInSCC.
for (Function *F : SCCNodes) {
- if (!F->isConvergent()) continue;
- HasConvergentFn = true;
- // Can't remove convergent from function declarations.
- if (F->isDeclaration()) return false;
+ // No attributes whose assumptions are still valid - done.
+ if (InferInSCC.empty())
+ return false;
- // Can't remove convergent if any of our functions has a convergent call to a
- // function not in the SCC.
- for (Instruction &I : instructions(*F)) {
- CallSite CS(&I);
- // Bail if CS is a convergent call to a function not in the SCC.
- if (CS && CS.isConvergent() &&
- SCCNodes.count(CS.getCalledFunction()) == 0)
+ // Check if our attributes ever need scanning/can be scanned.
+ llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
+ if (ID.SkipFunction(*F))
return false;
+
+ // Remove from further inference (invalidate) when visiting a function
+ // that has no instructions to scan or has an unsuitable definition.
+ return F->isDeclaration() ||
+ (ID.RequiresExactDefinition && !F->hasExactDefinition());
+ });
+
+ // For each attribute still in InferInSCC that doesn't explicitly skip F,
+ // set up the F instructions scan to verify assumptions of the attribute.
+ SmallVector<InferenceDescriptor, 4> InferInThisFunc;
+ llvm::copy_if(
+ InferInSCC, std::back_inserter(InferInThisFunc),
+ [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
+
+ if (InferInThisFunc.empty())
+ continue;
+
+ // Start instruction scan.
+ for (Instruction &I : instructions(*F)) {
+ llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
+ if (!ID.InstrBreaksAttribute(I))
+ return false;
+ // Remove attribute from further inference on any other functions
+ // because attribute assumptions have just been violated.
+ llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
+ return D.AKind == ID.AKind;
+ });
+ // Remove attribute from the rest of current instruction scan.
+ return true;
+ });
+
+ if (InferInThisFunc.empty())
+ break;
}
}
- // If the SCC doesn't have any convergent functions, we have nothing to do.
- if (!HasConvergentFn) return false;
+ if (InferInSCC.empty())
+ return false;
- // If we got here, all of the calls the SCC makes to functions not in the SCC
- // are non-convergent. Therefore all of the SCC's functions can also be made
- // non-convergent. We'll remove the attr from the callsites in
- // InstCombineCalls.
- for (Function *F : SCCNodes) {
- if (!F->isConvergent()) continue;
+ bool Changed = false;
+ for (Function *F : SCCNodes)
+ // At this point InferInSCC contains only functions that were either:
+ // - explicitly skipped from scan/inference, or
+ // - verified to have no instructions that break attribute assumptions.
+ // Hence we just go and force the attribute for all non-skipped functions.
+ for (auto &ID : InferInSCC) {
+ if (ID.SkipFunction(*F))
+ continue;
+ Changed = true;
+ ID.SetAttribute(*F);
+ }
+ return Changed;
+}
- DEBUG(dbgs() << "Removing convergent attr from fn " << F->getName()
- << "\n");
- F->setNotConvergent();
+} // end anonymous namespace
+
+/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonConvergent(Instruction &I,
+ const SCCNodeSet &SCCNodes) {
+ const CallSite CS(&I);
+ // Breaks non-convergent assumption if CS is a convergent call to a function
+ // not in the SCC.
+ return CS && CS.isConvergent() && SCCNodes.count(CS.getCalledFunction()) == 0;
+}
+
+/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
+ if (!I.mayThrow())
+ return false;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // I is a may-throw call to a function inside our SCC. This doesn't
+ // invalidate our current working assumption that the SCC is no-throw; we
+ // just have to scan that other function.
+ if (SCCNodes.count(Callee) > 0)
+ return false;
+ }
}
return true;
}
+/// Infer attributes from all functions in the SCC by scanning every
+/// instruction for compliance to the attribute assumptions. Currently it
+/// does:
+/// - removal of Convergent attribute
+/// - addition of NoUnwind attribute
+///
+/// Returns true if any changes to function attributes were made.
+static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
+
+ AttributeInferer AI;
+
+ // Request to remove the convergent attribute from all functions in the SCC
+ // if every callsite within the SCC is not convergent (except for calls
+ // to functions within the SCC).
+ // Note: Removal of the attr from the callsites will happen in
+ // InstCombineCalls separately.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::Convergent,
+ // Skip non-convergent functions.
+ [](const Function &F) { return !F.isConvergent(); },
+ // Instructions that break non-convergent assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonConvergent(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
+ << "\n");
+ F.setNotConvergent();
+ },
+ /* RequiresExactDefinition= */ false});
+
+ if (!DisableNoUnwindInference)
+ // Request to infer nounwind attribute for all the functions in the SCC if
+ // every callsite within the SCC is not throwing (except for calls to
+ // functions within the SCC). Note that the nounwind attribute suffers from
+ // derefinement: results may change depending on how functions are
+ // optimized. Thus it can be inferred only from exact definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoUnwind,
+ // Skip non-throwing functions.
+ [](const Function &F) { return F.doesNotThrow(); },
+ // Instructions that break non-throwing assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonThrowing(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nounwind attr to fn " << F.getName() << "\n");
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ // Perform all the requested attribute inference actions.
+ return AI.run(SCCNodes);
+}
+
static bool setDoesNotRecurse(Function &F) {
if (F.doesNotRecurse())
return false;
@@ -1136,7 +1306,8 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
bool HasUnknownCall = false;
for (LazyCallGraph::Node &N : C) {
Function &F = N.getFunction();
- if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ if (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ F.hasFnAttribute(Attribute::Naked)) {
// Treat any function we're trying not to optimize as if it were an
// indirect call and omit it from the node set used below.
HasUnknownCall = true;
@@ -1167,7 +1338,7 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
if (!HasUnknownCall) {
Changed |= addNoAliasAttrs(SCCNodes);
Changed |= addNonNullAttrs(SCCNodes);
- Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= inferAttrsFromFunctionBodies(SCCNodes);
Changed |= addNoRecurseAttrs(SCCNodes);
}
@@ -1221,7 +1392,8 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
bool ExternalNode = false;
for (CallGraphNode *I : SCC) {
Function *F = I->getFunction();
- if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
+ if (!F || F->hasFnAttribute(Attribute::OptimizeNone) ||
+ F->hasFnAttribute(Attribute::Naked)) {
// External node or function we're trying not to optimize - we both avoid
// transform them and avoid leveraging information they provide.
ExternalNode = true;
@@ -1244,7 +1416,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
if (!ExternalNode) {
Changed |= addNoAliasAttrs(SCCNodes);
Changed |= addNonNullAttrs(SCCNodes);
- Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= inferAttrsFromFunctionBodies(SCCNodes);
Changed |= addNoRecurseAttrs(SCCNodes);
}
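
New inferences plug into the AttributeInferer above by registering one more descriptor. A hypothetical fragment using norecurse purely as a placeholder (the pass actually derives norecurse separately, in addNoRecurseAttrs, not through this mechanism; the fragment assumes the surrounding pass's headers and types):

AttributeInferer AI;
AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
    Attribute::NoRecurse,
    // Skip functions that already carry the attribute.
    [](const Function &F) { return F.doesNotRecurse(); },
    // Toy rule: any call or invoke breaks the assumption.
    [](Instruction &I) { return isa<CallInst>(&I) || isa<InvokeInst>(&I); },
    [](Function &F) { F.setDoesNotRecurse(); },
    /* RequiresExactDefinition= */ true});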
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index b6d6201cd23b..15808a073894 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -18,8 +18,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/Constants.h"
@@ -61,6 +61,7 @@ using namespace llvm;
#define DEBUG_TYPE "function-import"
STATISTIC(NumImportedFunctions, "Number of functions imported");
+STATISTIC(NumImportedGlobalVars, "Number of global variables imported");
STATISTIC(NumImportedModules, "Number of modules imported from");
STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
STATISTIC(NumLiveSymbols, "Number of live symbols in index");
@@ -70,6 +71,10 @@ static cl::opt<unsigned> ImportInstrLimit(
"import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
cl::desc("Only import functions with less than N instructions"));
+static cl::opt<int> ImportCutoff(
+ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import first N functions if N>=0 (default -1)"));
+
static cl::opt<float>
ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
cl::Hidden, cl::value_desc("x"),
@@ -131,7 +136,7 @@ static cl::opt<bool>
static std::unique_ptr<Module> loadFile(const std::string &FileName,
LLVMContext &Context) {
SMDiagnostic Err;
- DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+ LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
// Metadata isn't loaded until functions are imported, to minimize
// the memory overhead.
std::unique_ptr<Module> Result =
@@ -163,6 +168,9 @@ selectCallee(const ModuleSummaryIndex &Index,
CalleeSummaryList,
[&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
auto *GVSummary = SummaryPtr.get();
+ if (!Index.isGlobalValueLive(GVSummary))
+ return false;
+
// For SamplePGO, in computeImportForFunction the OriginalId
// may have been used to locate the callee summary list (See
// comment there).
@@ -231,10 +239,37 @@ updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
// it, rather than needing to perform this mapping on each walk.
auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
if (GUID == 0)
- return nullptr;
+ return ValueInfo();
return Index.getValueInfo(GUID);
}
+static void computeImportForReferencedGlobals(
+ const FunctionSummary &Summary, const GVSummaryMapTy &DefinedGVSummaries,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
+ for (auto &VI : Summary.refs()) {
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(
+ dbgs() << "Ref ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
+
+ for (auto &RefSummary : VI.getSummaryList())
+ if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind &&
+ // Don't try to import regular LTO summaries added to dummy module.
+ !RefSummary->modulePath().empty() &&
+ !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
+ RefSummary->refs().empty()) {
+ ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+ if (ExportLists)
+ (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID());
+ break;
+ }
+ }
+}
+
/// Compute the list of functions to import for a given caller. Mark these
/// imported functions and the symbols they reference in their source module as
/// exported from their source module.
@@ -243,18 +278,28 @@ static void computeImportForFunction(
const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
SmallVectorImpl<EdgeInfo> &Worklist,
FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists,
+ FunctionImporter::ImportThresholdsTy &ImportThresholds) {
+ computeImportForReferencedGlobals(Summary, DefinedGVSummaries, ImportList,
+ ExportLists);
+ static int ImportCount = 0;
for (auto &Edge : Summary.calls()) {
ValueInfo VI = Edge.first;
- DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold
- << "\n");
+ LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
+ << "\n");
+
+ if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
+ LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
+ << " reached.\n");
+ continue;
+ }
VI = updateValueInfoForIndirectCalls(Index, VI);
if (!VI)
continue;
if (DefinedGVSummaries.count(VI.getGUID())) {
- DEBUG(dbgs() << "ignored! Target already in destination module.\n");
+ LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
continue;
}
@@ -269,20 +314,87 @@ static void computeImportForFunction(
};
const auto NewThreshold =
- Threshold * GetBonusMultiplier(Edge.second.Hotness);
-
- auto *CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
- Summary.modulePath());
- if (!CalleeSummary) {
- DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
- continue;
- }
+ Threshold * GetBonusMultiplier(Edge.second.getHotness());
+
+ auto IT = ImportThresholds.insert(
+ std::make_pair(VI.getGUID(), std::make_pair(NewThreshold, nullptr)));
+ bool PreviouslyVisited = !IT.second;
+ auto &ProcessedThreshold = IT.first->second.first;
+ auto &CalleeSummary = IT.first->second.second;
+
+ const FunctionSummary *ResolvedCalleeSummary = nullptr;
+ if (CalleeSummary) {
+ assert(PreviouslyVisited);
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back
+ // to the worklist with the new threshold (so that its own callee chains
+ // can be considered with the higher threshold).
+ if (NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already imported with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
+ // Update with new larger threshold.
+ ProcessedThreshold = NewThreshold;
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+ } else {
+ // If we already rejected importing a callee at the same or higher
+ // threshold, don't waste time calling selectCallee.
+ if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already rejected with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
- // "Resolve" the summary
- const auto *ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary->getBaseObject());
+ CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath());
+ if (!CalleeSummary) {
+ // Update with new larger threshold if this was a retry (otherwise
+ // we would have already inserted with NewThreshold above).
+ if (PreviouslyVisited)
+ ProcessedThreshold = NewThreshold;
+ LLVM_DEBUG(
+ dbgs() << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
- assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
- "selectCallee() didn't honor the threshold");
+ // "Resolve" the summary
+ CalleeSummary = CalleeSummary->getBaseObject();
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
+ "selectCallee() didn't honor the threshold");
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+ // We previously decided to import this GUID definition if it was already
+ // inserted in the set of imports from the exporting module.
+ bool PreviouslyImported = !ILI.second;
+
+ // Make exports in the source module.
+ if (ExportLists) {
+ auto &ExportList = (*ExportLists)[ExportModulePath];
+ ExportList.insert(VI.getGUID());
+ if (!PreviouslyImported) {
+ // This is the first time this function was exported from its source
+ // module, so mark all functions and globals it references as exported
+ // to the outside if they are defined in the same source module.
+ // For efficiency, we unconditionally add all the referenced GUIDs
+ // to the ExportList for this module, and will prune out any not
+ // defined in the module later in a single pass.
+ for (auto &Edge : ResolvedCalleeSummary->calls()) {
+ auto CalleeGUID = Edge.first.getGUID();
+ ExportList.insert(CalleeGUID);
+ }
+ for (auto &Ref : ResolvedCalleeSummary->refs()) {
+ auto GUID = Ref.getGUID();
+ ExportList.insert(GUID);
+ }
+ }
+ }
+ }
auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
// Adjust the threshold for next level of imported functions.
@@ -293,44 +405,11 @@ static void computeImportForFunction(
return Threshold * ImportInstrFactor;
};
- bool IsHotCallsite = Edge.second.Hotness == CalleeInfo::HotnessType::Hot;
+ bool IsHotCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
- auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto &ProcessedThreshold = ImportList[ExportModulePath][VI.getGUID()];
- /// Since the traversal of the call graph is DFS, we can revisit a function
- /// a second time with a higher threshold. In this case, it is added back to
- /// the worklist with the new threshold.
- if (ProcessedThreshold && ProcessedThreshold >= AdjThreshold) {
- DEBUG(dbgs() << "ignored! Target was already seen with Threshold "
- << ProcessedThreshold << "\n");
- continue;
- }
- bool PreviouslyImported = ProcessedThreshold != 0;
- // Mark this function as imported in this module, with the current Threshold
- ProcessedThreshold = AdjThreshold;
-
- // Make exports in the source module.
- if (ExportLists) {
- auto &ExportList = (*ExportLists)[ExportModulePath];
- ExportList.insert(VI.getGUID());
- if (!PreviouslyImported) {
- // This is the first time this function was exported from its source
- // module, so mark all functions and globals it references as exported
- // to the outside if they are defined in the same source module.
- // For efficiency, we unconditionally add all the referenced GUIDs
- // to the ExportList for this module, and will prune out any not
- // defined in the module later in a single pass.
- for (auto &Edge : ResolvedCalleeSummary->calls()) {
- auto CalleeGUID = Edge.first.getGUID();
- ExportList.insert(CalleeGUID);
- }
- for (auto &Ref : ResolvedCalleeSummary->refs()) {
- auto GUID = Ref.getGUID();
- ExportList.insert(GUID);
- }
- }
- }
+ ImportCount++;
// Insert the newly imported function to the worklist.
Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, VI.getGUID());
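The ImportThresholds map is the heart of this rework: the per-callee threshold
bookkeeping moved out of ImportList (which is now a plain GUID set) into a map
from GUID to (best threshold seen, selected summary), so a DFS revisit does
real work only when its threshold strictly grew. A condensed sketch with
simplified, illustrative types:

    #include <cstdint>
    #include <unordered_map>

    using GUIDSketch = uint64_t;
    struct ThresholdEntry {
      unsigned Threshold; // best threshold this callee was processed with
      const void *Callee; // selected summary; null while only rejections seen
    };

    // Returns true when the edge still needs evaluation at NewThreshold.
    static bool shouldProcessEdge(
        std::unordered_map<GUIDSketch, ThresholdEntry> &Seen, GUIDSketch G,
        unsigned NewThreshold) {
      auto It = Seen.insert({G, {NewThreshold, nullptr}});
      if (It.second)
        return true; // first visit: always evaluate
      if (NewThreshold <= It.first->second.Threshold)
        return false; // revisit with no larger budget: skip
      It.first->second.Threshold = NewThreshold;
      return true;
    }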
@@ -347,12 +426,18 @@ static void ComputeImportForModule(
  // Worklist contains the list of functions imported in this module, for which
  // we will analyze the callees and may import further down the callgraph.
SmallVector<EdgeInfo, 128> Worklist;
+ FunctionImporter::ImportThresholdsTy ImportThresholds;
// Populate the worklist with the import for the functions in the current
// module
for (auto &GVSummary : DefinedGVSummaries) {
+#ifndef NDEBUG
+ // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
+  // so this map lookup (and possibly others) can be avoided.
+ auto VI = Index.getValueInfo(GVSummary.first);
+#endif
if (!Index.isGlobalValueLive(GVSummary.second)) {
- DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
+ LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
continue;
}
auto *FuncSummary =
@@ -360,10 +445,10 @@ static void ComputeImportForModule(
if (!FuncSummary)
// Skip import for global variables
continue;
- DEBUG(dbgs() << "Initialize import for " << GVSummary.first << "\n");
+ LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
DefinedGVSummaries, Worklist, ImportList,
- ExportLists);
+ ExportLists, ImportThresholds);
}
// Process the newly imported functions and add callees to the worklist.
@@ -371,20 +456,37 @@ static void ComputeImportForModule(
auto FuncInfo = Worklist.pop_back_val();
auto *Summary = std::get<0>(FuncInfo);
auto Threshold = std::get<1>(FuncInfo);
- auto GUID = std::get<2>(FuncInfo);
-
- // Check if we later added this summary with a higher threshold.
- // If so, skip this entry.
- auto ExportModulePath = Summary->modulePath();
- auto &LatestProcessedThreshold = ImportList[ExportModulePath][GUID];
- if (LatestProcessedThreshold > Threshold)
- continue;
computeImportForFunction(*Summary, Index, Threshold, DefinedGVSummaries,
- Worklist, ImportList, ExportLists);
+ Worklist, ImportList, ExportLists,
+ ImportThresholds);
+ }
+}
+
+#ifndef NDEBUG
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID G) {
+ if (const auto &VI = Index.getValueInfo(G)) {
+ auto SL = VI.getSummaryList();
+ if (!SL.empty())
+ return SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
}
+ return false;
}
+static GlobalValue::GUID getGUID(GlobalValue::GUID G) { return G; }
+
+template <class T>
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
+ T &Cont) {
+ unsigned NumGVS = 0;
+ for (auto &V : Cont)
+ if (isGlobalVarSummary(Index, getGUID(V)))
+ ++NumGVS;
+ return NumGVS;
+}
+#endif
+
/// Compute all the imports and exports for every module using the Index.
void llvm::ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
@@ -394,8 +496,8 @@ void llvm::ComputeCrossModuleImport(
  // For each module that has functions defined, compute the import/export lists.
for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
auto &ImportList = ImportLists[DefinedGVSummaries.first()];
- DEBUG(dbgs() << "Computing import for Module '"
- << DefinedGVSummaries.first() << "'\n");
+ LLVM_DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
&ExportLists);
}
@@ -417,32 +519,41 @@ void llvm::ComputeCrossModuleImport(
}
#ifndef NDEBUG
- DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
- << " modules:\n");
+ LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
for (auto &ModuleImports : ImportLists) {
auto ModName = ModuleImports.first();
auto &Exports = ExportLists[ModName];
- DEBUG(dbgs() << "* Module " << ModName << " exports " << Exports.size()
- << " functions. Imports from " << ModuleImports.second.size()
- << " modules.\n");
+ unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
+ LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
+ << Exports.size() - NumGVS << " functions and " << NumGVS
+ << " vars. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
for (auto &Src : ModuleImports.second) {
auto SrcModName = Src.first();
- DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
- << SrcModName << "\n");
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
+ << " global vars imported from " << SrcModName << "\n");
}
}
#endif
}
#ifndef NDEBUG
-static void dumpImportListForModule(StringRef ModulePath,
+static void dumpImportListForModule(const ModuleSummaryIndex &Index,
+ StringRef ModulePath,
FunctionImporter::ImportMapTy &ImportList) {
- DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
- << ImportList.size() << " modules.\n");
+ LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
for (auto &Src : ImportList) {
auto SrcModName = Src.first();
- DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
- << SrcModName << "\n");
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
+ << SrcModName << "\n");
}
}
#endif
@@ -457,11 +568,11 @@ void llvm::ComputeCrossModuleImportForModule(
Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
// Compute the import list for this module.
- DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
ComputeImportForModule(FunctionSummaryMap, Index, ImportList);
#ifndef NDEBUG
- dumpImportListForModule(ModulePath, ImportList);
+ dumpImportListForModule(Index, ModulePath, ImportList);
#endif
}
@@ -483,18 +594,18 @@ void llvm::ComputeCrossModuleImportForModuleFromIndex(
// e.g. record required linkage changes.
if (Summary->modulePath() == ModulePath)
continue;
- // Doesn't matter what value we plug in to the map, just needs an entry
- // to provoke importing by thinBackend.
- ImportList[Summary->modulePath()][GUID] = 1;
+ // Add an entry to provoke importing by thinBackend.
+ ImportList[Summary->modulePath()].insert(GUID);
}
#ifndef NDEBUG
- dumpImportListForModule(ModulePath, ImportList);
+ dumpImportListForModule(Index, ModulePath, ImportList);
#endif
}
void llvm::computeDeadSymbols(
ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
assert(!Index.withGlobalValueDeadStripping());
if (!ComputeDead)
return;
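A hypothetical caller sketch for the new isPrevailing hook: a linker-driven
client would answer from its actual symbol resolution, while (judging from the
code below, which only special-cases PrevailingType::No) a client without
resolution data can preserve the old behavior by never answering No.

    // Names assumed from the signature above; PrevailingType::Yes is used
    // here purely as a conservative stand-in.
    auto IsPrevailing = [](GlobalValue::GUID /*G*/) {
      return PrevailingType::Yes;
    };
    // computeDeadSymbols(Index, GUIDPreservedSymbols, IsPrevailing);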
@@ -513,17 +624,18 @@ void llvm::computeDeadSymbols(
}
// Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index)
+ for (const auto &Entry : Index) {
+ auto VI = Index.getValueInfo(Entry);
for (auto &S : Entry.second.SummaryList)
if (S->isLive()) {
- DEBUG(dbgs() << "Live root: " << Entry.first << "\n");
- Worklist.push_back(ValueInfo(&Entry));
+ LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
+ Worklist.push_back(VI);
++LiveSymbols;
break;
}
+ }
// Make value live and add it to the worklist if it was not live before.
- // FIXME: we should only make the prevailing copy live here
auto visit = [&](ValueInfo VI) {
// FIXME: If we knew which edges were created for indirect call profiles,
// we could skip them here. Any that are live should be reached via
@@ -539,6 +651,28 @@ void llvm::computeDeadSymbols(
for (auto &S : VI.getSummaryList())
if (S->isLive())
return;
+
+  // Symbols known to be non-prevailing are kept live only if some copy of
+  // them is available_externally. Those symbols are discarded later in the
+  // EliminateAvailableExternally pass, and setting them to not-live breaks
+  // downstream users of liveness information (PR36483).
+ if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
+ bool AvailableExternally = false;
+ bool Interposable = false;
+ for (auto &S : VI.getSummaryList()) {
+ if (S->linkage() == GlobalValue::AvailableExternallyLinkage)
+ AvailableExternally = true;
+ else if (GlobalValue::isInterposableLinkage(S->linkage()))
+ Interposable = true;
+ }
+
+ if (!AvailableExternally)
+ return;
+
+ if (Interposable)
+ report_fatal_error("Interposable and available_externally symbol");
+ }
+
for (auto &S : VI.getSummaryList())
S->setLive(true);
++LiveSymbols;
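Condensing the non-prevailing rule above into a decision function (a sketch
with simplified flags; the real code derives both booleans by scanning VI's
summary list):

    enum class KeepDecision { Drop, KeepLive, FatalError };

    static KeepDecision classifyNonPrevailing(bool HasAvailableExternally,
                                              bool HasInterposable) {
      if (!HasAvailableExternally)
        return KeepDecision::Drop;       // safe to leave dead
      if (HasInterposable)
        return KeepDecision::FatalError; // inconsistent linkage mix
      return KeepDecision::KeepLive;     // kept for EliminateAvailableExternally
    }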
@@ -549,6 +683,8 @@ void llvm::computeDeadSymbols(
auto VI = Worklist.pop_back_val();
for (auto &Summary : VI.getSummaryList()) {
GlobalValueSummary *Base = Summary->getBaseObject();
+ // Set base value live in case it is an alias.
+ Base->setLive(true);
for (auto Ref : Base->refs())
visit(Ref);
if (auto *FS = dyn_cast<FunctionSummary>(Base))
@@ -559,8 +695,8 @@ void llvm::computeDeadSymbols(
Index.setWithGlobalValueDeadStripping();
unsigned DeadSymbols = Index.size() - LiveSymbols;
- DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
- << " symbols Dead \n");
+ LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
NumDeadSymbols += DeadSymbols;
NumLiveSymbols += LiveSymbols;
}
@@ -581,47 +717,66 @@ void llvm::gatherImportedSummariesForModule(
const auto &DefinedGVSummaries =
ModuleToDefinedGVSummaries.lookup(ILI.first());
for (auto &GI : ILI.second) {
- const auto &DS = DefinedGVSummaries.find(GI.first);
+ const auto &DS = DefinedGVSummaries.find(GI);
assert(DS != DefinedGVSummaries.end() &&
"Expected a defined summary for imported global value");
- SummariesForIndex[GI.first] = DS->second;
+ SummariesForIndex[GI] = DS->second;
}
}
}
/// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code
-llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
- const FunctionImporter::ImportMapTy &ModuleImports) {
+std::error_code llvm::EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
std::error_code EC;
raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None);
if (EC)
return EC;
- for (auto &ILI : ModuleImports)
- ImportsOS << ILI.first() << "\n";
+ for (auto &ILI : ModuleToSummariesForIndex)
+ // The ModuleToSummariesForIndex map includes an entry for the current
+ // Module (needed for writing out the index files). We don't want to
+ // include it in the imports file, however, so filter it out.
+ if (ILI.first != ModulePath)
+ ImportsOS << ILI.first << "\n";
return std::error_code();
}
+bool llvm::convertToDeclaration(GlobalValue &GV) {
+ LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
+ << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ F->setComdat(nullptr);
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ V->setComdat(nullptr);
+ } else {
+ GlobalValue *NewGV;
+ if (GV.getValueType()->isFunctionTy())
+ NewGV =
+ Function::Create(cast<FunctionType>(GV.getValueType()),
+ GlobalValue::ExternalLinkage, "", GV.getParent());
+ else
+ NewGV =
+ new GlobalVariable(*GV.getParent(), GV.getValueType(),
+ /*isConstant*/ false, GlobalValue::ExternalLinkage,
+ /*init*/ nullptr, "",
+ /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
+ GV.getType()->getAddressSpace());
+ NewGV->takeName(&GV);
+ GV.replaceAllUsesWith(NewGV);
+ return false;
+ }
+ return true;
+}
+
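Usage sketch for the new helper, with the caller obligation inferred from the
FIXME in the resolver below: a false return means GV was an alias whose name
and uses were already transferred to a fresh declaration, leaving the caller
to erase the leftover alias.

    if (!convertToDeclaration(GV))
      GV.eraseFromParent(); // uses were already rewritten to the replacement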
/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
void llvm::thinLTOResolveWeakForLinkerModule(
Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
- auto ConvertToDeclaration = [](GlobalValue &GV) {
- DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n");
- if (Function *F = dyn_cast<Function>(&GV)) {
- F->deleteBody();
- F->clearMetadata();
- } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- V->setInitializer(nullptr);
- V->setLinkage(GlobalValue::ExternalLinkage);
- V->clearMetadata();
- } else
- // For now we don't resolve or drop aliases. Once we do we'll
- // need to add support here for creating either a function or
- // variable declaration, and return the new GlobalValue* for
- // the caller to use.
- llvm_unreachable("Expected function or variable");
- };
-
auto updateLinkage = [&](GlobalValue &GV) {
// See if the global summary analysis computed a new resolved linkage.
const auto &GS = DefinedGlobals.find(GV.getGUID());
@@ -651,11 +806,23 @@ void llvm::thinLTOResolveWeakForLinkerModule(
// interposable property and possibly get inlined. Simply drop
// the definition in that case.
if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
- GlobalValue::isInterposableLinkage(GV.getLinkage()))
- ConvertToDeclaration(GV);
- else {
- DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
- << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GlobalValue::isInterposableLinkage(GV.getLinkage())) {
+ if (!convertToDeclaration(GV))
+ // FIXME: Change this to collect replaced GVs and later erase
+ // them from the parent module once thinLTOResolveWeakForLinkerGUID is
+ // changed to enable this for aliases.
+ llvm_unreachable("Expected GV to be converted");
+ } else {
+      // If the original symbol has global unnamed addr and linkonce_odr linkage,
+      // it should be an auto-hide symbol. Add hidden visibility to the symbol to
+      // preserve the property.
+ if (GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr() &&
+ NewLinkage == GlobalValue::WeakODRLinkage)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+
+ LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
+ << "` from " << GV.getLinkage() << " to " << NewLinkage
+ << "\n");
GV.setLinkage(NewLinkage);
}
// Remove declarations from comdats, including available_externally
@@ -732,9 +899,9 @@ static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
// index.
Expected<bool> FunctionImporter::importFunctions(
Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
- DEBUG(dbgs() << "Starting import for Module "
- << DestModule.getModuleIdentifier() << "\n");
- unsigned ImportedCount = 0;
+ LLVM_DEBUG(dbgs() << "Starting import for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ unsigned ImportedCount = 0, ImportedGVCount = 0;
IRMover Mover(DestModule);
// Do the actual import of functions now, one Module at a time
@@ -766,9 +933,9 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = F.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function " << GUID
- << " " << F.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+ << GUID << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = F.materialize())
return std::move(Err);
@@ -788,13 +955,13 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = GV.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global " << GUID
- << " " << GV.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+ << GUID << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = GV.materialize())
return std::move(Err);
- GlobalsToImport.insert(&GV);
+ ImportedGVCount += GlobalsToImport.insert(&GV);
}
}
for (GlobalAlias &GA : SrcModule->aliases()) {
@@ -802,9 +969,9 @@ Expected<bool> FunctionImporter::importFunctions(
continue;
auto GUID = GA.getGUID();
auto Import = ImportGUIDs.count(GUID);
- DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias " << GUID
- << " " << GA.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+ << GUID << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (Import) {
if (Error Err = GA.materialize())
return std::move(Err);
@@ -813,9 +980,9 @@ Expected<bool> FunctionImporter::importFunctions(
if (Error Err = Base->materialize())
return std::move(Err);
auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
- << " " << Base->getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
+ << " " << Base->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
if (EnableImportMetadata) {
// Add 'thinlto_src_module' metadata for statistics and debugging.
Fn->setMetadata(
@@ -851,10 +1018,15 @@ Expected<bool> FunctionImporter::importFunctions(
NumImportedModules++;
}
- NumImportedFunctions += ImportedCount;
+ NumImportedFunctions += (ImportedCount - ImportedGVCount);
+ NumImportedGlobalVars += ImportedGVCount;
- DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
- << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
+ << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
+ << " global variables for Module "
+ << DestModule.getModuleIdentifier() << "\n");
return ImportedCount;
}
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 4bb2984e3b47..1af7e6894777 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -17,14 +17,16 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -55,6 +57,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -63,7 +66,6 @@
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <utility>
@@ -88,6 +90,21 @@ STATISTIC(NumNestRemoved , "Number of nest attributes removed");
STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+STATISTIC(NumInternalFunc, "Number of internal functions");
+STATISTIC(NumColdCC, "Number of functions marked coldcc");
+
+static cl::opt<bool>
+ EnableColdCCStressTest("enable-coldcc-stress-test",
+ cl::desc("Enable stress test of coldcc by adding "
+ "calling conv to all internal functions."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<int> ColdCCRelFreq(
+ "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc(
+ "Maximum block frequency, expressed as a percentage of caller's "
+        "entry frequency, for a call site to be considered cold for enabling "
+ "coldcc"));
/// Is this global variable possibly used by a leak checker as a root? If so,
/// we might not really want to eliminate the stores to it.
@@ -483,7 +500,6 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
StartAlignment = DL.getABITypeAlignment(GV->getType());
if (StructType *STy = dyn_cast<StructType>(Ty)) {
- uint64_t FragmentOffset = 0;
unsigned NumElements = STy->getNumElements();
NewGlobals.reserve(NumElements);
const StructLayout &Layout = *DL.getStructLayout(STy);
@@ -509,10 +525,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
NGV->setAlignment(NewAlign);
// Copy over the debug info for the variable.
- FragmentOffset = alignTo(FragmentOffset, NewAlign);
- uint64_t Size = DL.getTypeSizeInBits(NGV->getValueType());
- transferSRADebugInfo(GV, NGV, FragmentOffset, Size, NumElements);
- FragmentOffset += Size;
+ uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
+ uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(i);
+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, NumElements);
}
} else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
unsigned NumElements = STy->getNumElements();
@@ -522,7 +537,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
auto ElTy = STy->getElementType();
uint64_t EltSize = DL.getTypeAllocSize(ElTy);
unsigned EltAlign = DL.getABITypeAlignment(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeSizeInBits(ElTy);
+ uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
for (unsigned i = 0, e = NumElements; i != e; ++i) {
Constant *In = Init->getAggregateElement(i);
assert(In && "Couldn't get element of initializer?");
@@ -551,7 +566,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
if (NewGlobals.empty())
return nullptr;
- DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
@@ -621,7 +636,13 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
/// reprocessing them.
static bool AllUsesOfValueWillTrapIfNull(const Value *V,
SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users())
+ for (const User *U : V->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ // If null pointer is considered valid, then all uses are non-trapping.
+ // Non address-space 0 globals have already been pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ }
if (isa<LoadInst>(U)) {
// Will trap.
} else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
@@ -655,7 +676,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
//cerr << "NONTRAPPING USE: " << *U;
return false;
}
-
+ }
return true;
}
@@ -682,6 +703,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
bool Changed = false;
for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
Instruction *I = cast<Instruction>(*UI++);
+ // Uses are non-trapping if null pointer is considered valid.
+ // Non address-space 0 globals are already pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setOperand(0, NewV);
Changed = true;
@@ -783,7 +808,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
}
if (Changed) {
- DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
+ << "\n");
++NumGlobUses;
}
@@ -797,7 +823,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
CleanupConstantGlobalUsers(GV, nullptr, DL, TLI);
}
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
Changed = true;
GV->eraseFromParent();
++NumDeleted;
@@ -833,7 +859,8 @@ static GlobalVariable *
OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
ConstantInt *NElements, const DataLayout &DL,
TargetLibraryInfo *TLI) {
- DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
+ LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
+ << '\n');
Type *GlobalType;
if (NElements->getZExtValue() == 1)
@@ -1269,7 +1296,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Value *NElems, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
+ LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
+ << '\n');
Type *MAT = getMallocAllocatedType(CI, TLI);
StructType *STy = cast<StructType>(MAT);
@@ -1566,7 +1594,10 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// users of the loaded value (often calls and loads) that would trap if the
// value was null.
if (GV->getInitializer()->getType()->isPointerTy() &&
- GV->getInitializer()->isNullValue()) {
+ GV->getInitializer()->isNullValue() &&
+ !NullPointerIsDefined(
+ nullptr /* F */,
+ GV->getInitializer()->getType()->getPointerAddressSpace())) {
if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
if (GV->getInitializer()->getType() != SOVC->getType())
SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
@@ -1608,7 +1639,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
return false;
- DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
// Create the new global, initializing it to false.
GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
@@ -1652,15 +1683,11 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
// val * (ValOther - ValInit) + ValInit:
// DW_OP_deref DW_OP_constu <ValMinus>
// DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
- E = DIExpression::get(NewGV->getContext(),
- {dwarf::DW_OP_deref,
- dwarf::DW_OP_constu,
- ValMinus,
- dwarf::DW_OP_mul,
- dwarf::DW_OP_constu,
- ValInit,
- dwarf::DW_OP_plus,
- dwarf::DW_OP_stack_value});
+ SmallVector<uint64_t, 12> Ops = {
+ dwarf::DW_OP_deref, dwarf::DW_OP_constu, ValMinus,
+ dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
+ dwarf::DW_OP_plus};
+ E = DIExpression::prependOpcodes(E, Ops, DIExpression::WithStackValue);
DIGlobalVariableExpression *DGVE =
DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
NewGV->addDebugInfo(DGVE);
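Worked example for the recovery expression above, with illustrative values: if
the boolean-shrunk global had initializer 0 (ValInit) and sole other stored
value 7 (ValOther), then ValMinus is 7 and the final expression reads
DW_OP_deref DW_OP_constu 7 DW_OP_mul DW_OP_constu 0 DW_OP_plus
DW_OP_stack_value, so the debugger recomputes bool * (7 - 0) + 0 and shows 0
or 7 rather than the raw i1.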
@@ -1732,8 +1759,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
return true;
}
-static bool deleteIfDead(GlobalValue &GV,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+static bool deleteIfDead(
+ GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
GV.removeDeadConstantUsers();
if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
@@ -1751,7 +1778,7 @@ static bool deleteIfDead(GlobalValue &GV,
if (!Dead)
return false;
- DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
GV.eraseFromParent();
++NumDeleted;
return true;
@@ -1917,7 +1944,7 @@ static bool processInternalGlobal(
LookupDomTree)) {
const DataLayout &DL = GV->getParent()->getDataLayout();
- DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
Type *ElemTy = GV->getValueType();
@@ -1938,7 +1965,7 @@ static bool processInternalGlobal(
// If the global is never loaded (but may be stored to), it is dead.
// Delete it now.
if (!GS.IsLoaded) {
- DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
bool Changed;
if (isLeakCheckerRoot(GV)) {
@@ -1960,7 +1987,7 @@ static bool processInternalGlobal(
}
if (GS.StoredType <= GlobalStatus::InitializerStored) {
- DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+ LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
GV->setConstant(true);
// Clean up any obviously simplifiable users now.
@@ -1968,8 +1995,8 @@ static bool processInternalGlobal(
// If the global is dead now, just nuke it.
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
- << "all users and delete global!\n");
+ LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
return true;
@@ -1997,8 +2024,8 @@ static bool processInternalGlobal(
CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI);
if (GV->use_empty()) {
- DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
+ LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
}
@@ -2097,20 +2124,142 @@ static void RemoveNestAttribute(Function *F) {
/// idea here is that we don't want to mess with the convention if the user
/// explicitly requested something with performance implications like coldcc,
/// GHC, or anyregcc.
-static bool isProfitableToMakeFastCC(Function *F) {
+static bool hasChangeableCC(Function *F) {
CallingConv::ID CC = F->getCallingConv();
+
// FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
- return CC == CallingConv::C || CC == CallingConv::X86_ThisCall;
+ if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
+ return false;
+
+ // FIXME: Change CC for the whole chain of musttail calls when possible.
+ //
+ // Can't change CC of the function that either has musttail calls, or is a
+ // musttail callee itself
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallInst* CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ if (CI->isMustTailCall())
+ return false;
+ }
+
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return false;
+
+ return true;
+}
+
+/// Return true if the block containing the call site has a BlockFrequency of
+/// less than ColdCCRelFreq% of the entry block.
+static bool isColdCallSite(CallSite CS, BlockFrequencyInfo &CallerBFI) {
+ const BranchProbability ColdProb(ColdCCRelFreq, 100);
+ auto CallSiteBB = CS.getInstruction()->getParent();
+ auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI.getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
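The comparison above is integer-only; here is the same predicate sketched with
plain integers (the real code uses BlockFrequency and BranchProbability, which
scale with wide integer math rather than floating point):

    #include <cstdint>

    // RelFreqPercent plays the role of the coldcc-rel-freq option (default 2).
    static bool isColdSketch(uint64_t CallSiteFreq, uint64_t EntryFreq,
                             unsigned RelFreqPercent) {
      return CallSiteFreq * 100 < EntryFreq * RelFreqPercent;
    }
    // With EntryFreq = 1000 and the default 2%: a call-site frequency of 15
    // is cold (1500 < 2000), while 25 is not (2500 >= 2000).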
+// This function checks if the input function F is cold at all call sites. It
+// also looks at each call site's containing function, returning false if the
+// caller function contains other non-cold calls. The input vector AllCallsCold
+// contains a list of functions that only have call sites in cold blocks.
+static bool
+isValidCandidateForColdCC(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ const std::vector<Function *> &AllCallsCold) {
+
+ if (F.user_empty())
+ return false;
+
+ for (User *U : F.users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+
+ CallSite CS(cast<Instruction>(U));
+ Function *CallerFunc = CS.getInstruction()->getParent()->getParent();
+ BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
+ if (!isColdCallSite(CS, CallerBFI))
+ return false;
+ auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc);
+ if (It == AllCallsCold.end())
+ return false;
+ }
+ return true;
+}
+
+static void changeCallSitesToColdCC(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallSite CS(cast<Instruction>(U));
+ CS.setCallingConv(CallingConv::Cold);
+ }
+}
+
+// This function iterates over all the call instructions in the input Function
+// and checks that all call sites are in cold blocks and are allowed to use the
+// coldcc calling convention.
+static bool
+hasOnlyColdCalls(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ CallSite CS(cast<Instruction>(CI));
+          // Skip over inline asm instructions since they aren't function calls.
+ if (CI->isInlineAsm())
+ continue;
+ Function *CalledFn = CI->getCalledFunction();
+ if (!CalledFn)
+ return false;
+ if (!CalledFn->hasLocalLinkage())
+ return false;
+          // Skip over intrinsics since they won't remain as function calls.
+ if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
+ continue;
+ // Check if it's valid to use coldcc calling convention.
+ if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
+ CalledFn->hasAddressTaken())
+ return false;
+ BlockFrequencyInfo &CallerBFI = GetBFI(F);
+ if (!isColdCallSite(CS, CallerBFI))
+ return false;
+ }
+ }
+ }
+ return true;
}
static bool
OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+
bool Changed = false;
+
+ std::vector<Function *> AllCallsCold;
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
+ Function *F = &*FI++;
+ if (hasOnlyColdCalls(*F, GetBFI))
+ AllCallsCold.push_back(F);
+ }
+
// Optimize functions.
for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
Function *F = &*FI++;
+
+ // Don't perform global opt pass on naked functions; we don't want fast
+ // calling conventions for naked functions.
+ if (F->hasFnAttribute(Attribute::Naked))
+ continue;
+
// Functions without names cannot be referenced outside this module.
if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
F->setLinkage(GlobalValue::InternalLinkage);
@@ -2142,7 +2291,25 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
if (!F->hasLocalLinkage())
continue;
- if (isProfitableToMakeFastCC(F) && !F->isVarArg() &&
+
+ if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
+ NumInternalFunc++;
+ TargetTransformInfo &TTI = GetTTI(*F);
+ // Change the calling convention to coldcc if either stress testing is
+ // enabled or the target would like to use coldcc on functions which are
+ // cold at all call sites and the callers contain no other non coldcc
+      // cold at all call sites and the callers contain no other non-coldcc
+ if (EnableColdCCStressTest ||
+ (isValidCandidateForColdCC(*F, GetBFI, AllCallsCold) &&
+ TTI.useColdCCForColdCall(*F))) {
+ F->setCallingConv(CallingConv::Cold);
+ changeCallSitesToColdCC(F);
+ Changed = true;
+ NumColdCC++;
+ }
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() &&
!F->hasAddressTaken()) {
// If this function has a calling convention worth changing, is not a
// varargs function, and is only called directly, promote it to use the
@@ -2168,7 +2335,7 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
static bool
OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI,
function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
bool Changed = false;
for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
@@ -2254,6 +2421,131 @@ static void CommitValueTo(Constant *Val, Constant *Addr) {
GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
}
+/// Given a map of address -> value, where addresses are expected to be some form
+/// of either a global or a constant GEP, set the initializer for the address to
+/// be the value. This performs mostly the same function as CommitValueTo()
+/// and EvaluateStoreInto() but is optimized to be more efficient for the common
+/// case where the set of addresses are GEPs sharing the same underlying global,
+/// processing the GEPs in batches rather than individually.
+///
+/// To give an example, consider the following C++ code adapted from the clang
+/// regression tests:
+/// struct S {
+/// int n = 10;
+/// int m = 2 * n;
+/// S(int a) : n(a) {}
+/// };
+///
+/// template<typename T>
+/// struct U {
+/// T *r = &q;
+/// T q = 42;
+/// U *p = this;
+/// };
+///
+/// U<S> e;
+///
+/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
+/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
+/// members. This batch algorithm will simply use general CommitValueTo() method
+/// to handle the complex nested S struct initialization of 'q', before
+/// processing the outermost members in a single batch. Using CommitValueTo() to
+/// handle member in the outer struct is inefficient when the struct/array is
+/// very large as we end up creating and destroy constant arrays for each
+/// initialization.
+/// For the above case, we expect the following IR to be generated:
+///
+/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
+/// %struct.S = type { i32, i32 }
+/// @e = global %struct.U { %struct.S* getelementptr inbounds (%struct.U, %struct.U* @e,
+/// i64 0, i32 1),
+/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
+/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
+/// constant expression, while the other two elements of @e are "simple".
+static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
+ SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
+ SimpleCEs.reserve(Mem.size());
+
+ for (const auto &I : Mem) {
+ if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
+ GVs.push_back(std::make_pair(GV, I.second));
+ } else {
+ ConstantExpr *GEP = cast<ConstantExpr>(I.first);
+ // We don't handle the deeply recursive case using the batch method.
+ if (GEP->getNumOperands() > 3)
+ ComplexCEs.push_back(std::make_pair(GEP, I.second));
+ else
+ SimpleCEs.push_back(std::make_pair(GEP, I.second));
+ }
+ }
+
+ // The algorithm below doesn't handle cases like nested structs, so use the
+ // slower fully general method if we have to.
+ for (auto ComplexCE : ComplexCEs)
+ CommitValueTo(ComplexCE.second, ComplexCE.first);
+
+ for (auto GVPair : GVs) {
+ assert(GVPair.first->hasInitializer());
+ GVPair.first->setInitializer(GVPair.second);
+ }
+
+ if (SimpleCEs.empty())
+ return;
+
+ // We cache a single global's initializer elements in the case where the
+ // subsequent address/val pair uses the same one. This avoids throwing away and
+ // rebuilding the constant struct/vector/array just because one element is
+ // modified at a time.
+ SmallVector<Constant *, 32> Elts;
+ Elts.reserve(SimpleCEs.size());
+ GlobalVariable *CurrentGV = nullptr;
+
+ auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (Update) {
+ if (CurrentGV) {
+ assert(CurrentGV && "Expected a GV to commit to!");
+ Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
+ // We have a valid cache that needs to be committed.
+ if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
+ else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
+ else
+ CurrentGV->setInitializer(ConstantVector::get(Elts));
+ }
+ if (CurrentGV == GV)
+ return;
+ // Need to clear and set up cache for new initializer.
+ CurrentGV = GV;
+ Elts.clear();
+ unsigned NumElts;
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ NumElts = STy->getNumElements();
+ else
+ NumElts = cast<SequentialType>(Ty)->getNumElements();
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+ }
+ };
+
+ for (auto CEPair : SimpleCEs) {
+ ConstantExpr *GEP = CEPair.first;
+ Constant *Val = CEPair.second;
+
+ GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
+ commitAndSetupCache(GV, GV != CurrentGV);
+ ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
+ Elts[CI->getZExtValue()] = Val;
+ }
+ // The last initializer in the list needs to be committed, others
+ // will be committed on a new initializer being processed.
+ commitAndSetupCache(CurrentGV, true);
+}
+
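A self-contained analogy (plain containers, not the LLVM API) for the caching
loop above: group element updates by their aggregate, patch a scratch copy of
the element list, and rebuild each aggregate once instead of once per store.

    #include <cstddef>
    #include <map>
    #include <utility>
    #include <vector>

    using Aggregate = std::vector<int>;

    // Updates maps each aggregate to its (element index -> new value) patches.
    static void batchCommit(
        std::map<Aggregate *, std::map<std::size_t, int>> &Updates) {
      for (auto &Entry : Updates) {
        Aggregate Scratch = *Entry.first;        // read the elements once
        for (auto &IdxVal : Entry.second)
          Scratch[IdxVal.first] = IdxVal.second; // patch in place
        *Entry.first = std::move(Scratch);       // commit once per aggregate
      }
    }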
/// Evaluate static constructors in the function, if we can. Return true if we
/// can, false otherwise.
static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
@@ -2268,11 +2560,10 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
++NumCtorsEvaluated;
// We succeeded at evaluation: commit the result.
- DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to " << Eval.getMutatedMemory().size()
- << " stores.\n");
- for (const auto &I : Eval.getMutatedMemory())
- CommitValueTo(I.second, I.first);
+ LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to "
+ << Eval.getMutatedMemory().size() << " stores.\n");
+ BatchCommitValueTo(Eval.getMutatedMemory());
for (GlobalVariable *GV : Eval.getInvariants())
GV->setConstant(true);
}
@@ -2287,7 +2578,7 @@ static int compareNames(Constant *const *A, Constant *const *B) {
}
static void setUsedInitializer(GlobalVariable &V,
- const SmallPtrSet<GlobalValue *, 8> &Init) {
+ const SmallPtrSetImpl<GlobalValue *> &Init) {
if (Init.empty()) {
V.eraseFromParent();
return;
@@ -2440,7 +2731,7 @@ static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
static bool
OptimizeGlobalAliases(Module &M,
- SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
bool Changed = false;
LLVMUsed Used(M);
@@ -2460,7 +2751,7 @@ OptimizeGlobalAliases(Module &M,
continue;
}
- // If the aliasee may change at link time, nothing can be done - bail out.
+ // If the alias can change at link time, nothing can be done - bail out.
if (J->isInterposable())
continue;
@@ -2486,6 +2777,7 @@ OptimizeGlobalAliases(Module &M,
// Give the aliasee the name, linkage and other attributes of the alias.
Target->takeName(&*J);
Target->setLinkage(J->getLinkage());
+ Target->setDSOLocal(J->isDSOLocal());
Target->setVisibility(J->getVisibility());
Target->setDLLStorageClass(J->getDLLStorageClass());
@@ -2619,8 +2911,10 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
static bool optimizeGlobalsInModule(
Module &M, const DataLayout &DL, TargetLibraryInfo *TLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
- SmallSet<const Comdat *, 8> NotDiscardableComdats;
+ SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -2641,8 +2935,8 @@ static bool optimizeGlobalsInModule(
NotDiscardableComdats.insert(C);
// Delete functions that are trivially dead, ccc -> fastcc
- LocalChange |=
- OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats);
+ LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree,
+ NotDiscardableComdats);
// Optimize global_ctors list.
LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
@@ -2679,7 +2973,15 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
return FAM.getResult<DominatorTreeAnalysis>(F);
};
- if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree))
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -2702,12 +3004,22 @@ struct GlobalOptLegacyPass : public ModulePass {
auto LookupDomTree = [this](Function &F) -> DominatorTree & {
return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
};
- return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree);
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
}
};
@@ -2718,6 +3030,8 @@ char GlobalOptLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
"Global Variable Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
"Global Variable Optimizer", false, false)
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index d5d35ee89e0e..dce9ee076bc5 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -40,7 +40,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeInferFunctionAttrsLegacyPassPass(Registry);
initializeInternalizeLegacyPassPass(Registry);
initializeLoopExtractorPass(Registry);
- initializeBlockExtractorPassPass(Registry);
+ initializeBlockExtractorPass(Registry);
initializeSingleLoopExtractorPass(Registry);
initializeLowerTypeTestsPass(Registry);
initializeMergeFunctionsPass(Registry);
@@ -48,6 +48,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializePostOrderFunctionAttrsLegacyPassPass(Registry);
initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
initializePruneEHPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
initializeStripDeadPrototypesLegacyPassPass(Registry);
initializeStripSymbolsPass(Registry);
initializeStripDebugDeclarePass(Registry);
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index b259a0abd63c..82bba1e5c93b 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
namespace {
-/// \brief Actual inliner pass implementation.
+/// Actual inliner pass implementation.
///
/// The common implementation of the inlining logic is shared between this
/// inliner pass and the always inliner pass. The two passes use different cost
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 4449c87ddefa..3da0c2e83eb8 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -59,7 +60,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
@@ -208,8 +208,8 @@ static void mergeInlinedArrayAllocas(
// Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
// success!
- DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
- << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+ LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
// Move affected dbg.declare calls immediately after the new alloca to
// avoid the situation when a dbg.declare precedes its alloca.
@@ -379,14 +379,14 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
Function *Caller = CS.getCaller();
if (IC.isAlways()) {
- DEBUG(dbgs() << " Inlining: cost=always"
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " Inlining: cost=always"
+ << ", Call: " << *CS.getInstruction() << "\n");
return IC;
}
if (IC.isNever()) {
- DEBUG(dbgs() << " NOT Inlining: cost=never"
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " NOT Inlining: cost=never"
+ << ", Call: " << *CS.getInstruction() << "\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
<< NV("Callee", Callee) << " not inlined into "
@@ -397,9 +397,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
}
if (!IC) {
- DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
- << ", thres=" << IC.getThreshold()
- << ", Call: " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
+ << ", thres=" << IC.getThreshold()
+ << ", Call: " << *CS.getInstruction() << "\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
<< NV("Callee", Callee) << " not inlined into "
@@ -412,9 +412,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
int TotalSecondaryCost = 0;
if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)) {
- DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
- << " Cost = " << IC.getCost()
- << ", outer Cost = " << TotalSecondaryCost << '\n');
+ LLVM_DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
Call)
@@ -428,9 +428,9 @@ shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
return None;
}
- DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
- << ", thres=" << IC.getThreshold()
- << ", Call: " << *CS.getInstruction() << '\n');
+ LLVM_DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
+ << ", thres=" << IC.getThreshold()
+ << ", Call: " << *CS.getInstruction() << '\n');
return IC;
}
@@ -470,12 +470,12 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
function_ref<AAResults &(Function &)> AARGetter,
ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
SmallPtrSet<Function *, 8> SCCFunctions;
- DEBUG(dbgs() << "Inliner visiting SCC:");
+ LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
for (CallGraphNode *Node : SCC) {
Function *F = Node->getFunction();
if (F)
SCCFunctions.insert(F);
- DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+ LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
}
// Scan through and identify all call sites ahead of time so that we only
@@ -524,7 +524,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
}
}
- DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+ LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
// If there are no calls in this function, exit early.
if (CallSites.empty())
@@ -593,7 +593,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
if (IsTriviallyDead) {
- DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
CG[Caller]->removeCallEdgeFor(CS);
Instr->eraseFromParent();
@@ -657,8 +657,8 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// callgraph references to the node, we cannot delete it yet; this
// could invalidate the CGSCC iterator.
CG[Callee]->getNumReferences() == 0) {
- DEBUG(dbgs() << " -> Deleting dead function: " << Callee->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
+ << Callee->getName() << "\n");
CallGraphNode *CalleeNode = CG[Callee];
// Remove any call graph edges from the callee to its callees.
@@ -793,6 +793,14 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
return true;
}
+InlinerPass::~InlinerPass() {
+ if (ImportedFunctionsStats) {
+ assert(InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No);
+ ImportedFunctionsStats->dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
+ }
+}
+
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
@@ -804,6 +812,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Module &M = *InitialC.begin()->getFunction().getParent();
ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+ if (!ImportedFunctionsStats &&
+ InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) {
+ ImportedFunctionsStats =
+ llvm::make_unique<ImportedFunctionsInliningStatistics>();
+ ImportedFunctionsStats->setModuleInfo(M);
+ }
+
// We use a single common worklist for calls across the entire SCC. We
// process these in-order and append new calls introduced during inlining to
// the end.
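The two hunks above give the new-PM inliner the import statistics the legacy pass already had: the stats object is created lazily on the first run() when the option asks for it, accumulates across SCCs, and is dumped exactly once when the pass object is destroyed. A self-contained sketch of that lifecycle; the class and member names below are illustrative, not from the patch:

  #include <iostream>
  #include <memory>

  struct ImportStats {
    unsigned Inlines = 0;
    void dump(bool Verbose) const {
      std::cout << "imported-function inlines: " << Inlines
                << (Verbose ? " (verbose)" : "") << '\n';
    }
  };

  class InlinerLike {
    std::unique_ptr<ImportStats> Stats; // lazily created, kept across runs
    bool Enabled, Verbose;

  public:
    InlinerLike(bool Enabled, bool Verbose)
        : Enabled(Enabled), Verbose(Verbose) {}

    void run() {
      if (!Stats && Enabled)
        Stats = std::make_unique<ImportStats>(); // mirrors run()'s lazy init
      if (Stats)
        ++Stats->Inlines;
    }

    ~InlinerLike() {
      if (Stats) // mirrors InlinerPass::~InlinerPass()
        Stats->dump(Verbose);
    }
  };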
@@ -830,8 +845,14 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// incrementally making a single function grow in a superlinear fashion.
SmallVector<std::pair<CallSite, int>, 16> Calls;
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+
// Populate the initial list of calls in this SCC.
for (auto &N : InitialC) {
+ auto &ORE =
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
// We want to generally process call sites top-down in order for
// simplifications stemming from replacing the call with the returned value
// after inlining to be visible to subsequent inlining decisions.
@@ -839,9 +860,20 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Instead we should do an actual RPO walk of the function body.
for (Instruction &I : instructions(N.getFunction()))
if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
+ if (Function *Callee = CS.getCalledFunction()) {
if (!Callee->isDeclaration())
Calls.push_back({CS, -1});
+ else if (!isa<IntrinsicInst>(I)) {
+ using namespace ore;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CS.getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ }
+ }
}
if (Calls.empty())
return PreservedAnalyses::all();
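The new NoDefinition remark follows the usual ORE idiom: emit() takes a callable, so the remark object, and all the streaming into it, is constructed only when remark output is actually requested for this function. A sketch of that deferral under simplified types (the real emitter lives in llvm/Analysis/OptimizationRemarkEmitter.h):

  #include <iostream>
  #include <string>

  // Build-on-demand emission: MakeRemark() runs only when enabled, so a
  // disabled remark costs one branch rather than a string construction.
  template <typename CallableT>
  void emitDeferred(bool RemarksEnabled, CallableT MakeRemark) {
    if (RemarksEnabled)
      std::cout << MakeRemark() << '\n';
  }

  int main() {
    emitDeferred(false, [] { return std::string("never materialized"); });
    emitDeferred(true,
                 [] { return std::string("NoDefinition: callee unavailable"); });
  }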
@@ -879,7 +911,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
- DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
// Get a FunctionAnalysisManager via a proxy for this particular node. We
// do this each time we visit a node as the SCC may have changed and as
@@ -931,9 +963,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// and thus hidden from the full inline history.
if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
UR.InlinedInternalEdges.count({&N, C})) {
- DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
- "previously split out of this SCC by inlining: "
- << F.getName() << " -> " << Callee.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
continue;
}
@@ -992,6 +1024,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Calls.push_back({CS, NewHistoryID});
}
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats->recordInline(F, Callee);
+
// Merge the attributes based on the inlining.
AttributeFuncs::mergeAttributesForInlining(F, Callee);
@@ -1052,7 +1087,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// change.
LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
- DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
// If this causes an SCC to split apart into multiple smaller SCCs, there
@@ -1070,8 +1105,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
})) {
- DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
- "retaining this to avoid infinite inlining.\n");
+ LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
UR.InlinedInternalEdges.insert({&N, OldC});
}
InlinedCallees.clear();
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index 26db1465bb26..a6542d28dfd8 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -192,7 +192,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
++NumFunctions;
- DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
}
// Never internalize the llvm.used symbol. It is used to implement
@@ -221,7 +221,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
Changed = true;
++NumGlobals;
- DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
}
// Mark all aliases that are not in the API as internal as well.
@@ -231,7 +231,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
Changed = true;
++NumAliases;
- DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
}
return Changed;
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt
index a8b0f32fd785..54ce23876e66 100644
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
name = IPO
parent = Transforms
library_name = ipo
-required_libraries = Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation
+required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 36b6bdba2cd0..8c86f7cb806a 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include <fstream>
@@ -158,155 +159,3 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
Pass *llvm::createSingleLoopExtractorPass() {
return new SingleLoopExtractor();
}
-
-
-// BlockFile - A file which contains a list of blocks that should not be
-// extracted.
-static cl::opt<std::string>
-BlockFile("extract-blocks-file", cl::value_desc("filename"),
- cl::desc("A file containing list of basic blocks to not extract"),
- cl::Hidden);
-
-namespace {
- /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
- /// from the module into their own functions except for those specified by the
- /// BlocksToNotExtract list.
- class BlockExtractorPass : public ModulePass {
- void LoadFile(const char *Filename);
- void SplitLandingPadPreds(Function *F);
-
- std::vector<BasicBlock*> BlocksToNotExtract;
- std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
- public:
- static char ID; // Pass identification, replacement for typeid
- BlockExtractorPass() : ModulePass(ID) {
- if (!BlockFile.empty())
- LoadFile(BlockFile.c_str());
- }
-
- bool runOnModule(Module &M) override;
- };
-}
-
-char BlockExtractorPass::ID = 0;
-INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
- "Extract Basic Blocks From Module (for bugpoint use)",
- false, false)
-
-// createBlockExtractorPass - This pass extracts all blocks (except those
-// specified in the argument list) from the functions in the module.
-//
-ModulePass *llvm::createBlockExtractorPass() {
- return new BlockExtractorPass();
-}
-
-void BlockExtractorPass::LoadFile(const char *Filename) {
- // Load the BlockFile...
- std::ifstream In(Filename);
- if (!In.good()) {
- errs() << "WARNING: BlockExtractor couldn't load file '" << Filename
- << "'!\n";
- return;
- }
- while (In) {
- std::string FunctionName, BlockName;
- In >> FunctionName;
- In >> BlockName;
- if (!BlockName.empty())
- BlocksToNotExtractByName.push_back(
- std::make_pair(FunctionName, BlockName));
- }
-}
-
-/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
-/// instruction. The critical edge breaker will refuse to break critical edges
-/// to a landing pad. So do them here. After this method runs, all landing pads
-/// should have only one predecessor.
-void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
- InvokeInst *II = dyn_cast<InvokeInst>(I);
- if (!II) continue;
- BasicBlock *Parent = II->getParent();
- BasicBlock *LPad = II->getUnwindDest();
-
- // Look through the landing pad's predecessors. If one of them ends in an
- // 'invoke', then we want to split the landing pad.
- bool Split = false;
- for (pred_iterator
- PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
- BasicBlock *BB = *PI;
- if (BB->isLandingPad() && BB != Parent &&
- isa<InvokeInst>(Parent->getTerminator())) {
- Split = true;
- break;
- }
- }
-
- if (!Split) continue;
-
- SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
- }
-}
-
-bool BlockExtractorPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- std::set<BasicBlock*> TranslatedBlocksToNotExtract;
- for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
- BasicBlock *BB = BlocksToNotExtract[i];
- Function *F = BB->getParent();
-
- // Map the corresponding function in this module.
- Function *MF = M.getFunction(F->getName());
- assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
-
- // Figure out which index the basic block is in its function.
- Function::iterator BBI = MF->begin();
- std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
- TranslatedBlocksToNotExtract.insert(&*BBI);
- }
-
- while (!BlocksToNotExtractByName.empty()) {
- // There's no way to find BBs by name without looking at every BB inside
- // every Function. Fortunately, this is always empty except when used by
- // bugpoint in which case correctness is more important than performance.
-
- std::string &FuncName = BlocksToNotExtractByName.back().first;
- std::string &BlockName = BlocksToNotExtractByName.back().second;
-
- for (Function &F : M) {
- if (F.getName() != FuncName) continue;
-
- for (BasicBlock &BB : F) {
- if (BB.getName() != BlockName) continue;
-
- TranslatedBlocksToNotExtract.insert(&BB);
- }
- }
-
- BlocksToNotExtractByName.pop_back();
- }
-
- // Now that we know which blocks to not extract, figure out which ones we WANT
- // to extract.
- std::vector<BasicBlock*> BlocksToExtract;
- for (Function &F : M) {
- SplitLandingPadPreds(&F);
- for (BasicBlock &BB : F)
- if (!TranslatedBlocksToNotExtract.count(&BB))
- BlocksToExtract.push_back(&BB);
- }
-
- for (BasicBlock *BlockToExtract : BlocksToExtract) {
- SmallVector<BasicBlock*, 2> BlocksToExtractVec;
- BlocksToExtractVec.push_back(BlockToExtract);
- if (const InvokeInst *II =
- dyn_cast<InvokeInst>(BlockToExtract->getTerminator()))
- BlocksToExtractVec.push_back(II->getUnwindDest());
- CodeExtractor(BlocksToExtractVec).extractCodeRegion();
- }
-
- return !BlocksToExtract.empty();
-}
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 8db7e1e142d2..4f7571884707 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -8,6 +8,8 @@
//===----------------------------------------------------------------------===//
//
// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// It also ensures that globals are properly laid out for the
+// llvm.icall.branch.funnel intrinsic.
// See http://llvm.org/docs/TypeMetadata.html for more information.
//
//===----------------------------------------------------------------------===//
@@ -25,6 +27,7 @@
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -291,6 +294,33 @@ public:
}
};
+struct ICallBranchFunnel final
+ : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
+ static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
+ ArrayRef<GlobalTypeMember *> Targets,
+ unsigned UniqueId) {
+ auto *Call = static_cast<ICallBranchFunnel *>(
+ Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
+ alignof(ICallBranchFunnel)));
+ Call->CI = CI;
+ Call->UniqueId = UniqueId;
+ Call->NTargets = Targets.size();
+ std::uninitialized_copy(Targets.begin(), Targets.end(),
+ Call->getTrailingObjects<GlobalTypeMember *>());
+ return Call;
+ }
+
+ CallInst *CI;
+ ArrayRef<GlobalTypeMember *> targets() const {
+ return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
+ }
+
+ unsigned UniqueId;
+
+private:
+ size_t NTargets;
+};
+
class LowerTypeTestsModule {
Module &M;
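ICallBranchFunnel uses the TrailingObjects mixin: the fixed-size header and the variable-length target array live in one BumpPtrAllocator allocation, with the array placed directly behind the object. The same idiom with an int payload, as a minimal sketch (IntPack is illustrative, not part of the patch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/TrailingObjects.h"
  #include <memory>

  struct IntPack final : llvm::TrailingObjects<IntPack, int> {
    size_t N;

    static IntPack *create(llvm::BumpPtrAllocator &Alloc,
                           llvm::ArrayRef<int> Vals) {
      // One allocation: sizeof(IntPack) + Vals.size() * sizeof(int).
      auto *P = static_cast<IntPack *>(Alloc.Allocate(
          totalSizeToAlloc<int>(Vals.size()), alignof(IntPack)));
      P->N = Vals.size();
      std::uninitialized_copy(Vals.begin(), Vals.end(),
                              P->getTrailingObjects<int>());
      return P;
    }

    llvm::ArrayRef<int> values() const {
      return llvm::makeArrayRef(getTrailingObjects<int>(), N);
    }
  };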
@@ -372,6 +402,7 @@ class LowerTypeTestsModule {
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
const TypeIdLowering &TIL);
+
void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Globals);
unsigned getJumpTableEntrySize();
@@ -383,19 +414,32 @@ class LowerTypeTestsModule {
void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Functions);
void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
+ ArrayRef<GlobalTypeMember *> Functions);
void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals);
+ void
+ buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
- void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT);
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, bool IsDefinition);
void moveInitializerToModuleConstructor(GlobalVariable *GV);
void findGlobalVariableUsersOf(Constant *C,
SmallSetVector<GlobalVariable *, 8> &Out);
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+ /// replaceCfiUses - Go through Old's use list and make each use point to
+ /// New instead. Old's use list is expected to have at least one element.
+ /// Unlike replaceAllUsesWith, this function skips blockaddr and direct call
+ /// uses.
+ void replaceCfiUses(Function *Old, Value *New, bool IsDefinition);
+
+ /// replaceDirectCalls - Go through Old's use list and replace each use that
+ /// is a direct function call.
+ void replaceDirectCalls(Value *Old, Value *New);
+
public:
LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
const ModuleSummaryIndex *ImportSummary);
@@ -427,8 +471,6 @@ struct LowerTypeTests : public ModulePass {
}
bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
if (UseCommandLine)
return LowerTypeTestsModule::runForTesting(M);
return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
@@ -729,10 +771,12 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
// Compute the amount of padding required.
uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
- // Cap at 128 was found experimentally to have a good data/instruction
- // overhead tradeoff.
- if (Padding > 128)
- Padding = alignTo(InitSize, 128) - InitSize;
+ // Experiments of different caps with Chromium on both x64 and ARM64
+ // have shown that the 32-byte cap generates the smallest binary on
+ // both platforms while different caps yield similar performance.
+ // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
+ if (Padding > 32)
+ Padding = alignTo(InitSize, 32) - InitSize;
GlobalInits.push_back(
ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
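Worked numbers for the padding formula above, as a sketch using the usual llvm/Support/MathExtras.h helpers: globals are still padded toward the next power of two (so the bit sets describing their offsets stay compact), but the pad is now capped by the distance to the next 32-byte boundary rather than 128.

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  uint64_t paddingFor(uint64_t InitSize) {
    uint64_t Padding = llvm::NextPowerOf2(InitSize - 1) - InitSize;
    if (Padding > 32)
      Padding = llvm::alignTo(InitSize, 32) - InitSize;
    return Padding;
  }

  // paddingFor(100) == 28  (next power of two is 128; 28 is under the cap)
  // paddingFor(300) == 20  (512 - 300 = 212 exceeds 32, so align to 320)
  // paddingFor(128) == 0   (already a power of two)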
@@ -936,14 +980,23 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
assert(F->getType()->getAddressSpace() == 0);
- // Declaration of a local function - nothing to do.
- if (F->isDeclarationForLinker() && isDefinition)
- return;
-
GlobalValue::VisibilityTypes Visibility = F->getVisibility();
std::string Name = F->getName();
- Function *FDecl;
+ if (F->isDeclarationForLinker() && isDefinition) {
+ // Non-dso_local functions may be overridden at run time;
+ // don't short-circuit them.
+ if (F->isDSOLocal()) {
+ Function *RealF = Function::Create(F->getFunctionType(),
+ GlobalValue::ExternalLinkage,
+ Name + ".cfi", &M);
+ RealF->setVisibility(GlobalVariable::HiddenVisibility);
+ replaceDirectCalls(F, RealF);
+ }
+ return;
+ }
+
+ Function *FDecl;
if (F->isDeclarationForLinker() && !isDefinition) {
// Declaration of an external function.
FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
@@ -952,10 +1005,25 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
} else if (isDefinition) {
F->setName(Name + ".cfi");
F->setLinkage(GlobalValue::ExternalLinkage);
- F->setVisibility(GlobalValue::HiddenVisibility);
FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
Name, &M);
FDecl->setVisibility(Visibility);
+ Visibility = GlobalValue::HiddenVisibility;
+
+ // Delete aliases pointing to this function; they'll be re-created in the
+ // merged output.
+ SmallVector<GlobalAlias*, 4> ToErase;
+ for (auto &U : F->uses()) {
+ if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
+ Function *AliasDecl = Function::Create(
+ F->getFunctionType(), GlobalValue::ExternalLinkage, "", &M);
+ AliasDecl->takeName(A);
+ A->replaceAllUsesWith(AliasDecl);
+ ToErase.push_back(A);
+ }
+ }
+ for (auto *A : ToErase)
+ A->eraseFromParent();
} else {
// Function definition without type metadata, where some other translation
// unit contained a declaration with type metadata. This normally happens
@@ -966,9 +1034,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
}
if (F->isWeakForLinker())
- replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isDefinition);
else
- F->replaceAllUsesWith(FDecl);
+ replaceCfiUses(F, FDecl, isDefinition);
+
+ // Set visibility late because it's used in replaceCfiUses() to determine
+ // whether uses need to be replaced.
+ F->setVisibility(Visibility);
}
void LowerTypeTestsModule::lowerTypeTestCalls(
@@ -980,7 +1052,7 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
for (Metadata *TypeId : TypeIds) {
// Build the bitset.
BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
- DEBUG({
+ LLVM_DEBUG({
if (auto MDS = dyn_cast<MDString>(TypeId))
dbgs() << MDS->getString() << ": ";
else
@@ -1150,7 +1222,7 @@ void LowerTypeTestsModule::findGlobalVariableUsersOf(
// Replace all uses of F with (F ? JT : 0).
void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
- Function *F, Constant *JT) {
+ Function *F, Constant *JT, bool IsDefinition) {
// The target expression can not appear in a constant initializer on most
// (all?) targets. Switch to a runtime initializer.
SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
@@ -1163,7 +1235,7 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
Function *PlaceholderFn =
Function::Create(cast<FunctionType>(F->getValueType()),
GlobalValue::ExternalWeakLinkage, "", &M);
- F->replaceAllUsesWith(PlaceholderFn);
+ replaceCfiUses(F, PlaceholderFn, IsDefinition);
Constant *Target = ConstantExpr::getSelect(
ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
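The expression begun above builds the standard weak-symbol guard: a weak declaration may still resolve to null, so its uses cannot be pointed at the jump table unconditionally. Each use becomes select(F != null, JT, null), routed through PlaceholderFn because the select cannot sit in a constant initializer on most targets. The shape of the substituted constant, as a sketch:

  // Every former use of F reads: F == null ? null : jump-table entry.
  Constant *Null = Constant::getNullValue(F->getType());
  Constant *IsLinkedIn = ConstantExpr::getICmp(CmpInst::ICMP_NE, F, Null);
  Constant *JTPtr = ConstantExpr::getBitCast(JT, F->getType());
  Constant *Target = ConstantExpr::getSelect(IsLinkedIn, JTPtr, Null);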
@@ -1226,12 +1298,6 @@ void LowerTypeTestsModule::createJumpTable(
createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
cast<Function>(Functions[I]->getGlobal()));
- // Try to emit the jump table at the end of the text segment.
- // Jump table must come after __cfi_check in the cross-dso mode.
- // FIXME: this magic section name seems to do the trick.
- F->setSection(ObjectFormat == Triple::MachO
- ? "__TEXT,__text,regular,pure_instructions"
- : ".text.cfi");
// Align the whole table by entry size.
F->setAlignment(getJumpTableEntrySize());
// Skip prologue.
@@ -1248,6 +1314,8 @@ void LowerTypeTestsModule::createJumpTable(
// by Clang for -march=armv7.
F->addFnAttr("target-cpu", "cortex-a8");
}
+ // Make sure we don't emit .eh_frame for this function.
+ F->addFnAttr(Attribute::NoUnwind);
BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
IRBuilder<> IRB(BB);
@@ -1389,9 +1457,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
}
if (!IsDefinition) {
if (F->isWeakForLinker())
- replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, IsDefinition);
else
- F->replaceAllUsesWith(CombinedGlobalElemPtr);
+ replaceCfiUses(F, CombinedGlobalElemPtr, IsDefinition);
} else {
assert(F->getType()->getAddressSpace() == 0);
@@ -1401,10 +1469,10 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
FAlias->takeName(F);
if (FAlias->hasName())
F->setName(FAlias->getName() + ".cfi");
- F->replaceUsesExceptBlockAddr(FAlias);
+ replaceCfiUses(F, FAlias, IsDefinition);
+ if (!F->hasLocalLinkage())
+ F->setVisibility(GlobalVariable::HiddenVisibility);
}
- if (!F->isDeclarationForLinker())
- F->setLinkage(GlobalValue::InternalLinkage);
}
createJumpTable(JumpTableFn, Functions);
@@ -1447,7 +1515,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
}
void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
DenseMap<Metadata *, uint64_t> TypeIdIndices;
for (unsigned I = 0; I != TypeIds.size(); ++I)
TypeIdIndices[TypeIds[I]] = I;
@@ -1456,15 +1525,25 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
// the type identifier.
std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
unsigned GlobalIndex = 0;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
for (GlobalTypeMember *GTM : Globals) {
for (MDNode *Type : GTM->types()) {
// Type = { offset, type identifier }
- unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
- TypeMembers[TypeIdIndex].insert(GlobalIndex);
+ auto I = TypeIdIndices.find(Type->getOperand(1));
+ if (I != TypeIdIndices.end())
+ TypeMembers[I->second].insert(GlobalIndex);
}
+ GlobalIndices[GTM] = GlobalIndex;
GlobalIndex++;
}
+ for (ICallBranchFunnel *JT : ICallBranchFunnels) {
+ TypeMembers.emplace_back();
+ std::set<uint64_t> &TMSet = TypeMembers.back();
+ for (GlobalTypeMember *T : JT->targets())
+ TMSet.insert(GlobalIndices[T]);
+ }
+
// Order the sets of indices by size. The GlobalLayoutBuilder works best
// when given small index sets first.
std::stable_sort(
@@ -1514,7 +1593,7 @@ LowerTypeTestsModule::LowerTypeTestsModule(
}
bool LowerTypeTestsModule::runForTesting(Module &M) {
- ModuleSummaryIndex Summary;
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
// Handle the command-line summary arguments. This code is for testing
// purposes only, so we handle errors directly.
@@ -1549,11 +1628,71 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
return Changed;
}
+static bool isDirectCall(Use& U) {
+ auto *Usr = dyn_cast<CallInst>(U.getUser());
+ if (Usr) {
+ CallSite CS(Usr);
+ if (CS.isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefinition) {
+ SmallSetVector<Constant *, 4> Constants;
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ // Skip block addresses
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+
+ // Skip direct calls to externally defined or non-dso_local functions
+ if (isDirectCall(U) && (Old->isDSOLocal() || !IsDefinition))
+ continue;
+
+ // Must handle Constants specially; we cannot call replaceUsesOfWith on a
+ // constant because constants are uniqued.
+ if (auto *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ // Save unique users to avoid processing operand replacement
+ // more than once.
+ Constants.insert(C);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+
+ // Process operand replacement of saved constants.
+ for (auto *C : Constants)
+ C->handleOperandChange(Old, New);
+}
+
+void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ if (!isDirectCall(U))
+ continue;
+
+ U.set(New);
+ }
+}
+
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
- !ImportSummary)
+ Function *ICallBranchFunnelFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
+ !ExportSummary && !ImportSummary)
return false;
if (ImportSummary) {
@@ -1565,6 +1704,10 @@ bool LowerTypeTestsModule::lower() {
}
}
+ if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
+ report_fatal_error(
+ "unexpected call to llvm.icall.branch.funnel during import phase");
+
SmallVector<Function *, 8> Defs;
SmallVector<Function *, 8> Decls;
for (auto &F : M) {
@@ -1589,8 +1732,8 @@ bool LowerTypeTestsModule::lower() {
// Equivalence class set containing type identifiers and the globals that
// reference them. This is used to partition the set of type identifiers in
// the module into disjoint sets.
- using GlobalClassesTy =
- EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>;
+ using GlobalClassesTy = EquivalenceClasses<
+ PointerUnion3<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
GlobalClassesTy GlobalClasses;
// Verify the type metadata and build a few data structures to let us
@@ -1602,33 +1745,61 @@ bool LowerTypeTestsModule::lower() {
// identifiers.
BumpPtrAllocator Alloc;
struct TIInfo {
- unsigned Index;
+ unsigned UniqueId;
std::vector<GlobalTypeMember *> RefGlobals;
};
DenseMap<Metadata *, TIInfo> TypeIdInfo;
- unsigned I = 0;
+ unsigned CurUniqueId = 0;
SmallVector<MDNode *, 2> Types;
+ // Cross-DSO CFI emits jumptable entries for exported functions as well as
+ // address-taken functions, in case they are address-taken in other modules.
+ const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
+
struct ExportedFunctionInfo {
CfiFunctionLinkage Linkage;
MDNode *FuncMD; // {name, linkage, type[, type...]}
};
DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
if (ExportSummary) {
+ // A set of all functions that are address taken by a live global object.
+ DenseSet<GlobalValue::GUID> AddressTaken;
+ for (auto &I : *ExportSummary)
+ for (auto &GVS : I.second.SummaryList)
+ if (GVS->isLive())
+ for (auto &Ref : GVS->refs())
+ AddressTaken.insert(Ref.getGUID());
+
NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
if (CfiFunctionsMD) {
for (auto FuncMD : CfiFunctionsMD->operands()) {
assert(FuncMD->getNumOperands() >= 2);
StringRef FunctionName =
cast<MDString>(FuncMD->getOperand(0))->getString();
- if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
- GlobalValue::dropLLVMManglingEscape(FunctionName))))
- continue;
CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
cast<ConstantAsMetadata>(FuncMD->getOperand(1))
->getValue()
->getUniqueInteger()
.getZExtValue());
+ const GlobalValue::GUID GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName));
+ // Do not emit jumptable entries for functions that are not live and
+ // have no live references (and are not exported with cross-DSO CFI).
+ if (!ExportSummary->isGUIDLive(GUID))
+ continue;
+ if (!AddressTaken.count(GUID)) {
+ if (!CrossDsoCfi || Linkage != CFL_Definition)
+ continue;
+
+ bool Exported = false;
+ if (auto VI = ExportSummary->getValueInfo(GUID))
+ for (auto &GVS : VI.getSummaryList())
+ if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
+ Exported = true;
+
+ if (!Exported)
+ continue;
+ }
auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
if (!P.second && P.first->second.Linkage != CFL_Definition)
P.first->second = {Linkage, FuncMD};
@@ -1656,6 +1827,11 @@ bool LowerTypeTestsModule::lower() {
F->clearMetadata();
}
+ // Update the linkage for extern_weak declarations when a definition
+ // exists.
+ if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
+ F->setLinkage(GlobalValue::ExternalLinkage);
+
// If the function in the full LTO module is a declaration, replace its
// type metadata with the type metadata we found in cfi.functions. That
// metadata is presumed to be more accurate than the metadata attached
@@ -1673,28 +1849,37 @@ bool LowerTypeTestsModule::lower() {
}
}
+ DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
for (GlobalObject &GO : M.global_objects()) {
if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
continue;
Types.clear();
GO.getMetadata(LLVMContext::MD_type, Types);
- if (Types.empty())
- continue;
bool IsDefinition = !GO.isDeclarationForLinker();
bool IsExported = false;
- if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
- IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
- IsExported = true;
+ if (Function *F = dyn_cast<Function>(&GO)) {
+ if (ExportedFunctions.count(F->getName())) {
+ IsDefinition |= ExportedFunctions[F->getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ // TODO: The logic here checks only that the function is address taken,
+ // not that the address takers are live. This can be updated to check
+ // their liveness and emit fewer jumptable entries once monolithic LTO
+ // builds also emit summaries.
+ } else if (!F->hasAddressTaken()) {
+ if (!CrossDsoCfi || !IsDefinition || F->hasLocalLinkage())
+ continue;
+ }
}
auto *GTM =
GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
+ GlobalTypeMembers[&GO] = GTM;
for (MDNode *Type : Types) {
verifyTypeMDNode(&GO, Type);
auto &Info = TypeIdInfo[Type->getOperand(1)];
- Info.Index = ++I;
+ Info.UniqueId = ++CurUniqueId;
Info.RefGlobals.push_back(GTM);
}
}
@@ -1731,6 +1916,44 @@ bool LowerTypeTestsModule::lower() {
}
}
+ if (ICallBranchFunnelFunc) {
+ for (const Use &U : ICallBranchFunnelFunc->uses()) {
+ if (Arch != Triple::x86_64)
+ report_fatal_error(
+ "llvm.icall.branch.funnel not supported on this target");
+
+ auto CI = cast<CallInst>(U.getUser());
+
+ std::vector<GlobalTypeMember *> Targets;
+ if (CI->getNumArgOperands() % 2 != 1)
+ report_fatal_error("number of arguments should be odd");
+
+ GlobalClassesTy::member_iterator CurSet;
+ for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ CI->getOperand(I), Offset, M.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "Expected branch funnel operand to be global value");
+
+ GlobalTypeMember *GTM = GlobalTypeMembers[Base];
+ Targets.push_back(GTM);
+ GlobalClassesTy::member_iterator NewSet =
+ GlobalClasses.findLeader(GlobalClasses.insert(GTM));
+ if (I == 1)
+ CurSet = NewSet;
+ else
+ CurSet = GlobalClasses.unionSets(CurSet, NewSet);
+ }
+
+ GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(
+ GlobalClasses.insert(ICallBranchFunnel::create(
+ Alloc, CI, Targets, ++CurUniqueId))));
+ }
+ }
+
if (ExportSummary) {
DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
for (auto &P : TypeIdInfo) {
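The funnel-processing loop above decodes the intrinsic's argument list as one called pointer followed by pairs (each pair presumably an address point and its implementation), which is why an even operand count is rejected; the odd positions feed GetPointerBaseWithConstantOffset and their global bases are unioned into a single disjoint set so the whole funnel is laid out together. The pairing logic in isolation, as a runnable sketch over plain values:

  #include <cassert>
  #include <cstddef>
  #include <utility>
  #include <vector>

  // Args[0] is the pointer being called; Args[1]/Args[2] describe the first
  // target, Args[3]/Args[4] the second, and so on. Hence size() % 2 == 1.
  template <typename T>
  std::vector<std::pair<T, T>> decodeFunnelArgs(const std::vector<T> &Args) {
    assert(Args.size() % 2 == 1 && "number of arguments should be odd");
    std::vector<std::pair<T, T>> Targets;
    for (std::size_t I = 1; I != Args.size(); I += 2)
      Targets.push_back({Args[I], Args[I + 1]});
    return Targets;
  }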
@@ -1764,54 +1987,124 @@ bool LowerTypeTestsModule::lower() {
continue;
++NumTypeIdDisjointSets;
- unsigned MaxIndex = 0;
+ unsigned MaxUniqueId = 0;
for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
MI != GlobalClasses.member_end(); ++MI) {
- if ((*MI).is<Metadata *>())
- MaxIndex = std::max(MaxIndex, TypeIdInfo[MI->get<Metadata *>()].Index);
+ if (auto *MD = MI->dyn_cast<Metadata *>())
+ MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
+ else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
+ MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
}
- Sets.emplace_back(I, MaxIndex);
+ Sets.emplace_back(I, MaxUniqueId);
}
- std::sort(Sets.begin(), Sets.end(),
- [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
- const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
- return S1.second < S2.second;
- });
+ llvm::sort(Sets.begin(), Sets.end(),
+ [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
+ const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
+ return S1.second < S2.second;
+ });
// For each disjoint set we found...
for (const auto &S : Sets) {
// Build the list of type identifiers in this disjoint set.
std::vector<Metadata *> TypeIds;
std::vector<GlobalTypeMember *> Globals;
+ std::vector<ICallBranchFunnel *> ICallBranchFunnels;
for (GlobalClassesTy::member_iterator MI =
GlobalClasses.member_begin(S.first);
MI != GlobalClasses.member_end(); ++MI) {
- if ((*MI).is<Metadata *>())
+ if (MI->is<Metadata *>())
TypeIds.push_back(MI->get<Metadata *>());
- else
+ else if (MI->is<GlobalTypeMember *>())
Globals.push_back(MI->get<GlobalTypeMember *>());
+ else
+ ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
}
- // Order type identifiers by global index for determinism. This ordering is
- // stable as there is a one-to-one mapping between metadata and indices.
- std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
- return TypeIdInfo[M1].Index < TypeIdInfo[M2].Index;
+ // Order type identifiers by unique ID for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and unique IDs.
+ llvm::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
+ return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
});
+ // Same for the branch funnels.
+ llvm::sort(ICallBranchFunnels.begin(), ICallBranchFunnels.end(),
+ [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
+ return F1->UniqueId < F2->UniqueId;
+ });
+
// Build bitsets for this disjoint set.
- buildBitSetsFromDisjointSet(TypeIds, Globals);
+ buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
}
allocateByteArrays();
+ // Parse alias data to replace stand-in function declarations for aliases
+ // with an alias to the intended target.
+ if (ExportSummary) {
+ if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ for (auto AliasMD : AliasesMD->operands()) {
+ assert(AliasMD->getNumOperands() >= 4);
+ StringRef AliasName =
+ cast<MDString>(AliasMD->getOperand(0))->getString();
+ StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(Aliasee) ||
+ ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
+ !M.getNamedAlias(Aliasee))
+ continue;
+
+ GlobalValue::VisibilityTypes Visibility =
+ static_cast<GlobalValue::VisibilityTypes>(
+ cast<ConstantAsMetadata>(AliasMD->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ bool Weak =
+ static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+
+ auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
+ Alias->setVisibility(Visibility);
+ if (Weak)
+ Alias->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ if (auto *F = M.getFunction(AliasName)) {
+ Alias->takeName(F);
+ F->replaceAllUsesWith(Alias);
+ F->eraseFromParent();
+ } else {
+ Alias->setName(AliasName);
+ }
+ }
+ }
+ }
+
+ // Emit .symver directives for exported functions, if they exist.
+ if (ExportSummary) {
+ if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
+ for (auto Symver : SymversMD->operands()) {
+ assert(Symver->getNumOperands() >= 2);
+ StringRef SymbolName =
+ cast<MDString>(Symver->getOperand(0))->getString();
+ StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(SymbolName))
+ continue;
+
+ M.appendModuleInlineAsm(
+ (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
+ }
+ }
+ }
+
return true;
}
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
- /*ImportSummary=*/nullptr)
- .lower();
+ bool Changed = LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 76b90391fbb1..139941127dee 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -90,7 +90,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Argument.h"
@@ -407,10 +407,10 @@ bool MergeFunctions::runOnModule(Module &M) {
std::vector<WeakTrackingVH> Worklist;
Deferred.swap(Worklist);
- DEBUG(doSanityCheck(Worklist));
+ LLVM_DEBUG(doSanityCheck(Worklist));
- DEBUG(dbgs() << "size of module: " << M.size() << '\n');
- DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
// Insert functions and merge them.
for (WeakTrackingVH &I : Worklist) {
@@ -421,7 +421,7 @@ bool MergeFunctions::runOnModule(Module &M) {
Changed |= insert(F);
}
}
- DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
} while (!Deferred.empty());
FnTree.clear();
@@ -498,19 +498,20 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
// parameter debug info, from the entry block.
void MergeFunctions::eraseInstsUnrelatedToPDI(
std::vector<Instruction *> &PDIUnrelatedWL) {
- DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
- "entry block) unrelated to parameter debug info from entry "
- "block: {\n");
+ LLVM_DEBUG(
+ dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
while (!PDIUnrelatedWL.empty()) {
Instruction *I = PDIUnrelatedWL.back();
- DEBUG(dbgs() << " Deleting Instruction: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
I->eraseFromParent();
PDIUnrelatedWL.pop_back();
}
- DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
- "debug info from entry block. \n");
+ LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
}
// Reduce G to its entry block.
@@ -543,99 +544,113 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
BI != BIE; ++BI) {
if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
- DEBUG(dbgs() << " Deciding: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
DILocalVariable *DILocVar = DVI->getVariable();
if (DILocVar->isParameter()) {
- DEBUG(dbgs() << " Include (parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
- DEBUG(dbgs() << " Deciding: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
DILocalVariable *DILocVar = DDI->getVariable();
if (DILocVar->isParameter()) {
- DEBUG(dbgs() << " Parameter: ");
- DEBUG(DILocVar->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " Parameter: ");
+ LLVM_DEBUG(DILocVar->print(dbgs()));
AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
if (AI) {
- DEBUG(dbgs() << " Processing alloca users: ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Processing alloca users: ");
+ LLVM_DEBUG(dbgs() << "\n");
for (User *U : AI->users()) {
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (Value *Arg = SI->getValueOperand()) {
if (dyn_cast<Argument>(Arg)) {
- DEBUG(dbgs() << " Include: ");
- DEBUG(AI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(AI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(AI);
- DEBUG(dbgs() << " Include (parameter): ");
- DEBUG(SI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(SI);
- DEBUG(dbgs() << " Include: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(SI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
} else {
- DEBUG(dbgs() << " Defer: ");
- DEBUG(U->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(U->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
} else {
- DEBUG(dbgs() << " Delete (alloca NULL): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else {
- DEBUG(dbgs() << " Delete (!parameter): ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
} else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
- DEBUG(dbgs() << " Will Include Terminator: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIRelated.insert(&*BI);
} else {
- DEBUG(dbgs() << " Defer: ");
- DEBUG(BI->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
- DEBUG(dbgs()
- << " Report parameter debug info related/related instructions: {\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Report parameter debug info related/related instructions: {\n");
for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
BI != BE; ++BI) {
Instruction *I = &*BI;
if (PDIRelated.find(I) == PDIRelated.end()) {
- DEBUG(dbgs() << " !PDIRelated: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " !PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
PDIUnrelatedWL.push_back(I);
} else {
- DEBUG(dbgs() << " PDIRelated: ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
}
- DEBUG(dbgs() << " }\n");
+ LLVM_DEBUG(dbgs() << " }\n");
+}
+
+// Don't merge tiny functions using a thunk, since it can just end up
+// making the function larger.
+static bool isThunkProfitable(Function *F) {
+ if (F->size() == 1) {
+ if (F->front().size() <= 2) {
+ LLVM_DEBUG(dbgs() << "isThunkProfitable: " << F->getName()
+ << " is too small to bother creating a thunk for\n");
+ return false;
+ }
+ }
+ return true;
}
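isThunkProfitable hoists the old size check out of writeThunk so both merge paths can consult it before doing any work: a thunk is in essence one call plus one return, so a function whose single block holds at most two instructions cannot get smaller by becoming one. Restated standalone, as a sketch rather than the patch's code:

  #include "llvm/IR/Function.h"

  // True when replacing F's body with a call-plus-ret thunk could shrink it:
  // more than one block, or a first block with more than two instructions.
  static bool worthThunking(const llvm::Function &F) {
    return F.size() != 1 || F.front().size() > 2;
  }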
// Replace G with a simple tail call to bitcast(F). Also (unless
@@ -647,51 +662,19 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
// For better debuggability, under MergeFunctionsPDI, we do not modify G's
// call sites to point to F even when within the same translation unit.
void MergeFunctions::writeThunk(Function *F, Function *G) {
- if (!G->isInterposable() && !MergeFunctionsPDI) {
- if (G->hasGlobalUnnamedAddr()) {
- // G might have been a key in our GlobalNumberState, and it's illegal
- // to replace a key in ValueMap<GlobalValue *> with a non-global.
- GlobalNumbers.erase(G);
- // If G's address is not significant, replace it entirely.
- Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
- G->replaceAllUsesWith(BitcastF);
- } else {
- // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
- // above).
- replaceDirectCallers(G, F);
- }
- }
-
- // If G was internal then we may have replaced all uses of G with F. If so,
- // stop here and delete G. There's no need for a thunk. (See note on
- // MergeFunctionsPDI above).
- if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) {
- G->eraseFromParent();
- return;
- }
-
- // Don't merge tiny functions using a thunk, since it can just end up
- // making the function larger.
- if (F->size() == 1) {
- if (F->front().size() <= 2) {
- DEBUG(dbgs() << "writeThunk: " << F->getName()
- << " is too small to bother creating a thunk for\n");
- return;
- }
- }
-
BasicBlock *GEntryBlock = nullptr;
std::vector<Instruction *> PDIUnrelatedWL;
BasicBlock *BB = nullptr;
Function *NewG = nullptr;
if (MergeFunctionsPDI) {
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
- "function as thunk; retain original: "
- << G->getName() << "()\n");
+ LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
GEntryBlock = &G->getEntryBlock();
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
- "debug info for "
- << G->getName() << "() {\n");
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
GEntryBlock->getTerminator()->eraseFromParent();
BB = GEntryBlock;
@@ -730,13 +713,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
CI->setDebugLoc(CIDbgLoc);
RI->setDebugLoc(RIDbgLoc);
} else {
- DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
- << G->getName() << "()\n");
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
}
eraseTail(G);
eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
- DEBUG(dbgs() << "} // End of parameter related debug info filtering for: "
- << G->getName() << "()\n");
+ LLVM_DEBUG(
+ dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
} else {
NewG->copyAttributesFrom(G);
NewG->takeName(G);
@@ -745,7 +730,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
G->eraseFromParent();
}
- DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
++NumThunksWritten;
}
@@ -754,6 +739,10 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
if (F->isInterposable()) {
assert(G->isInterposable());
+ if (!isThunkProfitable(F)) {
+ return;
+ }
+
// Make them both thunks to the same internal function.
Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
F->getParent());
@@ -770,11 +759,41 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
F->setAlignment(MaxAlignment);
F->setLinkage(GlobalValue::PrivateLinkage);
++NumDoubleWeak;
+ ++NumFunctionsMerged;
} else {
+ // For better debuggability, under MergeFunctionsPDI, we do not modify G's
+ // call sites to point to F even when within the same translation unit.
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ if (G->hasGlobalUnnamedAddr()) {
+ // G might have been a key in our GlobalNumberState, and it's illegal
+ // to replace a key in ValueMap<GlobalValue *> with a non-global.
+ GlobalNumbers.erase(G);
+ // If G's address is not significant, replace it entirely.
+ Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+ G->replaceAllUsesWith(BitcastF);
+ } else {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
+ replaceDirectCallers(G, F);
+ }
+ }
+
+ // If G was internal then we may have replaced all uses of G with F. If so,
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) {
+ G->eraseFromParent();
+ ++NumFunctionsMerged;
+ return;
+ }
+
+ if (!isThunkProfitable(F)) {
+ return;
+ }
+
writeThunk(F, G);
+ ++NumFunctionsMerged;
}
-
- ++NumFunctionsMerged;
}
/// Replace function F by function G.
@@ -806,7 +825,8 @@ bool MergeFunctions::insert(Function *NewFunction) {
if (Result.second) {
assert(FNodesInTree.count(NewFunction) == 0);
FNodesInTree.insert({NewFunction, Result.first});
- DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
+ << '\n');
return false;
}
@@ -827,8 +847,8 @@ bool MergeFunctions::insert(Function *NewFunction) {
assert(OldF.getFunc() != F && "Must have swapped the functions.");
}
- DEBUG(dbgs() << " " << OldF.getFunc()->getName()
- << " == " << NewFunction->getName() << '\n');
+ LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
Function *DeleteF = NewFunction;
mergeTwoFunctions(OldF.getFunc(), DeleteF);
@@ -840,7 +860,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
void MergeFunctions::remove(Function *F) {
auto I = FNodesInTree.find(F);
if (I != FNodesInTree.end()) {
- DEBUG(dbgs() << "Deferred " << F->getName()<< ".\n");
+ LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
FnTree.erase(I->second);
// I->second has been invalidated, remove it from the FNodesInTree map to
// preserve the invariant.
@@ -854,7 +874,7 @@ void MergeFunctions::remove(Function *F) {
void MergeFunctions::removeUsers(Value *V) {
std::vector<Value *> Worklist;
Worklist.push_back(V);
- SmallSet<Value*, 8> Visited;
+ SmallPtrSet<Value*, 8> Visited;
Visited.insert(V);
while (!Worklist.empty()) {
Value *V = Worklist.back();
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index a9cfd8ded6fb..4907e4b30519 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -202,10 +202,8 @@ struct PartialInlinerImpl {
std::function<AssumptionCache &(Function &)> *GetAC,
std::function<TargetTransformInfo &(Function &)> *GTTI,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
- ProfileSummaryInfo *ProfSI,
- std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
- : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI),
- GetORE(GORE) {}
+ ProfileSummaryInfo *ProfSI)
+ : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
bool run(Module &M);
// Main part of the transformation that calls helper functions to find
@@ -217,7 +215,7 @@ struct PartialInlinerImpl {
// outline function due to code size.
std::pair<bool, Function *> unswitchFunction(Function *F);
- // This class speculatively clones the the function to be partial inlined.
+ // This class speculatively clones the function to be partially inlined.
// At the end of partial inlining, the remaining callsites to the cloned
// function that are not partially inlined will be fixed up to reference
// the original function, and the cloned function will be erased.
@@ -271,7 +269,6 @@ private:
std::function<TargetTransformInfo &(Function &)> *GetTTI;
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
// Return the frequency of the OutliningBB relative to F's entry point.
// The result is no larger than 1 and is represented using BP.
@@ -282,7 +279,8 @@ private:
// Return true if the callee of CS should be partially inlined with
// profit.
bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost);
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE);
// Try to inline DuplicateFunction (cloned from F with a call to
// the OutlinedFunction) into its callers. Return true
@@ -337,7 +335,7 @@ private:
std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
std::unique_ptr<FunctionOutliningMultiRegionInfo>
- computeOutliningColdRegionsInfo(Function *F);
+ computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -362,7 +360,6 @@ struct PartialInlinerLegacyPass : public ModulePass {
&getAnalysis<TargetTransformInfoWrapperPass>();
ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- std::unique_ptr<OptimizationRemarkEmitter> UPORE;
std::function<AssumptionCache &(Function &)> GetAssumptionCache =
[&ACT](Function &F) -> AssumptionCache & {
@@ -374,14 +371,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
return TTIWP->getTTI(F);
};
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&UPORE](Function &F) -> OptimizationRemarkEmitter & {
- UPORE.reset(new OptimizationRemarkEmitter(&F));
- return *UPORE.get();
- };
-
- return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI,
- &GetORE)
+ return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
.run(M);
}
};
@@ -389,7 +379,8 @@ struct PartialInlinerLegacyPass : public ModulePass {
} // end anonymous namespace
std::unique_ptr<FunctionOutliningMultiRegionInfo>
-PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
+PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
+ OptimizationRemarkEmitter &ORE) {
BasicBlock *EntryBlock = &F->front();
DominatorTree DT(*F);
@@ -403,8 +394,6 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
} else
BFI = &(*GetBFI)(*F);
- auto &ORE = (*GetORE)(*F);
-
// Return if we don't have profiling information.
if (!PSI->hasInstrumentationProfile())
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
@@ -414,8 +403,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
BasicBlock *Dom = BlockList.front();
- return BlockList.size() > 1 &&
- std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
+ return BlockList.size() > 1 && pred_size(Dom) == 1;
};
auto IsSingleExit =
@@ -567,10 +555,6 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return is_contained(successors(BB), Succ);
};
- auto SuccSize = [](BasicBlock *BB) {
- return std::distance(succ_begin(BB), succ_end(BB));
- };
-
auto IsReturnBlock = [](BasicBlock *BB) {
TerminatorInst *TI = BB->getTerminator();
return isa<ReturnInst>(TI);
@@ -607,7 +591,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
break;
- if (SuccSize(CurrEntry) != 2)
+ if (succ_size(CurrEntry) != 2)
break;
BasicBlock *Succ1 = *succ_begin(CurrEntry);
@@ -681,7 +665,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// peeling off dominating blocks from the outlining region:
while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
BasicBlock *Cand = OutliningInfo->NonReturnBlock;
- if (SuccSize(Cand) != 2)
+ if (succ_size(Cand) != 2)
break;
if (HasNonEntryPred(Cand))
@@ -766,19 +750,19 @@ PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
bool PartialInlinerImpl::shouldPartialInline(
CallSite CS, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost) {
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE) {
using namespace ore;
- if (SkipCostAnalysis)
- return true;
-
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
assert(Callee == Cloner.ClonedFunc);
+ if (SkipCostAnalysis)
+ return isInlineViable(*Callee);
+
Function *Caller = CS.getCaller();
auto &CalleeTTI = (*GetTTI)(*Callee);
- auto &ORE = (*GetORE)(*Caller);
InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
*GetAssumptionCache, GetBFI, PSI, &ORE);
@@ -1270,14 +1254,14 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->user_begin() == F->user_end())
return {false, nullptr};
- auto &ORE = (*GetORE)(*F);
+ OptimizationRemarkEmitter ORE(F);
// Only try to outline cold regions if we have a profile summary, which
// implies we have profiling information.
if (PSI->hasProfileSummary() && F->hasProfileData() &&
!DisableMultiRegionPartialInline) {
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
- computeOutliningColdRegionsInfo(F);
+ computeOutliningColdRegionsInfo(F, ORE);
if (OMRI) {
FunctionCloner Cloner(F, OMRI.get(), ORE);
@@ -1357,11 +1341,11 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
// inlining the function with outlining (The inliner uses the size increase to
// model the cost of inlining a callee).
if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
- auto &ORE = (*GetORE)(*Cloner.OrigFunc);
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
DebugLoc DLoc;
BasicBlock *Block;
std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
- ORE.emit([&]() {
+ OrigFuncORE.emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
DLoc, Block)
<< ore::NV("Function", Cloner.OrigFunc)
@@ -1384,7 +1368,8 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (CalleeEntryCount)
computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
- uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ uint64_t CalleeEntryCountV =
+ (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
bool AnyInline = false;
for (User *User : Users) {
@@ -1393,11 +1378,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (IsLimitReached())
continue;
-
- if (!shouldPartialInline(CS, Cloner, WeightedRcost))
+ OptimizationRemarkEmitter CallerORE(CS.getCaller());
+ if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
continue;
- auto &ORE = (*GetORE)(*CS.getCaller());
// Construct remark before doing the inlining, as after successful inlining
// the callsite is removed.
OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
@@ -1412,7 +1396,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
: nullptr)))
continue;
- ORE.emit(OR);
+ CallerORE.emit(OR);
// Now update the entry count:
if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
@@ -1433,9 +1417,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (AnyInline) {
Cloner.IsFunctionInlined = true;
if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
- auto &ORE = (*GetORE)(*Cloner.OrigFunc);
- ORE.emit([&]() {
+ Cloner.OrigFunc->setEntryCount(
+ CalleeEntryCount.setCount(CalleeEntryCountV));
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ OrigFuncORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
<< "Partially inlined into at least one caller";
});
@@ -1517,14 +1502,9 @@ PreservedAnalyses PartialInlinerPass::run(Module &M,
return FAM.getResult<TargetIRAnalysis>(F);
};
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&FAM](Function &F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- };
-
ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
- if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE)
+ if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
.run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
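The PartialInlining change above drops the cached GetORE callback in favor of constructing an OptimizationRemarkEmitter directly for whichever function a remark concerns. A hedged sketch of that pattern (pass name and remark id are invented):

#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"

static void reportProcessed(llvm::Function &F) {
  // Constructing an ORE on the stack is cheap; it computes
  // BlockFrequencyInfo lazily, only when a hotness check needs it.
  llvm::OptimizationRemarkEmitter ORE(&F);
  ORE.emit([&]() {
    return llvm::OptimizationRemark("example-pass", "Processed", &F)
           << "processed " << llvm::ore::NV("Function", &F);
  });
}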
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 3855e6245d8e..5ced6481996a 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -29,14 +29,18 @@
#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
@@ -92,6 +96,10 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
+static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
+ cl::init(false), cl::Hidden,
+ cl::desc("Enable Unroll And Jam Pass"));
+
static cl::opt<bool>
EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable preparation for ThinLTO."));
@@ -135,10 +143,10 @@ static cl::opt<bool>
cl::Hidden,
cl::desc("Disable shrink-wrap library calls"));
-static cl::opt<bool>
- EnableSimpleLoopUnswitch("enable-simple-loop-unswitch", cl::init(false),
- cl::Hidden,
- cl::desc("Enable the simple loop unswitch pass."));
+static cl::opt<bool> EnableSimpleLoopUnswitch(
+ "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass. Also enables independent "
+ "cleanup passes integrated into the loop pass manager pipeline."));
static cl::opt<bool> EnableGVNSink(
"enable-gvn-sink", cl::init(false), cl::Hidden,
@@ -318,6 +326,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
// Combine silly seq's
+ if (OptLevel > 2)
+ MPM.add(createAggressiveInstCombinerPass());
addInstructionCombiningPass(MPM);
if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
MPM.add(createLibCallsShrinkWrapPass());
@@ -330,6 +340,15 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions
+
+ // Begin the loop pass pipeline.
+ if (EnableSimpleLoopUnswitch) {
+ // The simple loop unswitch pass relies on separate cleanup passes. Schedule
+ // them first so when we re-process a loop they run before other loop
+ // passes.
+ MPM.add(createLoopInstSimplifyPass());
+ MPM.add(createLoopSimplifyCFGPass());
+ }
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
MPM.add(createLICMPass()); // Hoist loop invariants
@@ -337,20 +356,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createSimpleLoopUnswitchLegacyPass());
else
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ // FIXME: We break the loop pass pipeline here in order to do full
+ // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
+ // need for this.
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
+ // We resume loop passes creating a second loop pipeline here.
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops
if (EnableLoopInterchange) {
+ // FIXME: These are function passes and break the loop pass pipeline.
MPM.add(createLoopInterchangePass()); // Interchange loops
MPM.add(createCFGSimplificationPass());
}
if (!DisableUnrollLoops)
MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+ // This ends the loop pass pipelines.
if (OptLevel > 1) {
MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
@@ -431,7 +456,7 @@ void PassManagerBuilder::populateModulePassManager(
// This has to be done after we add the extensions to the pass manager
// as there could be passes (e.g. Address sanitizer) which introduce
// new unnamed globals.
- if (PrepareForThinLTO)
+ if (PrepareForLTO || PrepareForThinLTO)
MPM.add(createNameAnonGlobalPass());
return;
}
@@ -648,6 +673,13 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
+ if (EnableUnrollAndJam) {
+ // Unroll and Jam. We do this before unroll, but it needs to run in a
+ // separate loop pass manager so that the outer loop is processed by
+ // unroll and jam before the inner loop is unrolled.
+ MPM.add(createLoopUnrollAndJamPass(OptLevel));
+ }
+
MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
// LoopUnroll may generate some redundancy to clean up.
@@ -683,7 +715,7 @@ void PassManagerBuilder::populateModulePassManager(
// result too early.
MPM.add(createLoopSinkPass());
// Get rid of LCSSA nodes.
- MPM.add(createInstructionSimplifierPass());
+ MPM.add(createInstSimplifyLegacyPass());
// This hoists/decomposes div/rem ops. It should run after other sink/hoist
// passes to avoid re-sinking, but before SimplifyCFG because it can allow
@@ -695,6 +727,10 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCFGSimplificationPass());
addExtensionsToPM(EP_OptimizerLast, MPM);
+
+ // Rename anon globals so they can be handled in the summary.
+ if (PrepareForLTO)
+ MPM.add(createNameAnonGlobalPass());
}
void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
@@ -765,6 +801,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// simplification opportunities, and both can propagate functions through
// function pointers. When this happens, we often have to resolve varargs
// calls, etc, so let instcombine do this.
+ if (OptLevel > 2)
+ PM.add(createAggressiveInstCombinerPass());
addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
@@ -865,6 +903,8 @@ void PassManagerBuilder::addLateLTOOptimizationPasses(
void PassManagerBuilder::populateThinLTOPassManager(
legacy::PassManagerBase &PM) {
PerformThinLTO = true;
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
if (VerifyInput)
PM.add(createVerifierPass());
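PassManagerBuilder gates new or experimental passes behind hidden command-line flags and opt-level checks, as with EnableUnrollAndJam and the OptLevel > 2 AggressiveInstCombiner above. A sketch of the same gating pattern with made-up names:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> EnableMyExperimentalPass(
    "enable-my-experimental-pass", llvm::cl::init(false), llvm::cl::Hidden,
    llvm::cl::desc("Enable the (hypothetical) experimental pass."));

// A pipeline-population method would then schedule it conditionally, e.g.:
//   if (EnableMyExperimentalPass && OptLevel > 2)
//     MPM.add(createMyExperimentalPass()); // hypothetical factory function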
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 46b088189040..27d791857314 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -27,7 +28,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/Transforms/IPO/SCCP.cpp b/lib/Transforms/IPO/SCCP.cpp
new file mode 100644
index 000000000000..cc53c4b8c46f
--- /dev/null
+++ b/lib/Transforms/IPO/SCCP.cpp
@@ -0,0 +1,58 @@
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+
+using namespace llvm;
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!runIPSCCP(M, DL, &TLI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runIPSCCP(M, DL, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
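The new file above follows the usual dual-registration shape: a new-PM IPSCCPPass::run plus an IPSCCPLegacyPass wrapper, both funneling into runIPSCCP. A usage sketch for the new pass manager side (boilerplate only; assumes an existing Module):

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/SCCP.h"

static void runIPSCCPOn(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  // Register the default analyses (including TargetLibraryAnalysis, which
  // IPSCCPPass::run queries) and wire the managers together.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::IPSCCPPass()); // dispatches to the run() defined above
  MPM.run(M, MAM);
}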
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index a69c009e1a54..dcd24595f7ea 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -22,7 +22,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/SampleProfile.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -37,6 +37,8 @@
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -85,7 +87,7 @@
using namespace llvm;
using namespace sampleprof;
-
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "sample-profile"
// Command line option to specify the file to read samples from. This is
@@ -109,10 +111,10 @@ static cl::opt<unsigned> SampleProfileSampleCoverage(
cl::desc("Emit a warning if less than N% of samples in the input profile "
"are matched to the IR."));
-static cl::opt<double> SampleProfileHotThreshold(
- "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
- cl::desc("Inlined functions that account for more than N% of all samples "
- "collected in the parent function, will be inlined again."));
+static cl::opt<bool> NoWarnSampleUnused(
+ "no-warn-sample-unused", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on warnings about functions that "
+ "have samples but lack the debug information needed to use them."));
namespace {
@@ -130,10 +132,13 @@ public:
bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
uint32_t Discriminator, uint64_t Samples);
unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS) const;
- unsigned countBodyRecords(const FunctionSamples *FS) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS) const;
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
void clear() {
SampleCoverage.clear();
@@ -170,7 +175,7 @@ private:
uint64_t TotalUsedSamples = 0;
};
-/// \brief Sample profile pass.
+/// Sample profile pass.
///
/// This pass reads profile data from the file specified by
/// -sample-profile-file and annotates every affected function with the
@@ -186,7 +191,8 @@ public:
IsThinLTOPreLink(IsThinLTOPreLink) {}
bool doInitialization(Module &M);
- bool runOnModule(Module &M, ModuleAnalysisManager *AM);
+ bool runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI);
void dump() { Reader->dump(); }
@@ -217,28 +223,27 @@ protected:
void buildEdges(Function &F);
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
- unsigned getOffset(const DILocation *DIL) const;
void clearFunctionData();
- /// \brief Map basic blocks to their computed weights.
+ /// Map basic blocks to their computed weights.
///
/// The weight of a basic block is defined to be the maximum
/// of all the instruction weights in that block.
BlockWeightMap BlockWeights;
- /// \brief Map edges to their computed weights.
+ /// Map edges to their computed weights.
///
/// Edge weights are computed by propagating basic block weights in
/// SampleProfile::propagateWeights.
EdgeWeightMap EdgeWeights;
- /// \brief Set of visited blocks during propagation.
+ /// Set of visited blocks during propagation.
SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
- /// \brief Set of visited edges during propagation.
+ /// Set of visited edges during propagation.
SmallSet<Edge, 32> VisitedEdges;
- /// \brief Equivalence classes for block weights.
+ /// Equivalence classes for block weights.
///
/// Two blocks BB1 and BB2 are in the same equivalence class if they
/// dominate and post-dominate each other, and they are in the same loop
@@ -252,47 +257,50 @@ protected:
/// is one-to-one mapping.
StringMap<Function *> SymbolMap;
- /// \brief Dominance, post-dominance and loop information.
+ /// Dominance, post-dominance and loop information.
std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
+ std::unique_ptr<PostDominatorTree> PDT;
std::unique_ptr<LoopInfo> LI;
std::function<AssumptionCache &(Function &)> GetAC;
std::function<TargetTransformInfo &(Function &)> GetTTI;
- /// \brief Predecessors for each basic block in the CFG.
+ /// Predecessors for each basic block in the CFG.
BlockEdgeMap Predecessors;
- /// \brief Successors for each basic block in the CFG.
+ /// Successors for each basic block in the CFG.
BlockEdgeMap Successors;
SampleCoverageTracker CoverageTracker;
- /// \brief Profile reader object.
+ /// Profile reader object.
std::unique_ptr<SampleProfileReader> Reader;
- /// \brief Samples collected for the body of this function.
+ /// Samples collected for the body of this function.
FunctionSamples *Samples = nullptr;
- /// \brief Name of the profile file to load.
+ /// Name of the profile file to load.
std::string Filename;
- /// \brief Flag indicating whether the profile input loaded successfully.
+ /// Flag indicating whether the profile input loaded successfully.
bool ProfileIsValid = false;
- /// \brief Flag indicating if the pass is invoked in ThinLTO compile phase.
+ /// Flag indicating if the pass is invoked in ThinLTO compile phase.
///
/// In this phase, in annotation, we should not promote indirect calls.
/// Instead, we will mark GUIDs that need to be annotated to the function.
bool IsThinLTOPreLink;
- /// \brief Total number of samples collected in this profile.
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Total number of samples collected in this profile.
///
/// This is the sum of all the samples collected in all the functions executed
/// at runtime.
uint64_t TotalCollectedSamples = 0;
- /// \brief Optimization Remark Emitter used to emit diagnostic remarks.
+ /// Optimization Remark Emitter used to emit diagnostic remarks.
OptimizationRemarkEmitter *ORE = nullptr;
};
@@ -326,6 +334,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
private:
@@ -336,7 +345,7 @@ private:
} // end anonymous namespace
-/// Return true if the given callsite is hot wrt to its caller.
+/// Return true if the given callsite is hot w.r.t. the hot cutoff threshold.
///
/// Functions that were inlined in the original binary will be represented
/// in the inline stack in the sample profile. If the profile shows that
@@ -344,28 +353,17 @@ private:
/// frequently), then we will recreate the inline decision and apply the
/// profile from the inlined callsite.
///
-/// To decide whether an inlined callsite is hot, we compute the fraction
-/// of samples used by the callsite with respect to the total number of samples
-/// collected in the caller.
-///
-/// If that fraction is larger than the default given by
-/// SampleProfileHotThreshold, the callsite will be inlined again.
-static bool callsiteIsHot(const FunctionSamples *CallerFS,
- const FunctionSamples *CallsiteFS) {
+/// To decide whether an inlined callsite is hot, we compare the callsite
+/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
+/// callsite is regarded as hot if its count is above the cutoff value.
+static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
if (!CallsiteFS)
return false; // The callsite was not inlined in the original binary.
- uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
- if (ParentTotalSamples == 0)
- return false; // Avoid division by zero.
-
+ assert(PSI && "PSI is expected to be non null");
uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- if (CallsiteTotalSamples == 0)
- return false; // Callsite is trivially cold.
-
- double PercentSamples =
- (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
- return PercentSamples >= SampleProfileHotThreshold;
+ return PSI->isHotCount(CallsiteTotalSamples);
}
/// Mark as used the sample record for the given function samples at
@@ -388,7 +386,8 @@ bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
auto I = SampleCoverage.find(FS);
// The size of the coverage map for FS represents the number of records
@@ -401,8 +400,8 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countUsedRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countUsedRecords(CalleeSamples, PSI);
}
return Count;
@@ -412,15 +411,16 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
unsigned Count = FS->getBodySamples().size();
// Only count records in hot callsites.
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countBodyRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countBodyRecords(CalleeSamples, PSI);
}
return Count;
@@ -430,7 +430,8 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
///
/// This count does not include samples from cold inlined callsites.
uint64_t
-SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
uint64_t Total = 0;
for (const auto &I : FS->getBodySamples())
Total += I.second.getSamples();
@@ -439,8 +440,8 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Total += countBodySamples(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Total += countBodySamples(CalleeSamples, PSI);
}
return Total;
@@ -473,15 +474,8 @@ void SampleProfileLoader::clearFunctionData() {
CoverageTracker.clear();
}
-/// Returns the line offset to the start line of the subprogram.
-/// We assume that a single function will not exceed 65535 LOC.
-unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
- return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
- 0xffff;
-}
-
#ifndef NDEBUG
-/// \brief Print the weight of edge \p E on stream \p OS.
+/// Print the weight of edge \p E on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param E Edge to print.
@@ -490,7 +484,7 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
<< "]: " << EdgeWeights[E] << "\n";
}
-/// \brief Print the equivalence class of block \p BB on stream \p OS.
+/// Print the equivalence class of block \p BB on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
@@ -501,7 +495,7 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
<< "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
}
-/// \brief Print the weight of block \p BB on stream \p OS.
+/// Print the weight of block \p BB on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
@@ -513,7 +507,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
}
#endif
-/// \brief Get the weight for an instruction.
+/// Get the weight for an instruction.
///
/// The "weight" of an instruction \p Inst is the number of samples
/// collected on that instruction at runtime. To retrieve it, we
@@ -549,7 +543,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
return 0;
const DILocation *DIL = DLoc;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
if (R) {
@@ -569,16 +563,16 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
return Remark;
});
}
- DEBUG(dbgs() << " " << DLoc.getLine() << "."
- << DIL->getBaseDiscriminator() << ":" << Inst
- << " (line offset: " << LineOffset << "."
- << DIL->getBaseDiscriminator() << " - weight: " << R.get()
- << ")\n");
+ LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
}
return R;
}
-/// \brief Compute the weight of a basic block.
+/// Compute the weight of a basic block.
///
/// The weight of basic block \p BB is the maximum weight of all the
/// instructions in BB.
@@ -599,7 +593,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
}
-/// \brief Compute and store the weights of every basic block.
+/// Compute and store the weights of every basic block.
///
/// This populates the BlockWeights map by computing
/// the weights of every basic block in the CFG.
@@ -607,7 +601,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
/// \param F The function to query.
bool SampleProfileLoader::computeBlockWeights(Function &F) {
bool Changed = false;
- DEBUG(dbgs() << "Block weights\n");
+ LLVM_DEBUG(dbgs() << "Block weights\n");
for (const auto &BB : F) {
ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
if (Weight) {
@@ -615,13 +609,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {
VisitedBlocks.insert(&BB);
Changed = true;
}
- DEBUG(printBlockWeight(dbgs(), &BB));
+ LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
}
return Changed;
}
-/// \brief Get the FunctionSamples for a call instruction.
+/// Get the FunctionSamples for a call instruction.
///
/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
/// instance in which that call instruction is calling to. It contains
@@ -649,8 +643,11 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
if (FS == nullptr)
return nullptr;
- return FS->findFunctionSamplesAt(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
+ std::string CalleeGUID;
+ CalleeName = getRepInFormat(CalleeName, Reader->getFormat(), CalleeGUID);
+ return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL),
+ DIL->getBaseDiscriminator()),
+ CalleeName);
}
/// Returns a vector of FunctionSamples that are the indirect call targets
@@ -670,7 +667,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
if (FS == nullptr)
return R;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
@@ -678,23 +675,23 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
if (T)
for (const auto &T_C : T.get())
Sum += T_C.second;
- if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
+ if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(LineLocation(
+ FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()))) {
if (M->empty())
return R;
for (const auto &NameFS : *M) {
Sum += NameFS.second.getEntrySamples();
R.push_back(&NameFS.second);
}
- std::sort(R.begin(), R.end(),
- [](const FunctionSamples *L, const FunctionSamples *R) {
- return L->getEntrySamples() > R->getEntrySamples();
- });
+ llvm::sort(R.begin(), R.end(),
+ [](const FunctionSamples *L, const FunctionSamples *R) {
+ return L->getEntrySamples() > R->getEntrySamples();
+ });
}
return R;
}
-/// \brief Get the FunctionSamples for an instruction.
+/// Get the FunctionSamples for an instruction.
///
/// The FunctionSamples of an instruction \p Inst is the inlined instance
/// in which that instruction is coming from. We traverse the inline stack
@@ -710,20 +707,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
if (!DIL)
return Samples;
- const DILocation *PrevDIL = DIL;
- for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
- S.push_back(std::make_pair(
- LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
- PrevDIL->getScope()->getSubprogram()->getLinkageName()));
- PrevDIL = DIL;
- }
- if (S.size() == 0)
- return Samples;
- const FunctionSamples *FS = Samples;
- for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
- FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
- }
- return FS;
+ return Samples->findFunctionSamples(DIL);
}
bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
@@ -759,7 +743,7 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
return false;
}
-/// \brief Iteratively inline hot callsites of a function.
+/// Iteratively inline hot callsites of a function.
///
/// Iteratively traverse all callsites of the function \p F, and find if
/// the corresponding inlined instance exists and is hot in profile. If
@@ -776,6 +760,7 @@ bool SampleProfileLoader::inlineHotFunctions(
Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
DenseSet<Instruction *> PromotedInsns;
bool Changed = false;
+ bool isCompact = (Reader->getFormat() == SPF_Compact_Binary);
while (true) {
bool LocalChanged = false;
SmallVector<Instruction *, 10> CIS;
@@ -787,7 +772,7 @@ bool SampleProfileLoader::inlineHotFunctions(
if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
Candidates.push_back(&I);
- if (callsiteIsHot(Samples, FS))
+ if (callsiteIsHot(FS, PSI))
Hot = true;
}
}
@@ -807,8 +792,8 @@ bool SampleProfileLoader::inlineHotFunctions(
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
if (IsThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() *
- SampleProfileHotThreshold / 100);
+ PSI->getOrCompHotCountThreshold(),
+ isCompact);
continue;
}
auto CalleeFunctionName = FS->getName();
@@ -817,7 +802,9 @@ bool SampleProfileLoader::inlineHotFunctions(
// clone the caller first, and inline the cloned caller if it is
// recursive. As llvm does not inline recursive calls, we will
// simply ignore it instead of handling it explicitly.
- if (CalleeFunctionName == F.getName())
+ std::string FGUID;
+ auto Fname = getRepInFormat(F.getName(), Reader->getFormat(), FGUID);
+ if (CalleeFunctionName == Fname)
continue;
const char *Reason = "Callee function not available";
@@ -836,9 +823,9 @@ bool SampleProfileLoader::inlineHotFunctions(
inlineCallInstruction(DI))
LocalChanged = true;
} else {
- DEBUG(dbgs()
- << "\nFailed to promote indirect call to "
- << CalleeFunctionName << " because " << Reason << "\n");
+ LLVM_DEBUG(dbgs()
+ << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason << "\n");
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
@@ -847,8 +834,8 @@ bool SampleProfileLoader::inlineHotFunctions(
LocalChanged = true;
} else if (IsThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
- InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold(),
+ isCompact);
}
}
if (LocalChanged) {
@@ -860,7 +847,7 @@ bool SampleProfileLoader::inlineHotFunctions(
return Changed;
}
-/// \brief Find equivalence classes for the given block.
+/// Find equivalence classes for the given block.
///
/// This finds all the blocks that are guaranteed to execute the same
/// number of times as \p BB1. To do this, it traverses all the
@@ -917,7 +904,7 @@ void SampleProfileLoader::findEquivalencesFor(
}
}
-/// \brief Find equivalence classes.
+/// Find equivalence classes.
///
/// Since samples may be missing from blocks, we can fill in the gaps by setting
/// the weights of all the blocks in the same equivalence class to the same
@@ -928,14 +915,14 @@ void SampleProfileLoader::findEquivalencesFor(
/// \param F The function to query.
void SampleProfileLoader::findEquivalenceClasses(Function &F) {
SmallVector<BasicBlock *, 8> DominatedBBs;
- DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
// Find equivalence sets based on dominance and post-dominance information.
for (auto &BB : F) {
BasicBlock *BB1 = &BB;
// Compute BB1's equivalence class once.
if (EquivalenceClass.count(BB1)) {
- DEBUG(printBlockEquivalence(dbgs(), BB1));
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
continue;
}
@@ -956,7 +943,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
DT->getDescendants(BB1, DominatedBBs);
findEquivalencesFor(BB1, DominatedBBs, PDT.get());
- DEBUG(printBlockEquivalence(dbgs(), BB1));
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
}
// Assign weights to equivalence classes.
@@ -965,17 +952,18 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {
// the same number of times. Since we know that the head block in
// each equivalence class has the largest weight, assign that weight
// to all the blocks in that equivalence class.
- DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ LLVM_DEBUG(
+ dbgs() << "\nAssign the same weight to all blocks in the same class\n");
for (auto &BI : F) {
const BasicBlock *BB = &BI;
const BasicBlock *EquivBB = EquivalenceClass[BB];
if (BB != EquivBB)
BlockWeights[BB] = BlockWeights[EquivBB];
- DEBUG(printBlockWeight(dbgs(), BB));
+ LLVM_DEBUG(printBlockWeight(dbgs(), BB));
}
}
-/// \brief Visit the given edge to decide if it has a valid weight.
+/// Visit the given edge to decide if it has a valid weight.
///
/// If \p E has not been visited before, we copy to \p UnknownEdge
/// and increment the count of unknown edges.
@@ -996,7 +984,7 @@ uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
return EdgeWeights[E];
}
-/// \brief Propagate weights through incoming/outgoing edges.
+/// Propagate weights through incoming/outgoing edges.
///
/// If the weight of a basic block is known, and there is only one edge
/// with an unknown weight, we can calculate the weight of that edge.
@@ -1012,7 +1000,7 @@ uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
bool SampleProfileLoader::propagateThroughEdges(Function &F,
bool UpdateBlockCount) {
bool Changed = false;
- DEBUG(dbgs() << "\nPropagation through edges\n");
+ LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
for (const auto &BI : F) {
const BasicBlock *BB = &BI;
const BasicBlock *EC = EquivalenceClass[BB];
@@ -1084,9 +1072,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
if (TotalWeight > BBWeight) {
BBWeight = TotalWeight;
Changed = true;
- DEBUG(dbgs() << "All edge weights for " << BB->getName()
- << " known. Set weight for block: ";
- printBlockWeight(dbgs(), BB););
+ LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
}
} else if (NumTotalEdges == 1 &&
EdgeWeights[SingleEdge] < BlockWeights[EC]) {
@@ -1113,8 +1101,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
VisitedEdges.insert(UnknownEdge);
Changed = true;
- DEBUG(dbgs() << "Set weight for edge: ";
- printEdgeWeight(dbgs(), UnknownEdge));
+ LLVM_DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
}
} else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
// If a block Weights 0, all its in/out edges should weight 0.
@@ -1140,8 +1128,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
EdgeWeights[SelfReferentialEdge] = 0;
VisitedEdges.insert(SelfReferentialEdge);
Changed = true;
- DEBUG(dbgs() << "Set self-referential edge weight to: ";
- printEdgeWeight(dbgs(), SelfReferentialEdge));
+ LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
}
if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
BlockWeights[EC] = TotalWeight;
@@ -1154,7 +1142,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F,
return Changed;
}
-/// \brief Build in/out edge lists for each basic block in the CFG.
+/// Build in/out edge lists for each basic block in the CFG.
///
/// We are interested in unique edges. If a block B1 has multiple
/// edges to another block B2, we only add a single B1->B2 edge.
@@ -1190,17 +1178,17 @@ static SmallVector<InstrProfValueData, 2> SortCallTargets(
SmallVector<InstrProfValueData, 2> R;
for (auto I = M.begin(); I != M.end(); ++I)
R.push_back({Function::getGUID(I->getKey()), I->getValue()});
- std::sort(R.begin(), R.end(),
- [](const InstrProfValueData &L, const InstrProfValueData &R) {
- if (L.Count == R.Count)
- return L.Value > R.Value;
- else
- return L.Count > R.Count;
- });
+ llvm::sort(R.begin(), R.end(),
+ [](const InstrProfValueData &L, const InstrProfValueData &R) {
+ if (L.Count == R.Count)
+ return L.Value > R.Value;
+ else
+ return L.Count > R.Count;
+ });
return R;
}
-/// \brief Propagate weights into edges
+/// Propagate weights into edges
///
/// The following rules are applied to every block BB in the CFG:
///
@@ -1265,7 +1253,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// Generate MD_prof metadata for every branch instruction using the
// edge weights computed during propagation.
- DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
LLVMContext &Ctx = F.getContext();
MDBuilder MDB(Ctx);
for (auto &BI : F) {
@@ -1281,7 +1269,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (!DLoc)
continue;
const DILocation *DIL = DLoc;
- uint32_t LineOffset = getOffset(DIL);
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
uint32_t Discriminator = DIL->getBaseDiscriminator();
const FunctionSamples *FS = findFunctionSamples(I);
@@ -1311,10 +1299,10 @@ void SampleProfileLoader::propagateWeights(Function &F) {
continue;
DebugLoc BranchLoc = TI->getDebugLoc();
- DEBUG(dbgs() << "\nGetting weights for branch at line "
- << ((BranchLoc) ? Twine(BranchLoc.getLine())
- : Twine("<UNKNOWN LOCATION>"))
- << ".\n");
+ LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
Instruction *MaxDestInst;
@@ -1322,12 +1310,12 @@ void SampleProfileLoader::propagateWeights(Function &F) {
BasicBlock *Succ = TI->getSuccessor(I);
Edge E = std::make_pair(BB, Succ);
uint64_t Weight = EdgeWeights[E];
- DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
+ LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
// Use uint32_t saturated arithmetic to adjust the incoming weights,
// if needed. Sample counts in profiles are 64-bit unsigned values,
// but internally branch weights are expressed as 32-bit values.
if (Weight > std::numeric_limits<uint32_t>::max()) {
- DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+ LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
// Weight is added by one to avoid propagation errors introduced by
@@ -1348,7 +1336,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// annotation is done twice. If the first annotation already set the
// weights, the second pass does not need to set it.
if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
- DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+ LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
TI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
ORE->emit([&]() {
@@ -1357,12 +1345,12 @@ void SampleProfileLoader::propagateWeights(Function &F) {
<< ore::NV("CondBranchesLoc", BranchLoc);
});
} else {
- DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
}
}
}
-/// \brief Get the line number for the function header.
+/// Get the line number for the function header.
///
/// This looks up function \p F in the current compilation unit and
/// retrieves the line number where the function is defined. This is
@@ -1377,6 +1365,9 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
if (DISubprogram *S = F.getSubprogram())
return S->getLine();
+ if (NoWarnSampleUnused)
+ return 0;
+
// If the start of \p F is missing, emit a diagnostic to inform the user
// about the missed opportunity.
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1390,14 +1381,13 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
DT.reset(new DominatorTree);
DT->recalculate(F);
- PDT.reset(new PostDomTreeBase<BasicBlock>());
- PDT->recalculate(F);
+ PDT.reset(new PostDominatorTree(F));
LI.reset(new LoopInfo);
LI->analyze(*DT);
}
-/// \brief Generate branch weight metadata for all branches in \p F.
+/// Generate branch weight metadata for all branches in \p F.
///
/// Branch weights are computed out of instruction samples using a
/// propagation heuristic. Propagation proceeds in 3 phases:
@@ -1452,8 +1442,8 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
if (getFunctionLoc(F) == 0)
return false;
- DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
- << ": " << getFunctionLoc(F) << "\n");
+ LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
+ << F.getName() << ": " << getFunctionLoc(F) << "\n");
DenseSet<GlobalValue::GUID> InlinedGUIDs;
Changed |= inlineHotFunctions(F, InlinedGUIDs);
@@ -1467,7 +1457,9 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
// Sets the GUIDs that are inlined in the profiled binary. This is used
// for ThinLink to make correct liveness analysis, and also make the IR
// match the profiled binary before annotation.
- F.setEntryCount(Samples->getHeadSamples() + 1, &InlinedGUIDs);
+ F.setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
// Compute dominance and loop info needed for propagation.
computeDominanceAndLoopInfo(F);
@@ -1481,8 +1473,8 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
// If coverage checking was requested, compute it now.
if (SampleProfileRecordCoverage) {
- unsigned Used = CoverageTracker.countUsedRecords(Samples);
- unsigned Total = CoverageTracker.countBodyRecords(Samples);
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileRecordCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1495,7 +1487,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
if (SampleProfileSampleCoverage) {
uint64_t Used = CoverageTracker.getTotalUsedSamples();
- uint64_t Total = CoverageTracker.countBodySamples(Samples);
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileSampleCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
@@ -1514,6 +1506,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
@@ -1538,10 +1531,15 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
return new SampleProfileLoaderLegacyPass(Name);
}
-bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI) {
if (!ProfileIsValid)
return false;
+ PSI = _PSI;
+ if (M.getProfileSummary() == nullptr)
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
+
// Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
@@ -1572,22 +1570,22 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
clearFunctionData();
retval |= runOnFunction(F, AM);
}
- if (M.getProfileSummary() == nullptr)
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
return retval;
}
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
ACT = &getAnalysis<AssumptionCacheTracker>();
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return SampleLoader.runOnModule(M, nullptr);
+ ProfileSummaryInfo *PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return SampleLoader.runOnModule(M, nullptr, PSI);
}
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
// Initialize the entry count to -1, which will be treated conservatively
// by getEntryCount as the same as unknown (None). If we have samples this
// will be overwritten in emitAnnotations.
- F.setEntryCount(-1);
+ F.setEntryCount(ProfileCount(-1, Function::PCT_Real));
std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
if (AM) {
auto &FAM =
@@ -1622,7 +1620,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
SampleLoader.doInitialization(M);
- if (!SampleLoader.runOnModule(M, &AM))
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ if (!SampleLoader.runOnModule(M, &AM, PSI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
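callsiteIsHot now delegates the hotness decision to ProfileSummaryInfo rather than computing a caller-relative percentage. A minimal sketch of that test (PSI is assumed to come from ProfileSummaryAnalysis or its legacy wrapper pass):

#include "llvm/Analysis/ProfileSummaryInfo.h"
#include <cstdint>

static bool isHotSampleCount(llvm::ProfileSummaryInfo *PSI, uint64_t Samples) {
  // isHotCount() compares Samples against the hot-count cutoff derived from
  // the module's profile summary, so the threshold is global to the profile
  // instead of relative to one caller's total.
  return PSI && PSI->isHotCount(Samples);
}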
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index de1b51e206ff..c9afb060a91a 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -21,6 +21,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
@@ -30,7 +31,6 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
namespace {
diff --git a/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
new file mode 100644
index 000000000000..3c5ad37bced1
--- /dev/null
+++ b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -0,0 +1,140 @@
+//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that synthesizes entry counts for
+// functions and attaches !prof metadata to functions with the synthesized
+// counts. The presence of !prof metadata with counter name set to
+// 'synthesized_function_entry_count' indicates that the value of the counter
+// is an estimate of the likely execution count of the function. This
+// transform is applied only in non-PGO mode, as functions get 'real'
+// profile-based function entry counts in PGO mode.
+//
+// The transformation works by first assigning some initial values to the entry
+// counts of all functions and then doing a top-down traversal of the
+// callgraph-scc to propagate the counts. For each function the set of callsites
+// and their relative block frequency is gathered. The relative block frequency
+// is multiplied by the entry count of the caller and added to the callee's
+// entry count. For non-trivial SCCs, the new counts are computed from the previous
+// counts and updated in one shot.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using Scaled64 = ScaledNumber<uint64_t>;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "synthetic-counts-propagation"
+
+/// Initial synthetic count assigned to functions.
+static cl::opt<int>
+ InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
+ cl::ZeroOrMore,
+ cl::desc("Initial value of synthetic entry count."));
+
+/// Initial synthetic count assigned to inline functions.
+static cl::opt<int> InlineSyntheticCount(
+ "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for inline functions."));
+
+/// Initial synthetic count assigned to cold functions.
+static cl::opt<int> ColdSyntheticCount(
+ "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for cold functions."));
+
+// Assign initial synthetic entry counts to functions.
+static void
+initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
+ auto MayHaveIndirectCalls = [](Function &F) {
+ for (auto *U : F.users()) {
+ if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+ return true;
+ }
+ return false;
+ };
+
+ for (Function &F : M) {
+ uint64_t InitialCount = InitialSyntheticCount;
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::AlwaysInline) ||
+ F.hasFnAttribute(Attribute::InlineHint)) {
+ // Use a higher value for inline functions to account for the fact that
+ // these are usually beneficial to inline.
+ InitialCount = InlineSyntheticCount;
+ } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
+ // Local functions without inline hints get counts only through
+ // propagation.
+ InitialCount = 0;
+ } else if (F.hasFnAttribute(Attribute::Cold) ||
+ F.hasFnAttribute(Attribute::NoInline)) {
+ // Use a lower value for noinline and cold functions.
+ InitialCount = ColdSyntheticCount;
+ }
+ SetCount(&F, InitialCount);
+ }
+}
+
+PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ DenseMap<Function *, uint64_t> Counts;
+ // Set initial entry counts.
+ initializeCounts(M, [&](Function *F, uint64_t Count) { Counts[F] = Count; });
+
+ // Compute the relative block frequency for a call edge. Use scaled numbers
+ // and not integers since the relative block frequency could be less than 1.
+ auto GetCallSiteRelFreq = [&](const CallGraphNode::CallRecord &Edge) {
+ Optional<Scaled64> Res = None;
+ if (!Edge.first)
+ return Res;
+ assert(isa<Instruction>(Edge.first));
+ CallSite CS(cast<Instruction>(Edge.first));
+ Function *Caller = CS.getCaller();
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+ BasicBlock *CSBB = CS.getInstruction()->getParent();
+ Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
+ Scaled64 BBFreq(BFI.getBlockFreq(CSBB).getFrequency(), 0);
+ BBFreq /= EntryFreq;
+ return Optional<Scaled64>(BBFreq);
+ };
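+  // Worked example: if the caller's entry block frequency is 16 and the call
+  // site's block frequency is 4, the lambda returns 4/16 = 0.25 -- a value
+  // below 1 that is representable only because Scaled64 is a scaled number
+  // rather than a plain integer.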
+
+ CallGraph CG(M);
+  // Propagate the entry counts on the callgraph.
+ SyntheticCountsUtils<const CallGraph *>::propagate(
+ &CG, GetCallSiteRelFreq,
+ [&](const CallGraphNode *N) { return Counts[N->getFunction()]; },
+ [&](const CallGraphNode *N, uint64_t New) {
+ auto F = N->getFunction();
+ if (!F || F->isDeclaration())
+ return;
+ Counts[F] += New;
+ });
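+  // Illustrative trace (hypothetical numbers): if caller F has entry count 10
+  // and calls G from a block with relative frequency 0.5, propagation adds
+  // 10 * 0.5 = 5 to G's count on top of G's initial count.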
+
+ // Set the counts as metadata.
+ for (auto Entry : Counts)
+ Entry.first->setEntryCount(
+ ProfileCount(Entry.second, Function::PCT_Synthetic));
+
+ return PreservedAnalyses::all();
+}
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index caffc03339c4..8fe7ae1282cc 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -18,11 +18,13 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Pass.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionImport.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -128,8 +130,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
}
GO.addMetadata(
LLVMContext::MD_type,
- *MDNode::get(M.getContext(),
- ArrayRef<Metadata *>{MD->getOperand(0), I->second}));
+ *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
}
}
}
@@ -169,46 +170,17 @@ void simplifyExternals(Module &M) {
}
}
-void filterModule(
- Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
- for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
- I != E;) {
- GlobalAlias *GA = &*I++;
- if (ShouldKeepDefinition(GA))
- continue;
-
- GlobalObject *GO;
- if (GA->getValueType()->isFunctionTy())
- GO = Function::Create(cast<FunctionType>(GA->getValueType()),
- GlobalValue::ExternalLinkage, "", M);
- else
- GO = new GlobalVariable(
- *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- nullptr, "", nullptr,
- GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
- GO->takeName(GA);
- GA->replaceAllUsesWith(GO);
- GA->eraseFromParent();
- }
-
- for (Function &F : *M) {
- if (ShouldKeepDefinition(&F))
- continue;
-
- F.deleteBody();
- F.setComdat(nullptr);
- F.clearMetadata();
- }
-
- for (GlobalVariable &GV : M->globals()) {
- if (ShouldKeepDefinition(&GV))
- continue;
-
- GV.setInitializer(nullptr);
- GV.setLinkage(GlobalValue::ExternalLinkage);
- GV.setComdat(nullptr);
- GV.clearMetadata();
- }
+static void
+filterModule(Module *M,
+ function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ std::vector<GlobalValue *> V;
+ for (GlobalValue &GV : M->global_values())
+ if (!ShouldKeepDefinition(&GV))
+ V.push_back(&GV);
+
+ for (GlobalValue *GV : V)
+ if (!convertToDeclaration(*GV))
+ GV->eraseFromParent();
}
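+// convertToDeclaration() (from FunctionImport.h, newly included above) strips
+// a function body or variable initializer in place; when it cannot convert in
+// place (an alias is instead replaced by a fresh external declaration) it
+// returns false, and the loop above erases the original value.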
void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
@@ -228,13 +200,19 @@ void splitAndWriteThinLTOBitcode(
function_ref<AAResults &(Function &)> AARGetter, Module &M) {
std::string ModuleId = getUniqueModuleId(&M);
if (ModuleId.empty()) {
- // We couldn't generate a module ID for this module, just write it out as a
- // regular LTO module.
- WriteBitcodeToFile(&M, OS);
+ // We couldn't generate a module ID for this module, write it out as a
+ // regular LTO module with an index for summary-based dead stripping.
+ ProfileSummaryInfo PSI(M);
+ M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
+
if (ThinLinkOS)
// We don't have a ThinLTO part, but still write the module to the
// ThinLinkOS if requested so that the expected output file is produced.
- WriteBitcodeToFile(&M, *ThinLinkOS);
+ WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ &Index);
+
return;
}
@@ -243,10 +221,8 @@ void splitAndWriteThinLTOBitcode(
// Returns whether a global has attached type metadata. Such globals may
// participate in CFI or whole-program devirtualization, so they need to
// appear in the merged module instead of the thin LTO module.
- auto HasTypeMetadata = [&](const GlobalObject *GO) {
- SmallVector<MDNode *, 1> MDs;
- GO->getMetadata(LLVMContext::MD_type, MDs);
- return !MDs.empty();
+ auto HasTypeMetadata = [](const GlobalObject *GO) {
+ return GO->hasMetadata(LLVMContext::MD_type);
};
// Collect the set of virtual functions that are eligible for virtual constant
@@ -287,7 +263,7 @@ void splitAndWriteThinLTOBitcode(
ValueToValueMapTy VMap;
std::unique_ptr<Module> MergedM(
- CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+ CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
if (const auto *C = GV->getComdat())
if (MergedMComdats.count(C))
return true;
@@ -298,6 +274,7 @@ void splitAndWriteThinLTOBitcode(
return false;
}));
StripDebugInfo(*MergedM);
+ MergedM->setModuleInlineAsm("");
for (Function &F : *MergedM)
if (!F.isDeclaration()) {
@@ -328,13 +305,13 @@ void splitAndWriteThinLTOBitcode(
promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+ auto &Ctx = MergedM->getContext();
SmallVector<MDNode *, 8> CfiFunctionMDs;
for (auto V : CfiFunctions) {
Function &F = *cast<Function>(V);
SmallVector<MDNode *, 2> Types;
F.getMetadata(LLVMContext::MD_type, Types);
- auto &Ctx = MergedM->getContext();
SmallVector<Metadata *, 4> Elts;
Elts.push_back(MDString::get(Ctx, F.getName()));
CfiFunctionLinkage Linkage;
@@ -357,6 +334,47 @@ void splitAndWriteThinLTOBitcode(
NMD->addOperand(MD);
}
+ SmallVector<MDNode *, 8> FunctionAliases;
+ for (auto &A : M.aliases()) {
+ if (!isa<Function>(A.getAliasee()))
+ continue;
+
+ auto *F = cast<Function>(A.getAliasee());
+
+ Metadata *Elts[] = {
+ MDString::get(Ctx, A.getName()),
+ MDString::get(Ctx, F->getName()),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
+ };
+
+ FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+ if (!FunctionAliases.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
+ for (auto MD : FunctionAliases)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> Symvers;
+ ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
+ Function *F = M.getFunction(Name);
+ if (!F || F->use_empty())
+ return;
+
+ Symvers.push_back(MDTuple::get(
+ Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
+ });
+
+ if (!Symvers.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
+ for (auto MD : Symvers)
+ NMD->addOperand(MD);
+ }
+
simplifyExternals(*MergedM);
// FIXME: Try to re-use BSI and PFI from the original module here.
@@ -376,10 +394,9 @@ void splitAndWriteThinLTOBitcode(
// be used in the backends, and use that in the minimized bitcode
// produced for the full link.
ModuleHash ModHash = {{0}};
- W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
/*GenerateHash=*/true, &ModHash);
- W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
- &MergedMIndex);
+ W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
W.writeSymtab();
W.writeStrtab();
OS << Buffer;
@@ -391,8 +408,8 @@ void splitAndWriteThinLTOBitcode(
Buffer.clear();
BitcodeWriter W2(Buffer);
StripDebugInfo(M);
- W2.writeThinLinkBitcode(&M, Index, ModHash);
- W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ W2.writeThinLinkBitcode(M, Index, ModHash);
+ W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
&MergedMIndex);
W2.writeSymtab();
W2.writeStrtab();
@@ -402,10 +419,8 @@ void splitAndWriteThinLTOBitcode(
// Returns whether this module needs to be split because it uses type metadata.
bool requiresSplit(Module &M) {
- SmallVector<MDNode *, 1> MDs;
for (auto &GO : M.global_objects()) {
- GO.getMetadata(LLVMContext::MD_type, MDs);
- if (!MDs.empty())
+ if (GO.hasMetadata(LLVMContext::MD_type))
return true;
}
@@ -425,13 +440,13 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
// be used in the backends, and use that in the minimized bitcode
// produced for the full link.
ModuleHash ModHash = {{0}};
- WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
/*GenerateHash=*/true, &ModHash);
// If a minimized bitcode module was requested for the thin link, only
// the information that is needed by thin link will be written in the
// given OS.
if (ThinLinkOS && Index)
- WriteThinLinkBitcodeToFile(&M, *ThinLinkOS, *Index, ModHash);
+ WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
}
class WriteThinLTOBitcode : public ModulePass {
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 5fbb001216a3..d65da2504db4 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -111,6 +111,12 @@ static cl::opt<std::string> ClWriteSummary(
cl::desc("Write summary to given YAML file after running pass"),
cl::Hidden);
+static cl::opt<unsigned>
+ ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
+ cl::init(10), cl::ZeroOrMore,
+ cl::desc("Maximum number of call targets per "
+ "call site to enable branch funnels"));
+
// Find the minimum offset that we may store a value of size Size bits at. If
// IsAfter is set, look for an offset before the object, otherwise look for an
// offset after the object.
@@ -281,24 +287,11 @@ struct VirtualCallSite {
DebugLoc DLoc = CS->getDebugLoc();
BasicBlock *Block = CS.getParent();
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // a optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (OREGetter)
- ORE = &OREGetter(F);
- else {
- OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
- ORE = OwnedORE.get();
- }
-
using namespace ore;
- ORE->emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
- << NV("Optimization", OptName) << ": devirtualized a call to "
- << NV("FunctionName", TargetName));
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+ << NV("Optimization", OptName)
+ << ": devirtualized a call to "
+ << NV("FunctionName", TargetName));
}
void replaceAndErase(
@@ -329,12 +322,17 @@ struct CallSiteInfo {
/// cases we are directly operating on the call sites at the IR level.
std::vector<VirtualCallSite> CallSites;
+ /// Whether all call sites represented by this CallSiteInfo, including those
+ /// in summaries, have been devirtualized. This starts off as true because a
+ /// default constructed CallSiteInfo represents no call sites.
+ bool AllCallSitesDevirted = true;
+
// These fields are used during the export phase of ThinLTO and reflect
// information collected from function summaries.
/// Whether any function summary contains an llvm.assume(llvm.type.test) for
/// this slot.
- bool SummaryHasTypeTestAssumeUsers;
+ bool SummaryHasTypeTestAssumeUsers = false;
/// CFI-specific: a vector containing the list of function summaries that use
/// the llvm.type.checked.load intrinsic and therefore will require
@@ -350,8 +348,22 @@ struct CallSiteInfo {
!SummaryTypeCheckedLoadUsers.empty();
}
- /// As explained in the comment for SummaryTypeCheckedLoadUsers.
- void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+ void markSummaryHasTypeTestAssumeUsers() {
+ SummaryHasTypeTestAssumeUsers = true;
+ AllCallSitesDevirted = false;
+ }
+
+ void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
+ SummaryTypeCheckedLoadUsers.push_back(FS);
+ AllCallSitesDevirted = false;
+ }
+
+ void markDevirt() {
+ AllCallSitesDevirted = true;
+
+ // As explained in the comment for SummaryTypeCheckedLoadUsers.
+ SummaryTypeCheckedLoadUsers.clear();
+ }
};
// Call site information collected for a specific VTableSlot.
@@ -386,7 +398,9 @@ CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
unsigned *NumUnsafeUses) {
- findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+ auto &CSI = findCallSiteInfo(CS);
+ CSI.AllCallSitesDevirted = false;
+ CSI.CallSites.push_back({VTable, CS, NumUnsafeUses});
}
struct DevirtModule {
@@ -451,6 +465,12 @@ struct DevirtModule {
VTableSlotInfo &SlotInfo,
WholeProgramDevirtResolution *Res);
+ void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+ bool &IsExported);
+ void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
bool tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<uint64_t> Args);
@@ -484,6 +504,8 @@ struct DevirtModule {
StringRef Name, IntegerType *IntTy,
uint32_t Storage);
+ Constant *getMemberAddr(const TypeMemberInfo *M);
+
void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
Constant *UniqueMemberAddr);
bool tryUniqueRetValOpt(unsigned BitWidth,
@@ -539,7 +561,16 @@ struct WholeProgramDevirt : public ModulePass {
if (skipModule(M))
return false;
- auto OREGetter = function_ref<OptimizationRemarkEmitter &(Function *)>();
+ // In the new pass manager, we can request the optimization
+ // remark emitter pass on a per-function-basis, which the
+ // OREGetter will do for us.
+ // In the old pass manager, this is harder, so we just build
+ // an optimization remark emitter on the fly, when we need it.
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ ORE = make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
if (UseCommandLine)
return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter);
@@ -580,7 +611,8 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
};
- if (!DevirtModule(M, AARGetter, OREGetter, nullptr, nullptr).run())
+ if (!DevirtModule(M, AARGetter, OREGetter, ExportSummary, ImportSummary)
+ .run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -588,7 +620,7 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
bool DevirtModule::runForTesting(
Module &M, function_ref<AAResults &(Function &)> AARGetter,
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
- ModuleSummaryIndex Summary;
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
// Handle the command-line summary arguments. This code is for testing
// purposes only, so we handle errors directly.
@@ -730,10 +762,9 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
if (VCallSite.NumUnsafeUses)
--*VCallSite.NumUnsafeUses;
}
- if (CSInfo.isExported()) {
+ if (CSInfo.isExported())
IsExported = true;
- CSInfo.markDevirt();
- }
+ CSInfo.markDevirt();
};
Apply(SlotInfo.CSInfo);
for (auto &P : SlotInfo.ConstCSInfo)
@@ -789,6 +820,133 @@ bool DevirtModule::trySingleImplDevirt(
return true;
}
+void DevirtModule::tryICallBranchFunnel(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ Triple T(M.getTargetTriple());
+ if (T.getArch() != Triple::x86_64)
+ return;
+
+ if (TargetsForSlot.size() > ClThreshold)
+ return;
+
+ bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
+ if (!HasNonDevirt)
+ for (auto &P : SlotInfo.ConstCSInfo)
+ if (!P.second.AllCallSitesDevirted) {
+ HasNonDevirt = true;
+ break;
+ }
+
+ if (!HasNonDevirt)
+ return;
+
+ FunctionType *FT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
+ Function *JT;
+ if (isa<MDString>(Slot.TypeID)) {
+ JT = Function::Create(FT, Function::ExternalLinkage,
+ getGlobalName(Slot, {}, "branch_funnel"), &M);
+ JT->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ JT = Function::Create(FT, Function::InternalLinkage, "branch_funnel", &M);
+ }
+ JT->addAttribute(1, Attribute::Nest);
+
+ std::vector<Value *> JTArgs;
+ JTArgs.push_back(JT->arg_begin());
+ for (auto &T : TargetsForSlot) {
+ JTArgs.push_back(getMemberAddr(T.TM));
+ JTArgs.push_back(T.Fn);
+ }
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
+ Constant *Intr =
+ Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
+
+ auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
+ CI->setTailCallKind(CallInst::TCK_MustTail);
+ ReturnInst::Create(M.getContext(), nullptr, BB);
+
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ if (IsExported)
+ Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+}
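+// IR sketch of the generated funnel for a slot with two targets (names and
+// offsets illustrative, not produced verbatim by this code):
+//
+//   define hidden void @"__typeid_..._branch_funnel"(i8* nest %vt, ...) {
+//     musttail call void (...) @llvm.icall.branch.funnel(i8* %vt,
+//         i8* <address of slot in vtable1>, void ()* @impl1,
+//         i8* <address of slot in vtable2>, void ()* @impl2, ...)
+//     ret void
+//   }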
+
+void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
+ Constant *JT, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ if (CSInfo.isExported())
+ IsExported = true;
+ if (CSInfo.AllCallSitesDevirted)
+ return;
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ CallSite CS = VCallSite.CS;
+
+ // Jump tables are only profitable if the retpoline mitigation is enabled.
+ Attribute FSAttr = CS.getCaller()->getFnAttribute("target-features");
+ if (FSAttr.hasAttribute(Attribute::None) ||
+ !FSAttr.getValueAsString().contains("+retpoline"))
+ continue;
+
+ if (RemarksEnabled)
+ VCallSite.emitRemark("branch-funnel", JT->getName(), OREGetter);
+
+ // Pass the address of the vtable in the nest register, which is r10 on
+ // x86_64.
+ std::vector<Type *> NewArgs;
+ NewArgs.push_back(Int8PtrTy);
+ for (Type *T : CS.getFunctionType()->params())
+ NewArgs.push_back(T);
+ PointerType *NewFT = PointerType::getUnqual(
+ FunctionType::get(CS.getFunctionType()->getReturnType(), NewArgs,
+ CS.getFunctionType()->isVarArg()));
+
+ IRBuilder<> IRB(CS.getInstruction());
+ std::vector<Value *> Args;
+ Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
+ for (unsigned I = 0; I != CS.getNumArgOperands(); ++I)
+ Args.push_back(CS.getArgOperand(I));
+
+ CallSite NewCS;
+ if (CS.isCall())
+ NewCS = IRB.CreateCall(IRB.CreateBitCast(JT, NewFT), Args);
+ else
+ NewCS = IRB.CreateInvoke(
+ IRB.CreateBitCast(JT, NewFT),
+ cast<InvokeInst>(CS.getInstruction())->getNormalDest(),
+ cast<InvokeInst>(CS.getInstruction())->getUnwindDest(), Args);
+ NewCS.setCallingConv(CS.getCallingConv());
+
+ AttributeList Attrs = CS.getAttributes();
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgAttrs.push_back(AttributeSet::get(
+ M.getContext(), ArrayRef<Attribute>{Attribute::get(
+ M.getContext(), Attribute::Nest)}));
+ for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
+ NewArgAttrs.push_back(Attrs.getParamAttributes(I));
+ NewCS.setAttributes(
+ AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs));
+
+ CS->replaceAllUsesWith(NewCS.getInstruction());
+ CS->eraseFromParent();
+
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ // Don't mark as devirtualized because there may be callers compiled without
+ // retpoline mitigation, which would mean that they are lowered to
+ // llvm.type.test and therefore require an llvm.type.test resolution for the
+ // type identifier.
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
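+// In effect, a virtual call in a caller compiled with "+retpoline", such as
+//   %r = call i32 %vfn(i32 %a)
+// is redirected through the funnel with the vtable address passed in the
+// 'nest' parameter (r10 on x86_64), roughly:
+//   %r = call i32 @"..._branch_funnel"(i8* nest %vtable, i32 %a)
+// Callers without retpoline enabled are deliberately left untouched above.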
+
bool DevirtModule::tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<uint64_t> Args) {
@@ -909,7 +1067,7 @@ Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
// We only need to set metadata if the global is newly created, in which
// case it would not have hidden visibility.
- if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
+ if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
return C;
auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
@@ -941,6 +1099,12 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
CSInfo.markDevirt();
}
+Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
+ Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
+ return ConstantExpr::getGetElementPtr(Int8Ty, C,
+ ConstantInt::get(Int64Ty, M->Offset));
+}
+
bool DevirtModule::tryUniqueRetValOpt(
unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
@@ -960,12 +1124,7 @@ bool DevirtModule::tryUniqueRetValOpt(
// checked for a uniform return value in tryUniformRetValOpt.
assert(UniqueMember);
- Constant *UniqueMemberAddr =
- ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
- UniqueMemberAddr = ConstantExpr::getGetElementPtr(
- Int8Ty, UniqueMemberAddr,
- ConstantInt::get(Int64Ty, UniqueMember->Offset));
-
+ Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
if (CSInfo.isExported()) {
Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
Res->Info = IsOne;
@@ -1352,6 +1511,14 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
break;
}
}
+
+ if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
+ auto *JT = M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
+ Type::getVoidTy(M.getContext()));
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ assert(!IsExported);
+ }
}
void DevirtModule::removeRedundantTypeTests() {
@@ -1421,14 +1588,13 @@ bool DevirtModule::run() {
// FIXME: Only add live functions.
for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
- true;
+ CallSlots[{MD, VF.Offset}]
+ .CSInfo.markSummaryHasTypeTestAssumeUsers();
}
}
for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}]
- .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
}
}
for (const FunctionSummary::ConstVCall &VC :
@@ -1436,7 +1602,7 @@ bool DevirtModule::run() {
for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
CallSlots[{MD, VC.VFunc.Offset}]
.ConstCSInfo[VC.Args]
- .SummaryHasTypeTestAssumeUsers = true;
+ .markSummaryHasTypeTestAssumeUsers();
}
}
for (const FunctionSummary::ConstVCall &VC :
@@ -1444,7 +1610,7 @@ bool DevirtModule::run() {
for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
CallSlots[{MD, VC.VFunc.Offset}]
.ConstCSInfo[VC.Args]
- .SummaryTypeCheckedLoadUsers.push_back(FS);
+ .addSummaryTypeCheckedLoadUser(FS);
}
}
}
@@ -1468,9 +1634,12 @@ bool DevirtModule::run() {
cast<MDString>(S.first.TypeID)->getString())
.WPDRes[S.first.ByteOffset];
- if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
- tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
- DidVirtualConstProp = true;
+ if (!trySingleImplDevirt(TargetsForSlot, S.second, Res)) {
+ DidVirtualConstProp |=
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
+
+ tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
+ }
// Collect functions devirtualized at least for one call site for stats.
if (RemarksEnabled)
@@ -1499,23 +1668,10 @@ bool DevirtModule::run() {
for (const auto &DT : DevirtTargets) {
Function *F = DT.second;
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // a optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (OREGetter)
- ORE = &OREGetter(F);
- else {
- OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
- ORE = OwnedORE.get();
- }
-
using namespace ore;
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
- << "devirtualized " << NV("FunctionName", F->getName()));
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+ << "devirtualized "
+ << NV("FunctionName", F->getName()));
}
}
diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt
index 5cbe804ce3ec..8a3a58e9ecc9 100644
--- a/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/lib/Transforms/InstCombine/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
+tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
+add_public_tablegen_target(InstCombineTableGen)
+
add_llvm_library(LLVMInstCombine
InstructionCombining.cpp
InstCombineAddSub.cpp
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 688897644848..aa31e0d850dd 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -511,7 +511,8 @@ Value *FAddCombine::performFactorization(Instruction *I) {
}
Value *FAddCombine::simplify(Instruction *I) {
- assert(I->isFast() && "Expected 'fast' instruction");
+ assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
+ "Expected 'reassoc'+'nsz' instruction");
// Currently we are not able to handle vector type.
if (I->getType()->isVectorTy())
@@ -855,48 +856,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that:
-/// (sub LHS, RHS) === (sub nsw LHS, RHS)
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-/// TODO: Handle this for Vectors.
-bool InstCombiner::willNotOverflowSignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // If LHS and RHS each have at least two sign bits, the subtraction
- // cannot overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI);
-
- KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI);
-
- // Subtraction of two 2's complement numbers having identical signs will
- // never overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
- return true;
-
- // TODO: implement logic similar to checkRippleForAdd
- return false;
-}
-
-/// \brief Return true if we can prove that:
-/// (sub LHS, RHS) === (sub nuw LHS, RHS)
-bool InstCombiner::willNotOverflowUnsignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
- KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
- if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
- return true;
-
- return false;
-}
-
// Checks if any operand is negative and we can convert add to sub.
// This function checks for the following negative patterns
// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
@@ -964,7 +923,7 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
if (!match(Op1, m_Constant(Op1C)))
return nullptr;
- if (Instruction *NV = foldOpWithConstantIntoOperand(Add))
+ if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
return NV;
Value *X;
@@ -1031,17 +990,148 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
return nullptr;
}
-Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
+// Matches multiplication expression Op * C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns true if such a
+// match is found.
+static bool MatchMul(Value *E, Value *&Op, APInt &C) {
+ const APInt *AI;
+ if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ return false;
+}
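+// For example, `%v = shl i32 %x, 3` matches with Op = %x and C = 8, since a
+// left shift by 3 is a multiplication by 2^3.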
+
+// Matches remainder expression Op % C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns the signedness of
+// the remainder operation in IsSigned. Returns true if such a match is
+// found.
+static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
+ const APInt *AI;
+ IsSigned = false;
+ if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
+ IsSigned = true;
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
+ C = *AI + 1;
+ return true;
+ }
+ return false;
+}
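+// For example, `%v = and i32 %x, 7` matches with Op = %x, C = 8 and IsSigned
+// = false, because masking with 2^k - 1 computes an unsigned remainder by 2^k.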
+// Matches division expression Op / C with the given signedness as indicated
+// by IsSigned, where C is a constant. Returns the constant value in C and the
+// other operand in Op. Returns true if such a match is found.
+static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
+ const APInt *AI;
+ if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (!IsSigned) {
+ if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ }
+ return false;
+}
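+// For example, with IsSigned == false, `%v = lshr i32 %x, 3` matches with
+// Op = %x and C = 8, since a logical shift right by 3 is an unsigned division
+// by 2^3.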
+
+// Returns whether C0 * C1 with the given signedness overflows.
+static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
+ bool overflow;
+ if (IsSigned)
+ (void)C0.smul_ov(C1, overflow);
+ else
+ (void)C0.umul_ov(C1, overflow);
+ return overflow;
+}
+
+// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
+// does not overflow.
+Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (Value *V =
- SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
+ Value *X, *MulOpV;
+ APInt C0, MulOpC;
+ bool IsSigned;
+ // Match I = X % C0 + MulOpV * C0
+ if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
+ (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
+ C0 == MulOpC) {
+ Value *RemOpV;
+ APInt C1;
+ bool Rem2IsSigned;
+      // Match MulOpV = RemOpV % C1
+ if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
+ IsSigned == Rem2IsSigned) {
+ Value *DivOpV;
+ APInt DivOpC;
+ // Match RemOpV = X / C0
+ if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
+ C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
+ Value *NewDivisor =
+ ConstantInt::get(X->getType()->getContext(), C0 * C1);
+ return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
+ : Builder.CreateURem(X, NewDivisor, "urem");
+ }
+ }
+ }
+
+ return nullptr;
+}
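+// Worked example (unsigned, X = 23, C0 = 4, C1 = 3):
+//   23 % 4 = 3, (23 / 4) % 3 = 5 % 3 = 2, 2 * 4 = 8, and 3 + 8 = 11,
+// which equals 23 % (4 * 3) = 23 % 12 = 11, as the transform predicts.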
+
+/// Fold
+/// (1 << NBits) - 1
+/// Into:
+/// ~(-(1 << NBits))
+/// Because a 'not' is better for bit-tracking analysis and other transforms
+/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
+static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *NBits;
+ if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
+ return nullptr;
+
+ Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
+ Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
+  // Be wary of constant folding: CreateShl may return a folded constant
+  // rather than a BinaryOperator, so guard the flag-setting with a dyn_cast.
+ if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
+ // Always NSW. But NUW propagates from `add`.
+ BOp->setHasNoSignedWrap();
+ BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ }
+
+ return BinaryOperator::CreateNot(NotMask, I.getName());
+}
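+// Worked example for i8 with NBits = 3:
+//   (1 << 3) - 1  = 0b00000111 = 7 (the low-bit mask)
+//   shl -1, 3     = 0b11111000 = -8 (the "notmask")
+//   ~0b11111000   = 0b00000111 = 7, the same mask expressed via a 'not'.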
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+ if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -1051,6 +1141,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// FIXME: This should be moved into the above helper function to allow these
// transforms for general constant or constant splat vectors.
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
Type *Ty = I.getType();
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr;
@@ -1123,6 +1214,14 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = checkForNegativeOperand(I, Builder))
return replaceInstUsesWith(I, V);
+ // (A + 1) + ~B --> A - B
+ // ~B + (A + 1) --> A - B
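+  // (~B is -B - 1, so (A + 1) + (-B - 1) simplifies to A - B.)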
+ if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))))
+ return BinaryOperator::CreateSub(A, B);
+
+ // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
+ if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
+
// A+B --> A|B iff A and B have no bits set in common.
if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
return BinaryOperator::CreateOr(LHS, RHS);
@@ -1253,26 +1352,15 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// (add (xor A, B) (and A, B)) --> (or A, B)
- if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
- match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
-
// (add (and A, B) (xor A, B)) --> (or A, B)
- if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
- match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
+ if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B)))))
return BinaryOperator::CreateOr(A, B);
// (add (or A, B) (and A, B)) --> (add A, B)
- if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
- match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
- return &I;
- }
-
// (add (and A, B) (or A, B)) --> (add A, B)
- if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
- match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
+ if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -1281,6 +1369,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// TODO(jingyue): Consider willNotOverflowSignedAdd and
// willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
+ bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
@@ -1290,39 +1379,35 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
I.setHasNoUnsignedWrap(true);
}
+ if (Instruction *V = canonicalizeLowbitMask(I, Builder))
+ return V;
+
return Changed ? &I : nullptr;
}
Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(),
+ if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (isa<Constant>(RHS))
- if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I))
- return FoldedFAdd;
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
- // -A + B --> B - A
- // -A + -B --> -(A + B)
- if (Value *LHSV = dyn_castFNegVal(LHS)) {
- Instruction *RI = BinaryOperator::CreateFSub(RHS, LHSV);
- RI->copyFastMathFlags(&I);
- return RI;
- }
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
- // A + -B --> A - B
- if (!isa<Constant>(RHS))
- if (Value *V = dyn_castFNegVal(RHS)) {
- Instruction *RI = BinaryOperator::CreateFSub(LHS, V);
- RI->copyFastMathFlags(&I);
- return RI;
- }
+ if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
+ return FoldedFAdd;
+
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Value *X;
+ // (-X) + Y --> Y - X
+ if (match(LHS, m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFSubFMF(RHS, X, &I);
+ // Y + (-X) --> Y - X
+ if (match(RHS, m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFSubFMF(LHS, X, &I);
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
@@ -1386,12 +1471,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
return replaceInstUsesWith(I, V);
- if (I.isFast()) {
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// Optimize pointer differences into the same array into a size. Consider:
@@ -1481,21 +1566,20 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
}
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V =
- SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// (A*B)-(A*C) -> A*(B-C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
// If this is a 'B = x-(-A)', change to B = x+A.
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = dyn_castNegVal(Op1)) {
BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
@@ -1519,12 +1603,28 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (match(Op0, m_AllOnes()))
return BinaryOperator::CreateNot(Op1);
+ // (~X) - (~Y) --> Y - X
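+  // (~X is -X - 1, so (-X - 1) - (-Y - 1) simplifies to Y - X.)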
+ Value *X, *Y;
+ if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
+ return BinaryOperator::CreateSub(Y, X);
+
if (Constant *C = dyn_cast<Constant>(Op0)) {
+ bool IsNegate = match(C, m_ZeroInt());
Value *X;
- // C - zext(bool) -> bool ? C - 1 : C
- if (match(Op1, m_ZExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ // 0 - (zext bool) --> sext bool
+ // C - (zext bool) --> bool ? C - 1 : C
+ if (IsNegate)
+ return CastInst::CreateSExtOrBitCast(X, I.getType());
return SelectInst::Create(X, SubOne(C), C);
+ }
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ // 0 - (sext bool) --> zext bool
+ // C - (sext bool) --> bool ? C + 1 : C
+ if (IsNegate)
+ return CastInst::CreateZExtOrBitCast(X, I.getType());
+ return SelectInst::Create(X, AddOne(C), C);
+ }
// C - ~X == X + (1+C)
if (match(Op1, m_Not(m_Value(X))))
@@ -1544,16 +1644,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Constant *C2;
if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
-
- // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
- if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
- if (X->getType()->isIntOrIntVectorTy(1))
- return CastInst::CreateSExtOrBitCast(X, Op1->getType());
-
- // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
- if (C->isNullValue() && match(Op1, m_SExt(m_Value(X))))
- if (X->getType()->isIntOrIntVectorTy(1))
- return CastInst::CreateZExtOrBitCast(X, Op1->getType());
}
const APInt *Op0C;
@@ -1575,6 +1665,22 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
return BinaryOperator::CreateLShr(X, ShAmtOp);
}
+
+ if (Op1->hasOneUse()) {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ // This is a negate of an ABS/NABS pattern. Just swap the operands
+ // of the select.
+ SelectInst *SI = cast<SelectInst>(Op1);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ SI->setTrueValue(FalseVal);
+ SI->setFalseValue(TrueVal);
+ // Don't swap prof metadata, we didn't change the branch behavior.
+ return replaceInstUsesWith(I, SI);
+ }
+ }
}
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
@@ -1678,6 +1784,27 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
return replaceInstUsesWith(I, Res);
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 2 potential commuted variants.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
+ // instructions).
+ Value *A;
+ const APInt *ShAmt;
+ Type *Ty = I.getType();
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
+ // --> (A < 0) ? -A : A
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the sub to the negate.
+ Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
+ I.hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
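+  // Worked example for i32 A = -5: B = ashr(-5, 31) = -1, xor(-5, -1) = 4,
+  // and sub(4, -1) = 5 = |A|; for A >= 0, B is 0 and the pattern is a no-op.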
+
bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
@@ -1692,21 +1819,32 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
}
Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(),
+ if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
+ // Subtraction from -0.0 is the canonical form of fneg.
// fsub nsz 0, X ==> fsub nsz -0.0, X
- if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) {
- // Subtraction from -0.0 is the canonical form of fneg.
- Instruction *NewI = BinaryOperator::CreateFNeg(Op1);
- NewI->copyFastMathFlags(&I);
- return NewI;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (I.hasNoSignedZeros() && match(Op0, m_PosZeroFP()))
+ return BinaryOperator::CreateFNegFMF(Op1, &I);
+
+ // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
+ // Canonicalize to fadd to make analysis easier.
+ // This can also help codegen because fadd is commutative.
+ // Note that if this fsub was really an fneg, the fadd with -0.0 will get
+ // killed later. We still limit that particular transform with 'hasOneUse'
+ // because an fneg is assumed better/cheaper than a generic fsub.
+ Value *X, *Y;
+ if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
+ if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
+ }
}
if (isa<Constant>(Op0))
@@ -1714,34 +1852,34 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Instruction *NV = FoldOpIntoSelect(I, SI))
return NV;
- // If this is a 'B = x-(-A)', change to B = x+A, potentially looking
- // through FP extensions/truncations along the way.
- if (Value *V = dyn_castFNegVal(Op1)) {
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, V);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
- if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) {
- if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) {
- Value *NewTrunc = Builder.CreateFPTrunc(V, I.getType());
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
- } else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) {
- if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) {
- Value *NewExt = Builder.CreateFPExt(V, I.getType());
- Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt);
- NewI->copyFastMathFlags(&I);
- return NewI;
- }
+ // X - C --> X + (-C)
+ // But don't transform constant expressions because there's an inverse fold
+ // for X + (-Y) --> X - Y.
+ Constant *C;
+ if (match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1))
+ return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
+
+ // X - (-Y) --> X + Y
+ if (match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
+
+ // Similar to above, but look through a cast of the negated value:
+ // X - (fptrunc(-Y)) --> X + fptrunc(Y)
+ if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y)))))) {
+ Value *TruncY = Builder.CreateFPTrunc(Y, I.getType());
+ return BinaryOperator::CreateFAddFMF(Op0, TruncY, &I);
+ }
+ // X - (fpext(-Y)) --> X + fpext(Y)
+ if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) {
+ Value *ExtY = Builder.CreateFPExt(Y, I.getType());
+ return BinaryOperator::CreateFAddFMF(Op0, ExtY, &I);
}
  // Handle special cases for FSub with selects feeding the operation
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
- if (I.isFast()) {
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 2364202e5b69..372bc41f780e 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -14,10 +14,10 @@
#include "InstCombineInternal.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -75,7 +75,7 @@ static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
return Builder.CreateFCmp(Pred, LHS, RHS);
}
-/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
/// \param I Binary operator to transform.
/// \return Pointer to node that must replace the original binary operator, or
@@ -305,17 +305,21 @@ static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pre
}
/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
-/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
-/// RHS satisfy.
-static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
- Value *&D, Value *&E, ICmpInst *LHS,
- ICmpInst *RHS,
- ICmpInst::Predicate &PredL,
- ICmpInst::Predicate &PredR) {
+/// Return the pattern classes (from MaskedICmpType) for the left hand side and
+/// the right hand side as a pair.
+/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
+/// and PredR are their predicates, respectively.
+static
+Optional<std::pair<unsigned, unsigned>>
+getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
  // Vectors are not (yet?) supported, and pointers are not supported either.
if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
!RHS->getOperand(0)->getType()->isIntegerTy())
- return 0;
+ return None;
// Here comes the tricky part:
// LHS might be of the form L11 & L12 == X, X == L21 & L22,
@@ -346,7 +350,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
// Bail if LHS was a icmp that can't be decomposed into an equality.
if (!ICmpInst::isEquality(PredL))
- return 0;
+ return None;
Value *R1 = RHS->getOperand(0);
Value *R2 = RHS->getOperand(1);
@@ -360,7 +364,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
A = R12;
D = R11;
} else {
- return 0;
+ return None;
}
E = R2;
R1 = nullptr;
@@ -388,7 +392,7 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
// Bail if RHS was a icmp that can't be decomposed into an equality.
if (!ICmpInst::isEquality(PredR))
- return 0;
+ return None;
// Look for ANDs on the right side of the RHS icmp.
if (!Ok) {
@@ -408,11 +412,11 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
E = R1;
Ok = true;
} else {
- return 0;
+ return None;
}
}
if (!Ok)
- return 0;
+ return None;
if (L11 == A) {
B = L12;
@@ -430,7 +434,174 @@ static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
unsigned RightType = getMaskedICmpType(A, D, E, PredR);
- return LeftType & RightType;
+  return Optional<std::pair<unsigned, unsigned>>(
+      std::make_pair(LeftType, RightType));
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
+/// and the right hand side is of type BMask_Mixed. For example,
+/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
+static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ Value *A, Value *B, Value *C, Value *D, Value *E,
+ ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
+ llvm::InstCombiner::BuilderTy &Builder) {
+ // We are given the canonical form:
+ // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
+ // where D & E == E.
+ //
+ // If IsAnd is false, we get it in negated form:
+ // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
+ // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
+ //
+  // We currently only handle the case where B, C, D and E are constants.
+ //
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ if (!BCst)
+ return nullptr;
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ if (!CCst)
+ return nullptr;
+ ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+ if (!DCst)
+ return nullptr;
+ ConstantInt *ECst = dyn_cast<ConstantInt>(E);
+ if (!ECst)
+ return nullptr;
+
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+
+ // Update E to the canonical form when D is a power of two and RHS is
+ // canonicalized as,
+ // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
+ // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ // If B or D is zero, skip because if LHS or RHS can be trivially folded by
+ // other folding rules and this pattern won't apply any more.
+ if (BCst->getValue() == 0 || DCst->getValue() == 0)
+ return nullptr;
+
+  // If B and D don't intersect, i.e. (B & D) == 0, no folding because we can't
+ // deduce anything from it.
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if ((BCst->getValue() & DCst->getValue()) == 0)
+ return nullptr;
+
+ // If the following two conditions are met:
+ //
+ // 1. mask B covers only a single bit that's not covered by mask D, that is,
+ // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
+ // B and D has only one bit set) and,
+ //
+ // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
+ // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
+ //
+ // then that single bit in B must be one and thus the whole expression can be
+ // folded to
+ // (A & (B | D)) == (B & (B ^ D)) | E.
+ //
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
+ if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
+ (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
+ APInt BorD = BCst->getValue() | DCst->getValue();
+ APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
+ ECst->getValue();
+ Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
+ Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
+ Value *NewAnd = Builder.CreateAnd(A, NewMask);
+ return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
+ }
+
+ auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C1->getValue();
+ };
+ auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C2->getValue();
+ };
+
+ // In the following, we consider only the cases where B is a superset of D, B
+ // is a subset of D, or B == D because otherwise there's at least one bit
+ // covered by B but not D, in which case we can't deduce much from it, so
+ // no folding (aside from the single must-be-one bit case right above.)
+ // For example,
+ // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
+ return nullptr;
+
+ // At this point, either B is a superset of D, B is a subset of D or B == D.
+
+ // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
+ // and the whole expression becomes false (or true if negated), otherwise, no
+ // folding.
+ // For example,
+ // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
+ if (ECst->isZero()) {
+ if (IsSubSetOrEqual(BCst, DCst))
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+ return nullptr;
+ }
+
+ // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
+ // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
+ // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
+ // RHS. For example,
+ // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ if (IsSuperSetOrEqual(BCst, DCst))
+ return RHS;
+ // Otherwise, B is a subset of D. If B and E have a common bit set,
+  // i.e. (B & E) != 0, then LHS is subsumed by RHS. For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
+ if ((BCst->getValue() & ECst->getValue()) != 0)
+ return RHS;
+ // Otherwise, LHS and RHS contradict and the whole expression becomes false
+ // (or true if negated.) For example,
+ // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
+ // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+}
+
+/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
+/// aren't of the common mask pattern type.
+static Value *foldLogOpOfMaskedICmpsAsymmetric(
+ ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ Value *A, Value *B, Value *C, Value *D, Value *E,
+ ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
+ unsigned LHSMask, unsigned RHSMask,
+ llvm::InstCombiner::BuilderTy &Builder) {
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ // Handle Mask_NotAllZeros-BMask_Mixed cases.
+ // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
+ // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
+ // which gets swapped to
+ // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
+ if (!IsAnd) {
+ LHSMask = conjugateICmpMask(LHSMask);
+ RHSMask = conjugateICmpMask(RHSMask);
+ }
+ if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ LHS, RHS, IsAnd, A, B, C, D, E,
+ PredL, PredR, Builder)) {
+ return V;
+ }
+ } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ RHS, LHS, IsAnd, A, D, E, B, C,
+ PredR, PredL, Builder)) {
+ return V;
+ }
+ }
+ return nullptr;
+}

/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
@@ -439,13 +610,24 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
llvm::InstCombiner::BuilderTy &Builder) {
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- unsigned Mask =
+ Optional<std::pair<unsigned, unsigned>> MaskPair =
getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
- if (Mask == 0)
+ if (!MaskPair)
return nullptr;
-
assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
"Expected equality predicates for masked type of icmps.");
+ unsigned LHSMask = MaskPair->first;
+ unsigned RHSMask = MaskPair->second;
+ unsigned Mask = LHSMask & RHSMask;
+ if (Mask == 0) {
+ // Even if the two sides don't share a common pattern, check if folding can
+ // still happen.
+ if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
+ LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
+ Builder))
+ return V;
+ return nullptr;
+ }
// In full generality:
// (icmp (A & B) Op C) | (icmp (A & D) Op E)
@@ -939,8 +1121,8 @@ Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd)
return nullptr;
// FCmp canonicalization ensures that (fcmp ord/uno X, X) and
- // (fcmp ord/uno X, C) will be transformed to (fcmp X, 0.0).
- if (match(LHS1, m_Zero()) && LHS1 == RHS1)
+ // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
+ if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
// Ignore the constants because they are obviously not NANs:
// (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
// (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
@@ -1106,8 +1288,8 @@ static Instruction *foldAndToXor(BinaryOperator &I,
// Operand complexity canonicalization guarantees that the 'or' is Op0.
// (A | B) & ~(A & B) --> A ^ B
// (A | B) & ~(B & A) --> A ^ B
- if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B)))))
+ if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
return BinaryOperator::CreateXor(A, B);
// (A | ~B) & (~A | B) --> ~(A ^ B)
@@ -1115,8 +1297,8 @@ static Instruction *foldAndToXor(BinaryOperator &I,
// (~B | A) & (~A | B) --> ~(A ^ B)
// (~B | A) & (B | ~A) --> ~(A ^ B)
if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B))))
+ if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
return nullptr;
@@ -1148,18 +1330,86 @@ static Instruction *foldOrToXor(BinaryOperator &I,
return nullptr;
}
+/// Return true if a constant shift amount is always less than the specified
+/// bit-width. If not, the shift could create poison in the narrower type.
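+/// For example (illustrative), a shift amount of 24 is fine when narrowing to
+/// i32 but would create poison when narrowing to i8.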
+static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
+ if (auto *ScalarC = dyn_cast<ConstantInt>(C))
+ return ScalarC->getZExtValue() < BitWidth;
+
+ if (C->getType()->isVectorTy()) {
+ // Check each element of a constant vector.
+ unsigned NumElts = C->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || CI->getZExtValue() >= BitWidth)
+ return false;
+ }
+ return true;
+ }
+
+ // The constant is a constant expression or unknown.
+ return false;
+}
+
+/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
+/// a common zext operand: and (binop (zext X), C), (zext X).
+Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) {
+ // This transform could also apply to {or, and, xor}, but there are better
+ // folds for those cases, so we don't expect those patterns here. AShr is not
+ // handled because it should always be transformed to LShr in this sequence.
+ // The subtract transform is different because it has a constant on the left.
+ // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
+ Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
+ Constant *C;
+ if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
+ return nullptr;
+
+ Value *X;
+ if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
+ return nullptr;
+
+ Type *Ty = And.getType();
+ if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
+ return nullptr;
+
+ // If we're narrowing a shift, the shift amount must be safe (less than the
+ // width) in the narrower type. If the shift amount is greater, instsimplify
+ // usually handles that case, but we can't guarantee/assert it.
+ Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
+ if (Opc == Instruction::LShr || Opc == Instruction::Shl)
+ if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
+ return nullptr;
+
+ // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
+ // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
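+ // An illustrative instance of the second form (assuming an i8 -> i32 zext):
+ //   and (add (zext i8 %x to i32), 12), (zext i8 %x to i32)
+ //     --> zext (and (add i8 %x, 12), %x) to i32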
+ Value *NewC = ConstantExpr::getTrunc(C, X->getType());
+ Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
+ : Builder.CreateBinOp(Opc, X, NewC);
+ return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
+}
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAndInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -1177,6 +1427,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
const APInt *C;
if (match(Op1, m_APInt(C))) {
Value *X, *Y;
@@ -1289,9 +1540,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
}
}
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *Z = narrowMaskedBinOp(I))
+ return Z;
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
return DeMorgan;
@@ -1397,7 +1650,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType()));
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// Given an OR instruction, check to see if this is a bswap idiom. If so,
@@ -1424,7 +1677,18 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
match(Op1, m_And(m_Value(), m_Value()));
- if (!OrOfOrs && !OrOfShifts && !OrOfAnds)
+ // (A << B) | (C & D) -> bswap if possible.
+ // The bigger pattern here is ((A & C1) << C2) | ((B >> C2) & C1), which is a
+ // part of the bswap idiom for specific values of C1, C2 (e.g. C1 = 16711935,
+ // C2 = 8 for i32).
+ // This pattern can occur when the operands of the 'or' are not canonicalized
+ // for some reason (not having only one use, for example).
+ bool OrOfAndAndSh = (match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+ match(Op1, m_And(m_Value(), m_Value()))) ||
+ (match(Op0, m_And(m_Value(), m_Value())) &&
+ match(Op1, m_LogicalShift(m_Value(), m_Value())));
+
+ if (!OrOfOrs && !OrOfShifts && !OrOfAnds && !OrOfAndAndSh)
return nullptr;
SmallVector<Instruction*, 4> Insts;
@@ -1448,7 +1712,6 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
return false;
// One element must be all ones, and the other must be all zeros.
- // FIXME: Allow undef elements.
if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
(match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
return false;
@@ -1755,14 +2018,15 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitOr(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyOrInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -1780,14 +2044,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
// Given an OR instruction, check to see if this is a bswap.
if (Instruction *BSwap = MatchBSwap(I))
return BSwap;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
{
Value *A;
const APInt *C;
@@ -2027,7 +2291,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
}
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
/// A ^ B can be specified using other logic ops in a variety of patterns. We
@@ -2045,10 +2309,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (A & B) ^ (B | A) -> A ^ B
// (A | B) ^ (A & B) -> A ^ B
// (A | B) ^ (B & A) -> A ^ B
- if ((match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) ||
- (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Specific(B))))) {
+ if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
+ m_c_Or(m_Deferred(A), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2058,10 +2320,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (~B | A) ^ (~A | B) -> A ^ B
// (~A | B) ^ (A | ~B) -> A ^ B
// (B | ~A) ^ (A | ~B) -> A ^ B
- if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2071,10 +2331,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (~B & A) ^ (~A & B) -> A ^ B
// (~A & B) ^ (A & ~B) -> A ^ B
// (B & ~A) ^ (A & ~B) -> A ^ B
- if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
+ m_c_And(m_Not(m_Deferred(A)), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2113,6 +2371,34 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
}
}
+ // TODO: This can be generalized to compares of non-signbits using
+ // decomposeBitTestICmp(). It could be enhanced more by using (something like)
+ // foldLogOpOfMaskedICmps().
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ LHS0->getType() == RHS0->getType()) {
+ // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
+ // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
+ Value *Zero = ConstantInt::getNullValue(LHS0->getType());
+ return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
+ }
+ // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
+ // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
+ Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
+ return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
+ }
+ }
+
// Instead of trying to imitate the folds for and/or, decompose this 'xor'
// into those logic ops. That is, try to turn this into an and-of-icmps
// because we have many folds for that pattern.
@@ -2140,18 +2426,63 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
return nullptr;
}
+/// If we have a masked merge, in the canonical form of:
+/// (assuming that A only has one use.)
+///   |        A        |  |B|
+///   ((x ^ y) & M) ^ y
+///    |  D  |
+/// * If M is inverted:
+///      |  D  |
+///   ((x ^ y) & ~M) ^ y
+///   We can canonicalize by swapping the final xor operand
+///   to eliminate the 'not' of the mask.
+///   ((x ^ y) & M) ^ x
+/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
+///   because that shortens the dependency chain and improves analysis:
+///     (x & M) | (y & ~M)
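+/// For instance (illustrative, 8-bit values) with M = 0x0F:
+///   ((x ^ y) & 0x0F) ^ y  -->  (x & 0x0F) | (y & 0xF0)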
+static Instruction *visitMaskedMerge(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *B, *X, *D;
+ Value *M;
+ if (!match(&I, m_c_Xor(m_Value(B),
+ m_OneUse(m_c_And(
+ m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
+ m_Value(D)),
+ m_Value(M))))))
+ return nullptr;
+
+ Value *NotM;
+ if (match(M, m_Not(m_Value(NotM)))) {
+ // De-invert the mask and swap the value in B part.
+ Value *NewA = Builder.CreateAnd(D, NotM);
+ return BinaryOperator::CreateXor(NewA, X);
+ }
+
+ Constant *C;
+ if (D->hasOneUse() && match(M, m_Constant(C))) {
+ // Unfold.
+ Value *LHS = Builder.CreateAnd(X, C);
+ Value *NotC = Builder.CreateNot(C);
+ Value *RHS = Builder.CreateAnd(B, NotC);
+ return BinaryOperator::CreateOr(LHS, RHS);
+ }
+
+ return nullptr;
+}
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitXor(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyXorInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *NewXor = foldXorToXor(I, Builder))
return NewXor;
@@ -2168,6 +2499,11 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
+ // A^B --> A|B iff A and B have no bits set in common.
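+ // For example (illustrative): (X & 8) ^ (X & 4) --> (X & 8) | (X & 4).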
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (haveNoCommonBitsSet(Op0, Op1, DL, &AC, &I, &DT))
+ return BinaryOperator::CreateOr(Op0, Op1);
+
// Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
Value *X, *Y;
@@ -2186,6 +2522,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return BinaryOperator::CreateAnd(X, NotY);
}
+ if (Instruction *Xor = visitMaskedMerge(I, Builder))
+ return Xor;
+
// Is this a 'not' (~) fed by a binary operator?
BinaryOperator *NotVal;
if (match(&I, m_Not(m_BinOp(NotVal)))) {
@@ -2206,6 +2545,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
+ // ~(X - Y) --> ~X + Y
+ if (match(NotVal, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(X), Y);
+
// ~(~X >>s Y) --> (X >>s Y)
if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
return BinaryOperator::CreateAShr(X, Y);
@@ -2214,16 +2557,18 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// the 'not' by inverting the constant and using the opposite shift type.
// Canonicalization rules ensure that only a negative constant uses 'ashr',
// but we must check that in case that transform has not fired yet.
- const APInt *C;
- if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ Constant *C;
+ if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_Negative())) {
// ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
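// e.g. (illustrative, i8): ~(0x80 >>s Y) == (0x7F >>u Y) for any valid Y.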
- Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ Constant *NotC = ConstantExpr::getNot(C);
return BinaryOperator::CreateLShr(NotC, Y);
}
- if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_NonNegative())) {
// ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
- Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ Constant *NotC = ConstantExpr::getNot(C);
return BinaryOperator::CreateAShr(NotC, Y);
}
}
@@ -2305,9 +2650,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- if (isa<Constant>(Op1))
- if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
- return FoldedLogic;
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
{
Value *A, *B;
@@ -2397,25 +2741,59 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;
- // Canonicalize the shifty way to code absolute value to the common pattern.
+ // Canonicalize a shifty way to code absolute value to the common pattern.
// There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
// We're relying on the fact that we only do this transform when the shift has
// exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
// instructions).
- if (Op0->getNumUses() == 2)
+ if (Op0->hasNUses(2))
std::swap(Op0, Op1);
const APInt *ShAmt;
Type *Ty = I.getType();
if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->getNumUses() == 2 && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
// B = ashr i32 A, 31 ; smear the sign bit
// xor (add A, B), B ; add -1 and flip bits if negative
// --> (A < 0) ? -A : A
Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- return SelectInst::Create(Cmp, Builder.CreateNeg(A), A);
+ // Copy the nuw/nsw flags from the add to the negate.
+ auto *Add = cast<BinaryOperator>(Op0);
+ Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
+ Add->hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
+
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ //
+ // %notx = xor i32 %x, -1
+ // %cmp1 = icmp sgt i32 %notx, %y
+ // %smax = select i1 %cmp1, i32 %notx, i32 %y
+ // %res = xor i32 %smax, -1
+ // =>
+ // %noty = xor i32 %y, -1
+ // %cmp2 = icmp slt %x, %noty
+ // %res = select i1 %cmp2, i32 %x, i32 %noty
+ //
+ // The same applies to smin/umax/umin.
+ {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
+ if (Op0->hasOneUse() && SelectPatternResult::isMinOrMax(SPF) &&
+ match(Op1, m_AllOnes())) {
+
+ Value *X;
+ if (match(RHS, m_Not(m_Value(X))))
+ std::swap(RHS, LHS);
+
+ if (match(LHS, m_Not(m_Value(X)))) {
+ Value *NotY = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
+ }
+ }
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 40e52ee755e5..cbfbd8a53993 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -57,7 +58,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
@@ -73,11 +73,11 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
- "unfold-element-atomic-memcpy-max-elements",
- cl::init(16),
- cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
- "allowed to unfold"));
+static cl::opt<unsigned> GuardWideningWindow(
+ "instcombine-guard-widening-window",
+ cl::init(3),
+ cl::desc("How wide an instruction window to bypass looking for "
+ "another guard"));
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
@@ -106,97 +106,24 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
- // Try to unfold this intrinsic into sequence of explicit atomic loads and
- // stores.
- // First check that number of elements is compile time constant.
- auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
- if (!LengthCI)
- return nullptr;
-
- // Check that there are not too many elements.
- uint64_t LengthInBytes = LengthCI->getZExtValue();
- uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
- uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
- if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
- return nullptr;
-
- // Only expand if there are elements to copy.
- if (NumElements > 0) {
- // Don't unfold into illegal integers
- uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
- if (!getDataLayout().isLegalInteger(ElementSizeInBits))
- return nullptr;
-
- // Cast source and destination to the correct type. Intrinsic input
- // arguments are usually represented as i8*. Often operands will be
- // explicitly casted to i8* and we can just strip those casts instead of
- // inserting new ones. However it's easier to rely on other InstCombine
- // rules which will cover trivial cases anyway.
- Value *Src = AMI->getRawSource();
- Value *Dst = AMI->getRawDest();
- Type *ElementPointerType =
- Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
- Src->getType()->getPointerAddressSpace());
-
- Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
- "memcpy_unfold.src_casted");
- Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
- "memcpy_unfold.dst_casted");
-
- for (uint64_t i = 0; i < NumElements; ++i) {
- // Get current element addresses
- ConstantInt *ElementIdxCI =
- ConstantInt::get(AMI->getContext(), APInt(64, i));
- Value *SrcElementAddr =
- Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
- Value *DstElementAddr =
- Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
- // Load from the source. Transfer alignment information and mark load as
- // unordered atomic.
- LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
- Load->setOrdering(AtomicOrdering::Unordered);
- // We know alignment of the first element. It is also guaranteed by the
- // verifier that element size is less or equal than first element
- // alignment and both of this values are powers of two. This means that
- // all subsequent accesses are at least element size aligned.
- // TODO: We can infer better alignment but there is no evidence that this
- // will matter.
- Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
- : ElementSizeInBytes);
- Load->setDebugLoc(AMI->getDebugLoc());
-
- // Store loaded value via unordered atomic store.
- StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
- Store->setOrdering(AtomicOrdering::Unordered);
- Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
- : ElementSizeInBytes);
- Store->setDebugLoc(AMI->getDebugLoc());
- }
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
+ unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ unsigned CopyDstAlign = MI->getDestAlignment();
+ if (CopyDstAlign < DstAlign) {
+ MI->setDestAlignment(DstAlign);
+ return MI;
}
- // Set the number of elements of the copy to 0, it will be deleted on the
- // next iteration.
- AMI->setLength(Constant::getNullValue(LengthCI->getType()));
- return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
- unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
- unsigned MinAlign = std::min(DstAlign, SrcAlign);
- unsigned CopyAlign = MI->getAlignment();
-
- if (CopyAlign < MinAlign) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
+ unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ unsigned CopySrcAlign = MI->getSourceAlignment();
+ if (CopySrcAlign < SrcAlign) {
+ MI->setSourceAlignment(SrcAlign);
return MI;
}
// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
// load/store.
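// e.g. (illustrative) a 4-byte memcpy becomes an i32 load from the source
// followed by an i32 store to the destination (through bitcast pointers).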
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
if (!MemOpLength) return nullptr;
// Source and destination pointer types are always "i8*" for intrinsic. See
@@ -222,7 +149,9 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// If the memcpy has metadata describing the members, see if we can get the
// TBAA tag describing our copy.
MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
+ CopyMD = M;
+ } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
if (M->getNumOperands() == 3 && M->getOperand(0) &&
mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
@@ -234,15 +163,11 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
CopyMD = cast<MDNode>(M->getOperand(2));
}
- // If the memcpy/memmove provides better alignment info than we can
- // infer, use it.
- SrcAlign = std::max(SrcAlign, CopyAlign);
- DstAlign = std::max(DstAlign, CopyAlign);
-
Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
- L->setAlignment(SrcAlign);
+ LoadInst *L = Builder.CreateLoad(Src);
+ // Alignment from the mem intrinsic will be better, so use it.
+ L->setAlignment(CopySrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
MDNode *LoopMemParallelMD =
@@ -250,23 +175,34 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
if (LoopMemParallelMD)
L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
- S->setAlignment(DstAlign);
+ StoreInst *S = Builder.CreateStore(L, Dest);
+ // Alignment from the mem intrinsic will be better, so use it.
+ S->setAlignment(CopyDstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
if (LoopMemParallelMD)
S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
// Set the size of the copy to 0; it will be deleted on the next iteration.
- MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
return MI;
}
-Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- if (MI->getAlignment() < Alignment) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
- Alignment, false));
+ if (MI->getDestAlignment() < Alignment) {
+ MI->setDestAlignment(Alignment);
return MI;
}
@@ -276,7 +212,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
return nullptr;
uint64_t Len = LenC->getLimitedValue();
- Alignment = MI->getAlignment();
+ Alignment = MI->getDestAlignment();
assert(Len && "0-sized memory setting should be removed already.");
// memset(s,c,n) -> store s, c (for n=1,2,4,8)
@@ -296,6 +232,8 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
MI->isVolatile());
S->setAlignment(Alignment);
+ if (isa<AtomicMemSetInst>(MI))
+ S->setOrdering(AtomicOrdering::Unordered);
// Set the size of the copy to 0; it will be deleted on the next iteration.
MI->setLength(Constant::getNullValue(LenC->getType()));
@@ -563,55 +501,6 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
-static Value *simplifyX86muldq(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Type *ResTy = II.getType();
- assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
- Arg1->getType()->getScalarSizeInBits() == 32 &&
- ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
-
- // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
- if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
- return ConstantAggregateZero::get(ResTy);
-
- // Constant folding.
- // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
- // vXi64 sext(shuffle<0,2,..>(Arg1))))
- // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
- // vXi64 zext(shuffle<0,2,..>(Arg1))))
- if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
- return nullptr;
-
- unsigned NumElts = ResTy->getVectorNumElements();
- assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
- Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
- "Unexpected muldq/muludq types");
-
- unsigned IntrinsicID = II.getIntrinsicID();
- bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
- Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
- Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
-
- SmallVector<unsigned, 16> ShuffleMask;
- for (unsigned i = 0; i != NumElts; ++i)
- ShuffleMask.push_back(i * 2);
-
- auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
- auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
-
- if (IsSigned) {
- LHS = Builder.CreateSExt(LHS, ResTy);
- RHS = Builder.CreateSExt(RHS, ResTy);
- } else {
- LHS = Builder.CreateZExt(LHS, ResTy);
- RHS = Builder.CreateZExt(RHS, ResTy);
- }
-
- return Builder.CreateMul(LHS, RHS);
-}
-
static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -687,6 +576,105 @@ static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
return ConstantVector::get(Vals);
}
+// Replace X86-specific intrinsics with generic floor-ceil where applicable.
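+// For instance (illustrative), @llvm.x86.sse41.round.ps(%v, 2) becomes
+// @llvm.ceil.v4f32(%v), and the same call with immediate 1 becomes
+// @llvm.floor.v4f32(%v).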
+static Value *simplifyX86round(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ ConstantInt *Arg = nullptr;
+ Intrinsic::ID IntrinsicID = II.getIntrinsicID();
+
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ if (!Arg)
+ return nullptr;
+ unsigned RoundControl = Arg->getZExtValue();
+
+ Arg = nullptr;
+ unsigned SAE = 0;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(5));
+ else
+ SAE = 4;
+ if (!SAE) {
+ if (!Arg)
+ return nullptr;
+ SAE = Arg->getZExtValue();
+ }
+
+ if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/))
+ return nullptr;
+
+ Value *Src, *Dst, *Mask;
+ bool IsScalar = false;
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ IsScalar = true;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Mask = II.getArgOperand(3);
+ Value *Zero = Constant::getNullValue(Mask->getType());
+ Mask = Builder.CreateAnd(Mask, 1);
+ Mask = Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+ Dst = II.getArgOperand(2);
+ } else
+ Dst = II.getArgOperand(0);
+ Src = Builder.CreateExtractElement(II.getArgOperand(1), (uint64_t)0);
+ } else {
+ Src = II.getArgOperand(0);
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512) {
+ Dst = II.getArgOperand(2);
+ Mask = II.getArgOperand(3);
+ } else {
+ Dst = Src;
+ Mask = ConstantInt::getAllOnesValue(
+ Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+ }
+ }
+
+ Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+ Value *Res = Builder.CreateIntrinsic(ID, {Src}, &II);
+ if (!IsScalar) {
+ if (auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Res;
+ auto *MaskTy = VectorType::get(
+ Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ unsigned Width = Src->getType()->getVectorNumElements();
+ if (MaskTy->getVectorNumElements() > Width) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != Width; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, Width));
+ }
+ return Builder.CreateSelect(Mask, Res, Dst);
+ }
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Dst = Builder.CreateExtractElement(Dst, (uint64_t)0);
+ Res = Builder.CreateSelect(Mask, Res, Dst);
+ Dst = II.getArgOperand(0);
+ }
+ return Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
static Value *simplifyX86movmsk(const IntrinsicInst &II) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
@@ -1145,36 +1133,6 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II,
return nullptr;
}
-// Emit a select instruction and appropriate bitcasts to help simplify
-// masked intrinsics.
-static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
- InstCombiner::BuilderTy &Builder) {
- unsigned VWidth = Op0->getType()->getVectorNumElements();
-
- // If the mask is all ones we don't need the select. But we need to check
- // only the bit thats will be used in case VWidth is less than 8.
- if (auto *C = dyn_cast<ConstantInt>(Mask))
- if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
- return Op0;
-
- auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = Builder.CreateBitCast(Mask, MaskTy);
-
- // If we have less than 8 elements, then the starting mask was an i8 and
- // we need to extract down to the right number of elements.
- if (VWidth < 8) {
- uint32_t Indices[4];
- for (unsigned i = 0; i != VWidth; ++i)
- Indices[i] = i;
- Mask = Builder.CreateShuffleVector(Mask, Mask,
- makeArrayRef(Indices, VWidth),
- "extract");
- }
-
- return Builder.CreateSelect(Mask, Op0, Op1);
-}
-
static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -1308,6 +1266,40 @@ static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
return nullptr;
}
+/// This function transforms launder.invariant.group and strip.invariant.group
+/// like:
+/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
+/// launder(strip(%x)) -> launder(%x)
+/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
+/// strip(launder(%x)) -> strip(%x)
+/// This is legal because it preserves the most recent information about
+/// the presence or absence of invariant.group.
+static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
+ InstCombiner &IC) {
+ auto *Arg = II.getArgOperand(0);
+ auto *StrippedArg = Arg->stripPointerCasts();
+ auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
+ if (StrippedArg == StrippedInvariantGroupsArg)
+ return nullptr; // No launders/strips to remove.
+
+ Value *Result = nullptr;
+
+ if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
+ Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
+ else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
+ Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
+ else
+ llvm_unreachable(
+ "simplifyInvariantGroupIntrinsic only handles launder and strip");
+ if (Result->getType()->getPointerAddressSpace() !=
+ II.getType()->getPointerAddressSpace())
+ Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
+ if (Result->getType() != II.getType())
+ Result = IC.Builder.CreateBitCast(Result, II.getType());
+
+ return cast<Instruction>(Result);
+}
+
static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
// If the mask is all zeros, a scatter does nothing.
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
@@ -1498,6 +1490,68 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
return maxnum(Src0, Src1);
}
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
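+/// For example (illustrative): tbl1(%v, <7,6,5,4,3,2,1,0>) becomes
+///   shufflevector %v, zeroinitializer, <7,6,5,4,3,2,1,0>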
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ uint32_t Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if (Indexes[I] >= NumElts)
+ return nullptr;
+ }
+
+ auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
+ makeArrayRef(Indexes));
+ auto *V1 = II.getArgOperand(0);
+ auto *V2 = Constant::getNullValue(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
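+/// For example (illustrative), vld1.v4i32(%p, i32 8) can become a plain
+/// 'load <4 x i32>, align 8' through a bitcast of %p.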
+static Value *simplifyNeonVld1(const IntrinsicInst &II,
+ unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
+ MemAlign : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(BCastInst, Alignment);
+}
+
// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1820,7 +1874,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Intrinsics cannot occur in an invoke, so handle them here instead of in
// visitCallSite.
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
bool Changed = false;
// memmove/cpy/set of zero bytes is a noop.
@@ -1837,17 +1891,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
// No other transformations apply to volatile transfers.
- if (MI->isVolatile())
- return nullptr;
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.
- if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getModule();
- Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
Type *Tys[3] = { CI.getArgOperand(0)->getType(),
CI.getArgOperand(1)->getType(),
CI.getArgOperand(2)->getType() };
@@ -1856,7 +1914,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())
return eraseInstFromFunction(CI);
@@ -1864,26 +1922,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// If we can determine a pointer alignment that is bigger than currently
// set, update the alignment.
- if (isa<MemTransferInst>(MI)) {
- if (Instruction *I = SimplifyMemTransfer(MI))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
- if (Instruction *I = SimplifyMemSet(MSI))
+ } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemSet(MSI))
return I;
}
if (Changed) return II;
}
- if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
- if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
- if (C->isNullValue())
- return eraseInstFromFunction(*AMI);
-
- if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
- return I;
- }
-
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
return I;
@@ -1925,7 +1974,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return simplifyMaskedGather(*II, *this);
case Intrinsic::masked_scatter:
return simplifyMaskedScatter(*II, *this);
-
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
+ return replaceInstUsesWith(*II, SkippedBarrier);
+ break;
case Intrinsic::powi:
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
// 0 and 1 are handled in instsimplify
@@ -1991,8 +2044,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->setArgOperand(1, Arg0);
return II;
}
+
+ // FIXME: Simplifications should be in instsimplify.
if (Value *V = simplifyMinnumMaxnum(*II))
return replaceInstUsesWith(*II, V);
+
+ Value *X, *Y;
+ if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
+ (Arg0->hasOneUse() || Arg1->hasOneUse())) {
+ // If both operands are negated, invert the call and negate the result:
+ // minnum(-X, -Y) --> -(maxnum(X, Y))
+ // maxnum(-X, -Y) --> -(minnum(X, Y))
+ Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ?
+ Intrinsic::minnum : Intrinsic::maxnum;
+ Value *NewCall = Builder.CreateIntrinsic(NewIID, { X, Y }, II);
+ Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
+ FNeg->copyIRFlags(II);
+ return FNeg;
+ }
break;
}
case Intrinsic::fmuladd: {
@@ -2013,37 +2082,34 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
- // Canonicalize constants into the RHS.
+ // Canonicalize constant multiply operand to Src1.
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
II->setArgOperand(0, Src1);
II->setArgOperand(1, Src0);
std::swap(Src0, Src1);
}
- Value *LHS = nullptr;
- Value *RHS = nullptr;
-
// fma fneg(x), fneg(y), z -> fma x, y, z
- if (match(Src0, m_FNeg(m_Value(LHS))) &&
- match(Src1, m_FNeg(m_Value(RHS)))) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ Value *X, *Y;
+ if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, Y);
return II;
}
// fma fabs(x), fabs(x), z -> fma x, x, z
- if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
- match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ if (match(Src0, m_FAbs(m_Value(X))) &&
+ match(Src1, m_FAbs(m_Specific(X)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, X);
return II;
}
// fma x, 1, z -> fadd x, z
if (match(Src1, m_FPOne())) {
- Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
- RI->copyFastMathFlags(II);
- return RI;
+ auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
}
break;
@@ -2067,17 +2133,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::rint:
case Intrinsic::trunc: {
Value *ExtSrc;
- if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
- II->getArgOperand(0)->hasOneUse()) {
- // fabs (fpext x) -> fpext (fabs x)
- Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
- { ExtSrc->getType() });
- CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
- NewFabs->copyFastMathFlags(II);
- NewFabs->takeName(II);
- return new FPExtInst(NewFabs, II->getType());
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
+ // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
+ Value *NarrowII = Builder.CreateIntrinsic(II->getIntrinsicID(),
+ { ExtSrc }, II);
+ return new FPExtInst(NarrowII, II->getType());
}
-
break;
}
case Intrinsic::cos:
@@ -2085,7 +2146,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *SrcSrc;
Value *Src = II->getArgOperand(0);
if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
- match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
+ match(Src, m_FAbs(m_Value(SrcSrc)))) {
// cos(-x) -> cos(x)
// cos(fabs(x)) -> cos(x)
II->setArgOperand(0, SrcSrc);
@@ -2298,6 +2359,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_sse41_round_ps:
+ case Intrinsic::x86_sse41_round_pd:
+ case Intrinsic::x86_avx_round_ps_256:
+ case Intrinsic::x86_avx_round_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_128:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_512:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_128:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_512:
+ case Intrinsic::x86_avx512_mask_rndscale_ss:
+ case Intrinsic::x86_avx512_mask_rndscale_sd:
+ if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
@@ -2355,16 +2432,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
break;
}
- case Intrinsic::x86_avx512_mask_cmp_pd_128:
- case Intrinsic::x86_avx512_mask_cmp_pd_256:
- case Intrinsic::x86_avx512_mask_cmp_pd_512:
- case Intrinsic::x86_avx512_mask_cmp_ps_128:
- case Intrinsic::x86_avx512_mask_cmp_ps_256:
- case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ case Intrinsic::x86_avx512_cmp_pd_128:
+ case Intrinsic::x86_avx512_cmp_pd_256:
+ case Intrinsic::x86_avx512_cmp_pd_512:
+ case Intrinsic::x86_avx512_cmp_ps_128:
+ case Intrinsic::x86_avx512_cmp_ps_256:
+ case Intrinsic::x86_avx512_cmp_ps_512: {
// Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- bool Arg0IsZero = match(Arg0, m_Zero());
+ bool Arg0IsZero = match(Arg0, m_PosZeroFP());
if (Arg0IsZero)
std::swap(Arg0, Arg1);
Value *A, *B;
@@ -2376,7 +2453,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// The compare intrinsic uses the above assumptions and therefore
// doesn't require additional flags.
if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
- match(Arg1, m_Zero()) && isa<Instruction>(Arg0) &&
+ match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
if (Arg0IsZero)
std::swap(A, B);
@@ -2387,17 +2464,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
// IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
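+ // e.g. (illustrative): @llvm.x86.avx512.add.ps.512(%a, %b, i32 4)
+ //      --> fadd <16 x float> %a, %b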
if (R->getValue() == 4) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
@@ -2405,27 +2482,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *V;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
V = Builder.CreateFAdd(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
V = Builder.CreateFSub(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
V = Builder.CreateFMul(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
V = Builder.CreateFDiv(Arg0, Arg1);
break;
}
- // Create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
return replaceInstUsesWith(*II, V);
}
}
@@ -2499,32 +2573,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
- case Intrinsic::x86_avx512_mask_vfmadd_ss:
- case Intrinsic::x86_avx512_mask_vfmadd_sd:
- case Intrinsic::x86_avx512_maskz_vfmadd_ss:
- case Intrinsic::x86_avx512_maskz_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmadd_ss:
- case Intrinsic::x86_avx512_mask3_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfmsub_sd:
- case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd:
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd: {
unsigned VWidth = II->getType()->getVectorNumElements();
@@ -2537,6 +2591,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
}
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ } else if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
// Constant fold ashr( <A x Bi>, Ci ).
// Constant fold lshr( <A x Bi>, Ci ).
@@ -2647,26 +2714,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- case Intrinsic::x86_avx512_pmul_dq_512:
- case Intrinsic::x86_avx512_pmulu_dq_512: {
- if (Value *V = simplifyX86muldq(*II, Builder))
- return replaceInstUsesWith(*II, V);
-
- unsigned VWidth = II->getType()->getVectorNumElements();
- APInt UndefElts(VWidth, 0);
- APInt DemandedElts = APInt::getAllOnesValue(VWidth);
- if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
- if (V != II)
- return replaceInstUsesWith(*II, V);
- return II;
- }
- break;
- }
-
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_avx2_packssdw:
@@ -2687,7 +2734,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_pclmulqdq: {
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
unsigned Imm = C->getZExtValue();
@@ -2695,27 +2744,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
unsigned VWidth = Arg0->getType()->getVectorNumElements();
- APInt DemandedElts(VWidth, 0);
APInt UndefElts1(VWidth, 0);
- DemandedElts = (Imm & 0x01) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ APInt DemandedElts1 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x01) ? 2 : 1));
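+ // Bit 0 of Imm picks the odd (0b10) or even (0b01) element of each
+ // 64-bit pair in Arg0; bit 4 below makes the same choice for Arg1.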
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
UndefElts1)) {
II->setArgOperand(0, V);
MadeChange = true;
}
APInt UndefElts2(VWidth, 0);
- DemandedElts = (Imm & 0x10) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ APInt DemandedElts2 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
UndefElts2)) {
II->setArgOperand(1, V);
MadeChange = true;
}
- // If both input elements are undef, the result is undef.
- if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
- UndefElts2[(Imm & 0x10) ? 1 : 0])
+ // If all demanded elements of either input are undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2))
return replaceInstUsesWith(*II,
ConstantAggregateZero::get(II->getType()));
@@ -2916,32 +2966,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
if (Value *V = simplifyX86vpermv(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_avx512_mask_permvar_df_256:
- case Intrinsic::x86_avx512_mask_permvar_df_512:
- case Intrinsic::x86_avx512_mask_permvar_di_256:
- case Intrinsic::x86_avx512_mask_permvar_di_512:
- case Intrinsic::x86_avx512_mask_permvar_hi_128:
- case Intrinsic::x86_avx512_mask_permvar_hi_256:
- case Intrinsic::x86_avx512_mask_permvar_hi_512:
- case Intrinsic::x86_avx512_mask_permvar_qi_128:
- case Intrinsic::x86_avx512_mask_permvar_qi_256:
- case Intrinsic::x86_avx512_mask_permvar_qi_512:
- case Intrinsic::x86_avx512_mask_permvar_sf_256:
- case Intrinsic::x86_avx512_mask_permvar_sf_512:
- case Intrinsic::x86_avx512_mask_permvar_si_256:
- case Intrinsic::x86_avx512_mask_permvar_si_512:
- if (Value *V = simplifyX86vpermv(*II, Builder)) {
- // We simplified the permuting, now create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
- return replaceInstUsesWith(*II, V);
- }
- break;
-
case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
@@ -3042,7 +3082,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
- case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1: {
+ unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
+ DL, II, &AC, &DT);
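+ // With the alignment known, simplifyNeonVld1 can replace the intrinsic
+ // with an ordinary aligned vector load.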
+ if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
@@ -3069,6 +3116,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_vtbl1:
+ case Intrinsic::aarch64_neon_tbl1:
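+ // A tbl1 with a constant mask can be rewritten as a shufflevector.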
+ if (Value *V = simplifyNeonTbl1(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu:
case Intrinsic::aarch64_neon_smull:
@@ -3107,6 +3160,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_aesd:
+ case Intrinsic::arm_neon_aese:
+ case Intrinsic::aarch64_crypto_aesd:
+ case Intrinsic::aarch64_crypto_aese: {
+ Value *DataArg = II->getArgOperand(0);
+ Value *KeyArg = II->getArgOperand(1);
+
+ // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
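+ // e.g. aese(data ^ key, zero) --> aese(data, key)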
+ Value *Data, *Key;
+ if (match(KeyArg, m_ZeroInt()) &&
+ match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+ II->setArgOperand(0, Data);
+ II->setArgOperand(1, Key);
+ return II;
+ }
+ break;
+ }
case Intrinsic::amdgcn_rcp: {
Value *Src = II->getArgOperand(0);
@@ -3264,6 +3334,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
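+ // Both sources pack into one result, so only fold to undef when both
+ // of them are undef.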
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
@@ -3370,6 +3452,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src1 = II->getArgOperand(1);
Value *Src2 = II->getArgOperand(2);
+ // Checking for NaN before canonicalization provides better fidelity when
+ // mapping other operations onto fmed3 since the order of operands is
+ // unchanged.
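+ // e.g. fmed3(NaN, x, y) --> fminnum(x, y)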
+ CallInst *NewCall = nullptr;
+ if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
+ NewCall = Builder.CreateMinNum(Src1, Src2);
+ } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
+ NewCall = Builder.CreateMinNum(Src0, Src2);
+ } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ NewCall = Builder.CreateMaxNum(Src0, Src1);
+ }
+
+ if (NewCall) {
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
bool Swap = false;
// Canonicalize constants to RHS operands.
//
@@ -3396,13 +3496,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
}
- if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
- CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
- NewCall->copyFastMathFlags(II);
- NewCall->takeName(II);
- return replaceInstUsesWith(*II, NewCall);
- }
-
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
@@ -3536,13 +3629,32 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// amdgcn.kill(i1 1) is a no-op
return eraseInstFromFunction(CI);
}
+ case Intrinsic::amdgcn_update_dpp: {
+ Value *Old = II->getArgOperand(0);
+
+ auto BC = dyn_cast<ConstantInt>(II->getArgOperand(5));
+ auto RM = dyn_cast<ConstantInt>(II->getArgOperand(3));
+ auto BM = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!BC || !RM || !BM ||
+ BC->isZeroValue() ||
+ RM->getZExtValue() != 0xF ||
+ BM->getZExtValue() != 0xF ||
+ isa<UndefValue>(Old))
+ break;
+
+ // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
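+ // With bound_ctrl unset or any row/bank disabled, some lanes would
+ // preserve Old, so it is only dead under exactly these conditions.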
+ II->setOperand(0, UndefValue::get(Old->getType()));
+ return II;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- if (&*++SS->getIterator() == II)
+ // Skip over debug info.
+ if (SS->getNextNonDebugInstruction() == II) {
return eraseInstFromFunction(CI);
+ }
}
}
@@ -3597,9 +3709,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::assume: {
Value *IIOperand = II->getArgOperand(0);
- // Remove an assume if it is immediately followed by an identical assume.
- if (match(II->getNextNode(),
- m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ // Remove an assume if it is followed by an identical assume.
+ // TODO: Do we need this? Unless there are conflicting assumptions, the
+ // computeKnownBits(IIOperand) call below eliminates redundant assumes.
+ Instruction *Next = II->getNextNonDebugInstruction();
+ if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
return eraseInstFromFunction(CI);
// Canonicalize assume(a && b) -> assume(a); assume(b);
@@ -3686,8 +3800,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::experimental_guard: {
- // Is this guard followed by another guard?
+ // Is this guard followed by another guard? We scan forward over a small
+ // fixed window of instructions to handle common cases with conditions
+ // computed between guards.
Instruction *NextInst = II->getNextNode();
+ for (unsigned i = 0; i < GuardWideningWindow; i++) {
+ // Note: Using context-free form to avoid compile time blow up
+ if (!isSafeToSpeculativelyExecute(NextInst))
+ break;
+ NextInst = NextInst->getNextNode();
+ }
Value *NextCond = nullptr;
if (match(NextInst,
m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
@@ -3698,6 +3820,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(*NextInst);
// Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
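+ // NextCond may be defined by the speculatable instructions we skipped
+ // over, so hoist them above the first guard before merging conditions.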
+ Instruction *MoveI = II->getNextNode();
+ while (MoveI != NextInst) {
+ auto *Temp = MoveI;
+ MoveI = MoveI->getNextNode();
+ Temp->moveBefore(II);
+ }
II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
return eraseInstFromFunction(*NextInst);
}
@@ -3710,7 +3838,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Fence instruction simplification
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
// Remove identical consecutive fences.
- if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ Instruction *Next = FI.getNextNonDebugInstruction();
+ if (auto *NFI = dyn_cast<FenceInst>(Next))
if (FI.isIdenticalTo(NFI))
return eraseInstFromFunction(FI);
return nullptr;
@@ -3887,8 +4016,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// Remove the convergent attr on calls when the callee is not convergent.
if (CS.isConvergent() && !CalleeF->isConvergent() &&
!CalleeF->isIntrinsic()) {
- DEBUG(dbgs() << "Removing convergent attr from instr "
- << CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from instr "
+ << CS.getInstruction() << "\n");
CS.setNotConvergent();
return CS.getInstruction();
}
@@ -3919,7 +4048,9 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
}
}
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CS.getInstruction()->getFunction())) ||
+ isa<UndefValue>(Callee)) {
// If CS does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust themselves.
if (!CS.getInstruction()->getType()->isVoidTy())
@@ -3986,10 +4117,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!Callee)
return false;
- // The prototype of a thunk is a lie. Don't directly call such a function.
+ // If this is a call to a thunk function, don't remove the cast. Thunks are
+ // used to transparently forward all incoming parameters and outgoing return
+ // values, so it's important to leave the cast in place.
if (Callee->hasFnAttribute("thunk"))
return false;
+ // If this is a musttail call, the callee's prototype must match the caller's
+ // prototype with the exception of pointee types. The code below doesn't
+ // implement that, so we can't do this transform.
+ // TODO: Do the transform if it only requires adding pointer casts.
+ if (CS.isMustTailCall())
+ return false;
+
Instruction *Caller = CS.getInstruction();
const AttributeList &CallerPAL = CS.getAttributes();
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 178c8eaf2502..e8ea7396a96a 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -256,7 +257,7 @@ Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
return Instruction::CastOps(Res);
}
-/// @brief Implement the transforms common to all CastInst visitors.
+/// Implement the transforms common to all CastInst visitors.
Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
@@ -265,14 +266,27 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
// The first cast (CSrc) is eliminable so we need to fix up or replace
// the second cast (CI). CSrc will then have a good chance of being dead.
- return CastInst::Create(NewOpc, CSrc->getOperand(0), CI.getType());
+ auto *Ty = CI.getType();
+ auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
+ // Point debug users of the dying cast to the new one.
+ if (CSrc->hasOneUse())
+ replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
+ return Res;
}
}
- // If we are casting a select, then fold the cast into the select.
- if (auto *SI = dyn_cast<SelectInst>(Src))
- if (Instruction *NV = FoldOpIntoSelect(CI, SI))
- return NV;
+ if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+ // We are casting a select. Try to fold the cast into the select, but only
+ // if the select does not have a compare instruction with matching operand
+ // types. Creating a select with operands of a different size than its
+ // condition may inhibit other folds and lead to worse codegen.
+ auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
+ if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType())
+ if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
+ replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
+ return NV;
+ }
+ }
// If we are casting a PHI, then fold the cast into the PHI.
if (auto *PN = dyn_cast<PHINode>(Src)) {
@@ -287,6 +301,33 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
return nullptr;
}
+/// Constants and extensions/truncates from the destination type are always
+/// free to be evaluated in that type. This is a helper for canEvaluate*.
+static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
+ if (isa<Constant>(V))
+ return true;
+ Value *X;
+ if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
+ X->getType() == Ty)
+ return true;
+
+ return false;
+}
+
+/// Filter out values that we cannot evaluate in the destination type for free.
+/// This is a helper for canEvaluate*.
+static bool canNotEvaluateInType(Value *V, Type *Ty) {
+ assert(!isa<Constant>(V) && "Constant should already be handled.");
+ if (!isa<Instruction>(V))
+ return true;
+ // We don't extend or shrink something that has multiple uses -- doing so
+ // would require duplicating the instruction, which isn't profitable.
+ if (!V->hasOneUse())
+ return true;
+
+ return false;
+}
+
/// Return true if we can evaluate the specified expression tree as type Ty
/// instead of its larger type, and arrive with the same value.
/// This is used by code that tries to eliminate truncates.
@@ -300,27 +341,14 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
///
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
Instruction *CxtI) {
- // We can always evaluate constants in another type.
- if (isa<Constant>(V))
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
+ auto *I = cast<Instruction>(V);
Type *OrigTy = V->getType();
-
- // If this is an extension from the dest type, we can eliminate it, even if it
- // has multiple uses.
- if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
- I->getOperand(0)->getType() == Ty)
- return true;
-
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- unsigned Opc = I->getOpcode();
- switch (Opc) {
+ switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -336,13 +364,12 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
// UDiv and URem can be truncated if all the truncated bits are zero.
uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (BitWidth < OrigBitWidth) {
- APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
- if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
- IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
+ assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+ IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
}
break;
}
@@ -365,9 +392,9 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
if (match(I->getOperand(1), m_APInt(Amt))) {
uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (IC.MaskedValueIsZero(I->getOperand(0),
- APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) &&
- Amt->getLimitedValue(BitWidth) < BitWidth) {
+ if (Amt->getLimitedValue(BitWidth) < BitWidth &&
+ IC.MaskedValueIsZero(I->getOperand(0),
+ APInt::getBitsSetFrom(OrigBitWidth, BitWidth), 0, CxtI)) {
return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
}
}
@@ -644,20 +671,6 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
- // Test if the trunc is the user of a select which is part of a
- // minimum or maximum operation. If so, don't do any more simplification.
- // Even simplifying demanded bits can break the canonical form of a
- // min/max.
- Value *LHS, *RHS;
- if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
- if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
- return nullptr;
-
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *DestTy = CI.getType(), *SrcTy = Src->getType();
@@ -670,13 +683,29 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// If this cast is a truncate, evaluating in a different type always
// eliminates the cast, so it is always a win.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid cast: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid cast: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
return replaceInstUsesWith(CI, Res);
}
+ // Test if the trunc is the user of a select which is part of a
+ // minimum or maximum operation. If so, don't do any more simplification.
+ // Even simplifying demanded bits can break the canonical form of a
+ // min/max.
+ Value *LHS, *RHS;
+ if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
+ if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ // See if we can simplify any instructions used by the input whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(CI))
+ return &CI;
+
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);
@@ -916,23 +945,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
InstCombiner &IC, Instruction *CxtI) {
BitsToClear = 0;
- if (isa<Constant>(V))
- return true;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // If the input is a truncate from the destination type, we can trivially
- // eliminate it.
- if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- unsigned Opc = I->getOpcode(), Tmp;
- switch (Opc) {
+ auto *I = cast<Instruction>(V);
+ unsigned Tmp;
+ switch (I->getOpcode()) {
case Instruction::ZExt: // zext(zext(x)) -> zext(x).
case Instruction::SExt: // zext(sext(x)) -> sext(x).
case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
@@ -961,7 +981,7 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
0, CxtI)) {
// If this is an And instruction and all of the BitsToClear are
// known to be zero we can reset BitsToClear.
- if (Opc == Instruction::And)
+ if (I->getOpcode() == Instruction::And)
BitsToClear = 0;
return true;
}
@@ -1052,11 +1072,18 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
"Can't clear more bits than in SrcTy");
// Okay, we can transform this! Insert the new expression now.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid zero extend: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid zero extend: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
+ // Preserve debug values referring to Src if the zext is its last use.
+ if (auto *SrcOp = dyn_cast<Instruction>(Src))
+ if (SrcOp->hasOneUse())
+ replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
+
uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
uint32_t DestBitSize = DestTy->getScalarSizeInBits();
@@ -1168,22 +1195,19 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
if (!Op1->getType()->isIntOrIntVectorTy())
return nullptr;
- if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
// (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
// (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
- if ((Pred == ICmpInst::ICMP_SLT && Op1C->isNullValue()) ||
- (Pred == ICmpInst::ICMP_SGT && Op1C->isAllOnesValue())) {
+ Value *Sh = ConstantInt::get(Op0->getType(),
+ Op0->getType()->getScalarSizeInBits() - 1);
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
- Value *Sh = ConstantInt::get(Op0->getType(),
- Op0->getType()->getScalarSizeInBits()-1);
- Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
- if (In->getType() != CI.getType())
- In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
-
- if (Pred == ICmpInst::ICMP_SGT)
- In = Builder.CreateNot(In, In->getName() + ".not");
- return replaceInstUsesWith(CI, In);
- }
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = Builder.CreateNot(In, In->getName() + ".not");
+ return replaceInstUsesWith(CI, In);
}
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
@@ -1254,21 +1278,12 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
static bool canEvaluateSExtd(Value *V, Type *Ty) {
assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
"Can't sign extend type to a smaller type");
- // If this is a constant, it can be trivially promoted.
- if (isa<Constant>(V))
+ if (canAlwaysEvaluateInType(V, Ty))
return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // If this is a truncate from the dest type, we can trivially eliminate it.
- if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
- return true;
-
- // We can't extend or shrink something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
+ auto *I = cast<Instruction>(V);
switch (I->getOpcode()) {
case Instruction::SExt: // sext(sext(x)) -> sext(x)
case Instruction::ZExt: // sext(zext(x)) -> zext(x)
@@ -1335,8 +1350,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateSExtd(Src, DestTy)) {
// Okay, we can transform this! Insert the new expression now.
- DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid sign extend: " << CI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid sign extend: "
+ << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, true);
assert(Res->getType() == DestTy);
@@ -1401,45 +1418,83 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
/// Return true if the specified floating-point constant fits in the
/// specified FP type without changing its value.
-static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
bool losesInfo;
APFloat F = CFP->getValueAPF();
(void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
- if (!losesInfo)
- return ConstantFP::get(CFP->getContext(), F);
+ return !losesInfo;
+}
+
+static Type *shrinkFPConstant(ConstantFP *CFP) {
+ if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
+ return nullptr; // No constant folding of this.
+ // See if the value can be truncated to half and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+ return Type::getHalfTy(CFP->getContext());
+ // See if the value can be truncated to float and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEsingle()))
+ return Type::getFloatTy(CFP->getContext());
+ if (CFP->getType()->isDoubleTy())
+ return nullptr; // Won't shrink.
+ if (fitsInFPType(CFP, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(CFP->getContext());
+ // Don't try to shrink to various long double types.
return nullptr;
}
-/// Look through floating-point extensions until we get the source value.
-static Value *lookThroughFPExtensions(Value *V) {
- while (auto *FPExt = dyn_cast<FPExtInst>(V))
- V = FPExt->getOperand(0);
+// Determine if this is a vector of ConstantFPs and if so, return the minimal
+// type we can safely truncate all elements to.
+// TODO: Make these support undef elements.
+static Type *shrinkFPConstantVector(Value *V) {
+ auto *CV = dyn_cast<Constant>(V);
+ if (!CV || !CV->getType()->isVectorTy())
+ return nullptr;
+
+ Type *MinType = nullptr;
+
+ unsigned NumElts = CV->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
+ if (!CFP)
+ return nullptr;
+
+ Type *T = shrinkFPConstant(CFP);
+ if (!T)
+ return nullptr;
+
+ // If we haven't found a type yet or this type has a larger mantissa than
+ // our previous type, this is our new minimal type.
+ if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
+ MinType = T;
+ }
+
+ // Make a vector type from the minimal type.
+ return VectorType::get(MinType, NumElts);
+}
+
+/// Find the minimum FP type we can safely truncate to.
+static Type *getMinimumFPType(Value *V) {
+ if (auto *FPExt = dyn_cast<FPExtInst>(V))
+ return FPExt->getOperand(0)->getType();
// If this value is a constant, return the smallest FP type that can
// accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
- if (auto *CFP = dyn_cast<ConstantFP>(V)) {
- if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
- return V; // No constant folding of this.
- // See if the value can be truncated to half and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf()))
- return V;
- // See if the value can be truncated to float and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle()))
- return V;
- if (CFP->getType()->isDoubleTy())
- return V; // Won't shrink.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble()))
- return V;
- // Don't try to shrink to various long double types.
- }
-
- return V;
+ if (auto *CFP = dyn_cast<ConstantFP>(V))
+ if (Type *T = shrinkFPConstant(CFP))
+ return T;
+
+ // Try to shrink a vector of FP constants.
+ if (Type *T = shrinkFPConstantVector(V))
+ return T;
+
+ return V->getType();
}
-Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
- if (Instruction *I = commonCastTransforms(CI))
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
+ if (Instruction *I = commonCastTransforms(FPT))
return I;
+
// If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
// simplify this expression to avoid one or more of the trunc/extend
// operations if we can do so without changing the numerical results.
@@ -1447,15 +1502,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// The exact manner in which the widths of the operands interact to limit
// what we can and cannot do safely varies from operation to operation, and
// is explained below in the various case statements.
- BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+ Type *Ty = FPT.getType();
+ BinaryOperator *OpI = dyn_cast<BinaryOperator>(FPT.getOperand(0));
if (OpI && OpI->hasOneUse()) {
- Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0));
- Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1));
+ Type *LHSMinType = getMinimumFPType(OpI->getOperand(0));
+ Type *RHSMinType = getMinimumFPType(OpI->getOperand(1));
unsigned OpWidth = OpI->getType()->getFPMantissaWidth();
- unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth();
- unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth();
+ unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
+ unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
- unsigned DstWidth = CI.getType()->getFPMantissaWidth();
+ unsigned DstWidth = Ty->getFPMantissaWidth();
switch (OpI->getOpcode()) {
default: break;
case Instruction::FAdd:
@@ -1479,12 +1535,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// could be tightened for those cases, but they are rare (the main
// case of interest here is (float)((double)float + float)).
if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig);
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ Instruction *RI = BinaryOperator::Create(OpI->getOpcode(), LHS, RHS);
RI->copyFastMathFlags(OpI);
return RI;
}
@@ -1496,14 +1549,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// rounding can possibly occur; we can safely perform the operation
// in the destination format if it can represent both sources.
if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::CreateFMul(LHSOrig, RHSOrig);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFMulFMF(LHS, RHS, OpI);
}
break;
case Instruction::FDiv:
@@ -1514,72 +1562,48 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// condition used here is a good conservative first pass.
// TODO: Tighten bound via rigorous analysis of the unbalanced case.
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
- if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
- if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
- Instruction *RI =
- BinaryOperator::CreateFDiv(LHSOrig, RHSOrig);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFDivFMF(LHS, RHS, OpI);
}
break;
- case Instruction::FRem:
+ case Instruction::FRem: {
// Remainder is straightforward: it is always exact, so the type of OpI
// doesn't enter into things at all. We simply evaluate
// in whichever source type is larger, then convert to the
// destination type.
if (SrcWidth == OpWidth)
break;
- if (LHSWidth < SrcWidth)
- LHSOrig = Builder.CreateFPExt(LHSOrig, RHSOrig->getType());
- else if (RHSWidth <= SrcWidth)
- RHSOrig = Builder.CreateFPExt(RHSOrig, LHSOrig->getType());
- if (LHSOrig != OpI->getOperand(0) || RHSOrig != OpI->getOperand(1)) {
- Value *ExactResult = Builder.CreateFRem(LHSOrig, RHSOrig);
- if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
- RI->copyFastMathFlags(OpI);
- return CastInst::CreateFPCast(ExactResult, CI.getType());
+ Value *LHS, *RHS;
+ if (LHSWidth == SrcWidth) {
+ LHS = Builder.CreateFPTrunc(OpI->getOperand(0), LHSMinType);
+ RHS = Builder.CreateFPTrunc(OpI->getOperand(1), LHSMinType);
+ } else {
+ LHS = Builder.CreateFPTrunc(OpI->getOperand(0), RHSMinType);
+ RHS = Builder.CreateFPTrunc(OpI->getOperand(1), RHSMinType);
}
+
+ Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, OpI);
+ return CastInst::CreateFPCast(ExactResult, Ty);
+ }
}
// (fptrunc (fneg x)) -> (fneg (fptrunc x))
if (BinaryOperator::isFNeg(OpI)) {
- Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1),
- CI.getType());
- Instruction *RI = BinaryOperator::CreateFNeg(InnerTrunc);
- RI->copyFastMathFlags(OpI);
- return RI;
+ Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+ return BinaryOperator::CreateFNegFMF(InnerTrunc, OpI);
}
}
- // (fptrunc (select cond, R1, Cst)) -->
- // (select cond, (fptrunc R1), (fptrunc Cst))
- //
- // - but only if this isn't part of a min/max operation, else we'll
- // ruin min/max canonical form which is to have the select and
- // compare's operands be of the same type with no casts to look through.
- Value *LHS, *RHS;
- SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
- if (SI &&
- (isa<ConstantFP>(SI->getOperand(1)) ||
- isa<ConstantFP>(SI->getOperand(2))) &&
- matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) {
- Value *LHSTrunc = Builder.CreateFPTrunc(SI->getOperand(1), CI.getType());
- Value *RHSTrunc = Builder.CreateFPTrunc(SI->getOperand(2), CI.getType());
- return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
- }
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
- if (II) {
+ if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
switch (II->getIntrinsicID()) {
default: break;
- case Intrinsic::fabs:
case Intrinsic::ceil:
+ case Intrinsic::fabs:
case Intrinsic::floor:
+ case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::round:
- case Intrinsic::nearbyint:
case Intrinsic::trunc: {
Value *Src = II->getArgOperand(0);
if (!Src->hasOneUse())
@@ -1590,30 +1614,26 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// truncating.
if (II->getIntrinsicID() != Intrinsic::fabs) {
FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
- if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+ if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
break;
}
// Do unary FP operation on smaller type.
// (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder.CreateFPTrunc(Src, CI.getType());
- Type *IntrinsicType[] = { CI.getType() };
- Function *Overload = Intrinsic::getDeclaration(
- CI.getModule(), II->getIntrinsicID(), IntrinsicType);
-
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
+ Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
+ II->getIntrinsicID(), Ty);
SmallVector<OperandBundleDef, 1> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
-
- Value *Args[] = { InnerTrunc };
- CallInst *NewCI = CallInst::Create(Overload, Args,
- OpBundles, II->getName());
+ CallInst *NewCI = CallInst::Create(Overload, { InnerTrunc }, OpBundles,
+ II->getName());
NewCI->copyFastMathFlags(II);
return NewCI;
}
}
}
- if (Instruction *I = shrinkInsertElt(CI, Builder))
+ if (Instruction *I = shrinkInsertElt(FPT, Builder))
return I;
return nullptr;
@@ -1718,7 +1738,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
return nullptr;
}
-/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
@@ -1751,7 +1771,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
Type *Ty = CI.getType();
unsigned AS = CI.getPointerAddressSpace();
- if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS))
+ if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS))
return commonPointerCastTransforms(CI);
Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
@@ -2004,13 +2024,13 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
!match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
!BO->isBitwiseLogicOp())
return nullptr;
-
+
// FIXME: This transform is restricted to vector types to avoid backend
// problems caused by creating potentially illegal operations. If a fix-up is
// added to handle that situation, we can remove this check.
if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
return nullptr;
-
+
Value *X;
if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
X->getType() == DestTy && !isa<Constant>(X)) {
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3bc7fae77cb1..6de92a4842ab 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -682,7 +682,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
// 4. Emit GEPs to get the original pointers.
// 5. Remove the original instructions.
Type *IndexType = IntegerType::get(
- Base->getContext(), DL.getPointerTypeSizeInBits(Start->getType()));
+ Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
DenseMap<Value *, Value *> NewInsts;
NewInsts[Base] = ConstantInt::getNullValue(IndexType);
@@ -723,7 +723,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
}
auto *Op = NewInsts[GEP->getOperand(0)];
- if (isa<ConstantInt>(Op) && dyn_cast<ConstantInt>(Op)->isZero())
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
NewInsts[GEP] = Index;
else
NewInsts[GEP] = Builder.CreateNSWAdd(
@@ -790,7 +790,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
static std::pair<Value *, Value *>
getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
Type *IndexType = IntegerType::get(V->getContext(),
- DL.getPointerTypeSizeInBits(V->getType()));
+ DL.getIndexTypeSizeInBits(V->getType()));
Constant *Index = ConstantInt::getNullValue(IndexType);
while (true) {
@@ -1893,11 +1893,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
- // This is the same code as the SGT case, but assert the pre-condition
- // that is needed for this to work with equality predicates.
- assert(C.ashr(*ShiftAmt).shl(*ShiftAmt) == C &&
- "Compare known true or false was not folded");
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
@@ -1926,11 +1923,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
- // This is the same code as the UGT case, but assert the pre-condition
- // that is needed for this to work with equality predicates.
- assert(C.lshr(*ShiftAmt).shl(*ShiftAmt) == C &&
- "Compare known true or false was not folded");
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
@@ -2463,6 +2457,45 @@ Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
return nullptr;
}
+Instruction *InstCombiner::foldICmpBitCastConstant(ICmpInst &Cmp,
+ BitCastInst *Bitcast,
+ const APInt &C) {
+ // Folding: icmp <pred> iN X, C
+ // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC) to iN
+ // and C is a splat of a K-bit pattern
+ // and SC is a constant vector = <C', C', C', ..., C'>
+ // Into:
+ // %E = extractelement <M x iK> %vec, i32 C'
+ // icmp <pred> iK %E, trunc(C)
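+ // e.g. icmp eq i32 (bitcast (shufflevector <4 x i8> %v, undef,
+ //                            zeroinitializer) to i32), 0x01010101
+ //      --> icmp eq i8 (extractelement <4 x i8> %v, i32 0), 1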
+ if (!Bitcast->getType()->isIntegerTy() ||
+ !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *BCIOp = Bitcast->getOperand(0);
+ Value *Vec = nullptr; // 1st vector arg of the shufflevector
+ Constant *Mask = nullptr; // Mask arg of the shufflevector
+ if (match(BCIOp,
+ m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask)))) {
+ // Check whether every element of Mask is the same constant
+ if (auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue())) {
+ auto *VecTy = cast<VectorType>(BCIOp->getType());
+ auto *EltTy = cast<IntegerType>(VecTy->getElementType());
+ auto Pred = Cmp.getPredicate();
+ if (C.isSplat(EltTy->getBitWidth())) {
+ // Fold the icmp based on the value of C.
+ // If C is M copies of an iK-sized bit pattern, then:
+ //   %E = extractelement <M x iK> %vec, i32 Elem
+ //   icmp <pred> iK %E, trunc(C)
+ Value *Extract = Builder.CreateExtractElement(Vec, Elem);
+ Value *NewC = ConstantInt::get(EltTy, C.trunc(EltTy->getBitWidth()));
+ return new ICmpInst(Pred, Extract, NewC);
+ }
+ }
+ }
+ return nullptr;
+}
+
/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
/// where X is some kind of instruction.
Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
@@ -2537,6 +2570,11 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
return I;
}
+ if (auto *BCI = dyn_cast<BitCastInst>(Cmp.getOperand(0))) {
+ if (Instruction *I = foldICmpBitCastConstant(Cmp, BCI, *C))
+ return I;
+ }
+
if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, *C))
return I;
@@ -2828,6 +2866,160 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
return nullptr;
}
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy truncation.
+/// Folds:
+/// x & (-1 >> y) SrcPred x to x DstPred (-1 >> y)
+/// The Mask can be a constant, too.
+/// For some predicates, the operands are commutative.
+/// For others, x can only be on a specific side.
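+/// For example, (x & 15) == x becomes x u<= 15.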
+static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X, *M;
+ auto m_Mask = m_CombineOr(m_LShr(m_AllOnes(), m_Value()), m_LowBitMask());
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
+ m_Deferred(X))))
+ return nullptr;
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // x & (-1 >> y) == x -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // x & (-1 >> y) != x -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGT:
+ // x u> x & (-1 >> y) -> x u> (-1 >> y)
+ assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
+ assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_ULT:
+ // x & (-1 >> y) u< x -> x u> (-1 >> y)
+ assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_ULE:
+ // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
+ assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_SGT:
+ // x s> x & (-1 >> y) -> x s> (-1 >> y)
+ if (X != I.getOperand(0)) // X must be on LHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
+ if (X != I.getOperand(1)) // X must be on RHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ case ICmpInst::Predicate::ICMP_SLT:
+ // x & (-1 >> y) s< x -> x s> (-1 >> y)
+ if (X != I.getOperand(1)) // X must be on RHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SLE:
+ // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
+ if (X != I.getOperand(0)) // X must be on LHS of comparison!
+ return nullptr; // Ignore the other case.
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ default:
+ llvm_unreachable("All possible folds are handled.");
+ }
+
+ return Builder.CreateICmp(DstPred, X, M);
+}
+
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy signed truncation.
+/// Folds: (MaskedBits is a constant.)
+/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
+/// Into:
+/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
+/// Where KeptBits = bitwidth(%x) - MaskedBits
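+/// For example, with i8 %x and MaskedBits = 4:
+///   ((%x << 4) a>> 4) == %x  becomes  (add %x, 8) u< 16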
+static Value *
+foldICmpWithTruncSignExtendedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X;
+ const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
+ // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
+ m_APInt(C1))),
+ m_Deferred(X))))
+ return nullptr;
+
+ // Potential handling of non-splats: for each element:
+ // * if both are undef, replace with constant 0.
+ // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
+ //   * if both are not undef and they differ, bail out.
+ // * else, only one is undef, then pick the non-undef one.
+
+ // The shift amount must be equal.
+ if (*C0 != *C1)
+ return nullptr;
+ const APInt &MaskedBits = *C0;
+ assert(MaskedBits != 0 && "shift by zero should be folded away already.");
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // ((%x << MaskedBits) a>> MaskedBits) == %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_ULT;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // ((%x << MaskedBits) a>> MaskedBits) != %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_UGE;
+ break;
+ // FIXME: are more folds possible?
+ default:
+ return nullptr;
+ }
+
+ auto *XType = X->getType();
+ const unsigned XBitWidth = XType->getScalarSizeInBits();
+ const APInt BitWidth = APInt(XBitWidth, XBitWidth);
+ assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
+
+ // KeptBits = bitwidth(%x) - MaskedBits
+ const APInt KeptBits = BitWidth - MaskedBits;
+ assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
+ // ICmpCst = (1 << KeptBits)
+ const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
+ assert(ICmpCst.isPowerOf2());
+ // AddCst = (1 << (KeptBits-1))
+ const APInt AddCst = ICmpCst.lshr(1);
+ assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
+
+ // T0 = add %x, AddCst
+ Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
+ // T1 = T0 DstPred ICmpCst
+ Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
+
+ return T1;
+}
+
/// Try to fold icmp (binop), X or icmp X, (binop).
/// TODO: A large part of this logic is duplicated in InstSimplify's
/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
@@ -3011,17 +3203,22 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
// icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
if (A == Op1 && NoOp0WrapProblem)
return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
-
// icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
if (C == Op0 && NoOp1WrapProblem)
return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+ // (A - B) >u A --> A <u B
+ if (A == Op1 && Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, A, B);
+ // C <u (C - D) --> C <u D
+ if (C == Op0 && Pred == ICmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, C, D);
+
// icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse())
return new ICmpInst(Pred, A, C);
-
// icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
@@ -3032,8 +3229,8 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
Value *X;
if (match(BO0, m_Neg(m_Value(X))))
- if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
- if (!RHSC->isMinValue(/*isSigned=*/true))
+ if (Constant *RHSC = dyn_cast<Constant>(Op1))
+ if (RHSC->isNotMinSignedValue())
return new ICmpInst(I.getSwappedPredicate(), X,
ConstantExpr::getNeg(RHSC));
}
@@ -3160,6 +3357,12 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
}
}
+ if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
return nullptr;
}
@@ -3414,8 +3617,15 @@ Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
// Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
// integer type is the same size as the pointer type.
+ const auto &CompatibleSizes = [&](Type *SrcTy, Type *DestTy) -> bool {
+ if (isa<VectorType>(SrcTy)) {
+ SrcTy = cast<VectorType>(SrcTy)->getElementType();
+ DestTy = cast<VectorType>(DestTy)->getElementType();
+ }
+ return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
+ };
if (LHSCI->getOpcode() == Instruction::PtrToInt &&
- DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
+ CompatibleSizes(SrcTy, DestTy)) {
Value *RHSOp = nullptr;
if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
Value *RHSCIOp = RHSC->getOperand(0);
@@ -3618,7 +3828,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
return false;
}
-/// \brief Recognize and process idiom involving test for multiplication
+/// Recognize and process an idiom involving a test for multiplication
/// overflow.
///
/// The caller has matched a pattern of the form:
@@ -3799,7 +4009,8 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
// mul.with.overflow and adjust properly mask/size.
if (MulVal->hasNUsesOrMore(2)) {
Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
- for (User *U : MulVal->users()) {
+ for (auto UI = MulVal->user_begin(), UE = MulVal->user_end(); UI != UE;) {
+ User *U = *UI++;
if (U == &I || U == OtherVal)
continue;
if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
@@ -3890,48 +4101,33 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
}
}
-/// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst
+/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
/// should be swapped.
/// The decision is based on how many times these two operands are reused
/// as subtract operands and their positions in those instructions.
-/// The rational is that several architectures use the same instruction for
-/// both subtract and cmp, thus it is better if the order of those operands
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp. Thus, it is better if the order of those operands
/// match.
/// \return true if Op0 and Op1 should be swapped.
-static bool swapMayExposeCSEOpportunities(const Value * Op0,
- const Value * Op1) {
- // Filter out pointer value as those cannot appears directly in subtract.
+static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
+ // Filter out pointer values as those cannot appear directly in subtract.
// FIXME: we may want to go through inttoptrs or bitcasts.
if (Op0->getType()->isPointerTy())
return false;
- // Count every uses of both Op0 and Op1 in a subtract.
- // Each time Op0 is the first operand, count -1: swapping is bad, the
- // subtract has already the same layout as the compare.
- // Each time Op0 is the second operand, count +1: swapping is good, the
- // subtract has a different layout as the compare.
- // At the end, if the benefit is greater than 0, Op0 should come second to
- // expose more CSE opportunities.
- int GlobalSwapBenefits = 0;
+ // If a subtract already has the same operands as a compare, swapping would be
+ // bad. If a subtract has the same operands as a compare but in reverse order,
+ // then swapping is good.
+ int GoodToSwap = 0;
for (const User *U : Op0->users()) {
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(U);
- if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
- continue;
- // If Op0 is the first argument, this is not beneficial to swap the
- // arguments.
- int LocalSwapBenefits = -1;
- unsigned Op1Idx = 1;
- if (BinOp->getOperand(Op1Idx) == Op0) {
- Op1Idx = 0;
- LocalSwapBenefits = 1;
- }
- if (BinOp->getOperand(Op1Idx) != Op1)
- continue;
- GlobalSwapBenefits += LocalSwapBenefits;
+ if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
+ GoodToSwap++;
+ else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
+ GoodToSwap--;
}
- return GlobalSwapBenefits > 0;
+ return GoodToSwap > 0;
}
-/// \brief Check that one use is in the same block as the definition and all
+/// Check that one use is in the same block as the definition and all
/// other uses are in blocks dominated by a given block.
///
/// \param DI Definition
@@ -3976,7 +4172,7 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
return true;
}
-/// \brief True when a select result is replaced by one of its operands
+/// True when a select result is replaced by one of its operands
/// in select-icmp sequence. This will eventually result in the elimination
/// of the select.
///
@@ -4052,7 +4248,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
// Get scalar or pointer size.
unsigned BitWidth = Ty->isIntOrIntVectorTy()
? Ty->getScalarSizeInBits()
- : DL.getTypeSizeInBits(Ty->getScalarType());
+ : DL.getIndexTypeSizeInBits(Ty->getScalarType());
if (!BitWidth)
return nullptr;
@@ -4082,13 +4278,13 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
}
- // If Min and Max are known to be the same, then SimplifyDemandedBits
- // figured out that the LHS is a constant. Constant fold this now, so that
+ // If Min and Max are known to be the same, then SimplifyDemandedBits figured
+ // out that the LHS or RHS is a constant. Constant fold this now, so that
// code below can assume that Min != Max.
if (!isa<Constant>(Op0) && Op0Min == Op0Max)
- return new ICmpInst(Pred, ConstantInt::get(Op0->getType(), Op0Min), Op1);
+ return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
if (!isa<Constant>(Op1) && Op1Min == Op1Max)
- return new ICmpInst(Pred, Op0, ConstantInt::get(Op1->getType(), Op1Min));
+ return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
// Based on the range information we know about the LHS, see if we can
// simplify this comparison. For example, (x&4) < 8 is always true.
@@ -4520,6 +4716,34 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return New;
}
+ // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
+ Value *X;
+ if (match(Op0, m_BitCast(m_SIToFP(m_Value(X))))) {
+ // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
+ // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
+ // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
+ match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
+ if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
+
+ // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
+ if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
+ return new ICmpInst(Pred, X, ConstantInt::getAllOnesValue(X->getType()));
+ }
+
+ // Zero-equality checks are preserved through unsigned floating-point casts:
+ // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
+ if (match(Op0, m_BitCast(m_UIToFP(m_Value(X)))))
+ if (I.isEquality() && match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
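
A standalone spot check of why these folds are sound (plain C++, illustration only): sitofp never produces -0.0, so the float's bit pattern is all-zero exactly when the integer input is zero, and the float sign bit tracks the integer's sign.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t bitsOfFloat(int32_t X) {
      float F = static_cast<float>(X);      // sitofp
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits)); // bitcast
      return Bits;
    }

    int main() {
      assert(bitsOfFloat(0) == 0);          // only 0 maps to all-zero bits
      assert(bitsOfFloat(1) != 0 && bitsOfFloat(-7) != 0);
      assert((bitsOfFloat(-7) >> 31) == 1); // sign bit preserved
      assert((bitsOfFloat(7) >> 31) == 0);
    }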
// Test to see if the operands of the icmp are casted versions of other
// values. If the ptr->ptr cast can be stripped off both arguments, we do so
// now.
@@ -4642,6 +4866,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
return foldICmpAddOpConst(X, Cst, I.getSwappedPredicate());
}
+
return Changed ? &I : nullptr;
}
@@ -4928,11 +5153,11 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
// then canonicalize the operand to 0.0.
if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
- if (!match(Op0, m_Zero()) && isKnownNeverNaN(Op0)) {
+ if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0)) {
I.setOperand(0, ConstantFP::getNullValue(Op0->getType()));
return &I;
}
- if (!match(Op1, m_Zero()) && isKnownNeverNaN(Op1)) {
+ if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1)) {
I.setOperand(1, ConstantFP::getNullValue(Op0->getType()));
return &I;
}
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index f1f66d86cb73..58ef3d41415c 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -20,6 +20,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -122,17 +122,17 @@ static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
return V;
}
-/// \brief Add one to a Constant
+/// Add one to a Constant
static inline Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
-/// \brief Subtract one from a Constant
+/// Subtract one from a Constant
static inline Constant *SubOne(Constant *C) {
return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
}
-/// \brief Return true if the specified value is free to invert (apply ~ to).
+/// Return true if the specified value is free to invert (apply ~ to).
/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses
/// is true, work under the assumption that the caller intends to remove all
/// uses of V and only keep uses of ~V.
@@ -178,7 +178,7 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
return false;
}
-/// \brief Specific patterns of overflow check idioms that we match.
+/// Specific patterns of overflow check idioms that we match.
enum OverflowCheckFlavor {
OCF_UNSIGNED_ADD,
OCF_SIGNED_ADD,
@@ -190,7 +190,7 @@ enum OverflowCheckFlavor {
OCF_INVALID
};
-/// \brief Returns the OverflowCheckFlavor corresponding to a overflow_with_op
+/// Returns the OverflowCheckFlavor corresponding to a overflow_with_op
/// intrinsic.
static inline OverflowCheckFlavor
IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
@@ -212,7 +212,62 @@ IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
}
}
-/// \brief The core instruction combiner logic.
+/// Some binary operators require special handling to avoid poison and undefined
+/// behavior. If a constant vector has undef elements, replace those undefs with
+/// identity constants if possible because those are always safe to execute.
+/// If no identity constant exists, replace undef with some other safe constant.
+static inline Constant *getSafeVectorConstantForBinop(
+ BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) {
+ assert(In->getType()->isVectorTy() && "Not expecting scalars here");
+
+ Type *EltTy = In->getType()->getVectorElementType();
+ auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant);
+ if (!SafeC) {
+ // TODO: Should this be available as a constant utility function? It is
+ // similar to getBinOpAbsorber().
+ if (IsRHSConstant) {
+ switch (Opcode) {
+ case Instruction::SRem: // X % 1 = 0
+ case Instruction::URem: // X %u 1 = 0
+ SafeC = ConstantInt::get(EltTy, 1);
+ break;
+ case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe)
+ SafeC = ConstantFP::get(EltTy, 1.0);
+ break;
+ default:
+ llvm_unreachable("Only rem opcodes have no identity constant for RHS");
+ }
+ } else {
+ switch (Opcode) {
+ case Instruction::Shl: // 0 << X = 0
+ case Instruction::LShr: // 0 >>u X = 0
+ case Instruction::AShr: // 0 >> X = 0
+ case Instruction::SDiv: // 0 / X = 0
+ case Instruction::UDiv: // 0 /u X = 0
+ case Instruction::SRem: // 0 % X = 0
+ case Instruction::URem: // 0 %u X = 0
+ case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe)
+ case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe)
+ case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe)
+ case Instruction::FRem: // 0.0 % X = 0
+ SafeC = Constant::getNullValue(EltTy);
+ break;
+ default:
+ llvm_unreachable("Expected to find identity constant for opcode");
+ }
+ }
+ }
+ assert(SafeC && "Must have safe constant for binop");
+ unsigned NumElts = In->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> Out(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *C = In->getAggregateElement(i);
+ Out[i] = isa<UndefValue>(C) ? SafeC : C;
+ }
+ return ConstantVector::get(Out);
+}
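
The intended effect, modeled in standalone C++ (std::optional stands in for undef lanes; the helper name is hypothetical): an undef lane in an srem divisor becomes 1, which is safe because X % 1 is 0 and can never trap, unlike a divisor of 0.

    #include <array>
    #include <cstddef>
    #include <optional>

    using Lane = std::optional<int>; // nullopt models an undef lane

    static std::array<int, 4> makeSafeSRemDivisor(const std::array<Lane, 4> &In) {
      std::array<int, 4> Out{};
      for (std::size_t I = 0; I != In.size(); ++I)
        Out[I] = In[I].value_or(1); // undef lane -> safe constant 1
      return Out;
    }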
+
+/// The core instruction combiner logic.
///
/// This class provides both the logic to recursively visit instructions and
/// combine them.
@@ -220,10 +275,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
: public InstVisitor<InstCombiner, Instruction *> {
// FIXME: These members shouldn't be public.
public:
- /// \brief A worklist of the instructions that need to be simplified.
+ /// A worklist of the instructions that need to be simplified.
InstCombineWorklist &Worklist;
- /// \brief An IRBuilder that automatically inserts new instructions into the
+ /// An IRBuilder that automatically inserts new instructions into the
/// worklist.
using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
BuilderTy &Builder;
@@ -261,7 +316,7 @@ public:
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {}
- /// \brief Run the combiner over the entire worklist until it is empty.
+ /// Run the combiner over the entire worklist until it is empty.
///
/// \returns true if the IR is changed.
bool run();
@@ -289,8 +344,6 @@ public:
Instruction *visitSub(BinaryOperator &I);
Instruction *visitFSub(BinaryOperator &I);
Instruction *visitMul(BinaryOperator &I);
- Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C,
- Instruction *InsertBefore);
Instruction *visitFMul(BinaryOperator &I);
Instruction *visitURem(BinaryOperator &I);
Instruction *visitSRem(BinaryOperator &I);
@@ -378,7 +431,6 @@ private:
bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
- Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices);
@@ -393,7 +445,7 @@ private:
/// if it cannot already be eliminated by some other transformation.
bool shouldOptimizeCast(CastInst *CI);
- /// \brief Try to optimize a sequence of instructions checking if an operation
+ /// Try to optimize a sequence of instructions checking if an operation
/// on LHS and RHS overflows.
///
/// If this overflow check is done via one of the overflow check intrinsics,
@@ -445,11 +497,22 @@ private:
}
bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const;
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
const Instruction &CxtI) const {
@@ -462,6 +525,7 @@ private:
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
Instruction *narrowBinOp(TruncInst &Trunc);
+ Instruction *narrowMaskedBinOp(BinaryOperator &And);
Instruction *narrowRotate(TruncInst &Trunc);
Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
@@ -490,7 +554,7 @@ private:
Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
bool JoinedByAnd, Instruction &CxtI);
public:
- /// \brief Inserts an instruction \p New before instruction \p Old
+ /// Inserts an instruction \p New before instruction \p Old
///
/// Also adds the new instruction to the worklist and returns \p New so that
/// it is suitable for use as the return from the visitation patterns.
@@ -503,13 +567,13 @@ public:
return New;
}
- /// \brief Same as InsertNewInstBefore, but also sets the debug loc.
+ /// Same as InsertNewInstBefore, but also sets the debug loc.
Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
New->setDebugLoc(Old.getDebugLoc());
return InsertNewInstBefore(New, Old);
}
- /// \brief A combiner-aware RAUW-like routine.
+ /// A combiner-aware RAUW-like routine.
///
/// This method is to be used when an instruction is found to be dead,
/// replaceable with another preexisting expression. Here we add all uses of
@@ -527,8 +591,8 @@ public:
if (&I == V)
V = UndefValue::get(I.getType());
- DEBUG(dbgs() << "IC: Replacing " << I << "\n"
- << " with " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+ << " with " << *V << '\n');
I.replaceAllUsesWith(V);
return &I;
@@ -544,13 +608,13 @@ public:
return InsertValueInst::Create(Struct, Result, 0);
}
- /// \brief Combiner aware instruction erasure.
+ /// Combiner aware instruction erasure.
///
/// When dealing with an instruction that has side effects or produces a void
/// value, we can't rely on DCE to delete the instruction. Instead, visit
/// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) {
- DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
assert(I.use_empty() && "Cannot erase instruction that is used!");
salvageDebugInfo(I);
@@ -599,6 +663,12 @@ public:
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
const Value *RHS,
const Instruction *CxtI) const {
@@ -611,15 +681,26 @@ public:
return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
private:
- /// \brief Performs a few simplifications for operators which are associative
+ /// Performs a few simplifications for operators which are associative
/// or commutative.
bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
- /// \brief Tries to simplify binary operations which some other binary
+ /// Tries to simplify binary operations which some other binary
/// operation distributes over.
///
/// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)"
@@ -628,6 +709,13 @@ private:
/// value, or null if it didn't simplify.
Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+ /// Tries to simplify add operations using the definition of remainder.
+ ///
+  /// The definition of remainder is X % C = X - (X / C) * C. The add
+  /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
+  /// X % (C0 * C1).
+ Value *SimplifyAddWithRemainder(BinaryOperator &I);
+
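
The identity holds for unsigned values; a brute-force standalone check (plain C++, illustration only):

    #include <cassert>

    int main() {
      // X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1)
      for (unsigned C0 = 1; C0 <= 8; ++C0)
        for (unsigned C1 = 1; C1 <= 8; ++C1)
          for (unsigned X = 0; X < 1000; ++X)
            assert(X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1));
    }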
// Binary Op helper for select operations where the expression can be
// efficiently reorganized.
Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
@@ -647,7 +735,7 @@ private:
ConstantInt *&Less, ConstantInt *&Equal,
ConstantInt *&Greater);
- /// \brief Attempts to replace V with a simpler value based on the demanded
+ /// Attempts to replace V with a simpler value based on the demanded
/// bits.
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
unsigned Depth, Instruction *CxtI);
@@ -669,15 +757,19 @@ private:
Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
- /// \brief Tries to simplify operands to an integer instruction based on its
+ /// Tries to simplify operands to an integer instruction based on its
/// demanded bits.
bool SimplifyDemandedInstructionBits(Instruction &Inst);
+ Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DmaskIdx = -1);
+
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);
- Value *SimplifyVectorOp(BinaryOperator &Inst);
-
+ /// Canonicalize the position of binops relative to shufflevector.
+ Instruction *foldShuffledBinop(BinaryOperator &Inst);
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
@@ -691,11 +783,11 @@ private:
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
/// This is a convenience wrapper function for the above two functions.
- Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
+ Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
Instruction *foldAddWithConstant(BinaryOperator &Add);
- /// \brief Try to rotate an operation below a PHI node, using PHI nodes for
+ /// Try to rotate an operation below a PHI node, using PHI nodes for
/// its operands.
Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
@@ -735,6 +827,8 @@ private:
Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
ConstantInt *C);
+ Instruction *foldICmpBitCastConstant(ICmpInst &Cmp, BitCastInst *Bitcast,
+ const APInt &C);
Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
const APInt &C);
Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
@@ -789,13 +883,12 @@ private:
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
- Instruction *SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI);
- Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
- Instruction *SimplifyMemSet(MemSetInst *MI);
+ Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
+ Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
- /// \brief Returns a value X such that Val = X * Scale, or null if none.
+ /// Returns a value X such that Val = X * Scale, or null if none.
///
/// If the multiplication is known not to overflow then NoSignedWrap is set.
Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index d4f06e18b957..742caf649007 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -23,7 +24,6 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -270,7 +270,7 @@ void PointerReplacer::findLoadAndReplace(Instruction &I) {
auto *Inst = dyn_cast<Instruction>(&*U);
if (!Inst)
return;
- DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+ LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
if (isa<LoadInst>(Inst)) {
for (auto P : Path)
replace(P);
@@ -405,8 +405,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
if (AI.getAlignment() <= SourceAlign &&
isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
- DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
- DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
@@ -437,10 +437,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Are we allowed to form an atomic load or store of this type?
static bool isSupportedAtomicType(Type *Ty) {
- return Ty->isIntegerTy() || Ty->isPointerTy() || Ty->isFloatingPointTy();
+ return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
}
-/// \brief Helper to combine a load to a new type.
+/// Helper to combine a load to a new type.
///
/// This just does the work of combining a load to a new type. It handles
/// metadata, etc., and returns the new instruction. The \c NewTy should be the
@@ -453,15 +453,20 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
const Twine &Suffix = "") {
assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
"can't fold an atomic load to requested type");
-
+
Value *Ptr = LI.getPointerOperand();
unsigned AS = LI.getPointerAddressSpace();
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
LI.getAllMetadata(MD);
+ Value *NewPtr = nullptr;
+ if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
+ NewPtr->getType()->getPointerElementType() == NewTy &&
+ NewPtr->getType()->getPointerAddressSpace() == AS))
+ NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
+
LoadInst *NewLoad = IC.Builder.CreateAlignedLoad(
- IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
- LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
+ NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
MDBuilder MDB(NewLoad->getContext());
for (const auto &MDPair : MD) {
@@ -507,7 +512,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
return NewLoad;
}
-/// \brief Combine a store to a new type.
+/// Combine a store to a new type.
///
/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
@@ -584,7 +589,7 @@ static bool isMinMaxWithLoads(Value *V) {
match(L2, m_Load(m_Specific(LHS))));
}
-/// \brief Combine loads to match the type of their uses' value after looking
+/// Combine loads to match the type of their uses' value after looking
/// through intervening bitcasts.
///
/// The core idea here is that if the result of a load is used in an operation,
@@ -959,23 +964,26 @@ static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
}
static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
- if (SI.getPointerAddressSpace() != 0)
+ if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
return false;
auto *Ptr = SI.getPointerOperand();
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
Ptr = GEPI->getOperand(0);
- return isa<ConstantPointerNull>(Ptr);
+ return (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
}
static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
const Value *GEPI0 = GEPI->getOperand(0);
- if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0)
+ if (isa<ConstantPointerNull>(GEPI0) &&
+ !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
return true;
}
if (isa<UndefValue>(Op) ||
- (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0))
+ (isa<ConstantPointerNull>(Op) &&
+ !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
return true;
return false;
}
@@ -1071,14 +1079,16 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// load (select (cond, null, P)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
- LI.getPointerAddressSpace() == 0) {
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace())) {
LI.setOperand(0, SI->getOperand(2));
return &LI;
}
// load (select (cond, P, null)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
- LI.getPointerAddressSpace() == 0) {
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace())) {
LI.setOperand(0, SI->getOperand(1));
return &LI;
}
@@ -1087,7 +1097,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return nullptr;
}
-/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast.
+/// Look for extractelement/insertvalue sequence that acts like a bitcast.
///
/// \returns underlying value that was "cast", or nullptr otherwise.
///
@@ -1142,7 +1152,7 @@ static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
return U;
}
-/// \brief Combine stores to match the type of value being stored.
+/// Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
/// where we can we should match the type of a store to the type of value being
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 541dde6c47d2..63761d427235 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -94,115 +95,52 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
return MadeChange ? V : nullptr;
}
-/// True if the multiply can not be expressed in an int this size.
-static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
- bool IsSigned) {
- bool Overflow;
- if (IsSigned)
- Product = C1.smul_ov(C2, Overflow);
- else
- Product = C1.umul_ov(C2, Overflow);
-
- return Overflow;
-}
-
-/// \brief True if C2 is a multiple of C1. Quotient contains C2/C1.
-static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
- bool IsSigned) {
- assert(C1.getBitWidth() == C2.getBitWidth() &&
- "Inconsistent width of constants!");
-
- // Bail if we will divide by zero.
- if (C2.isMinValue())
- return false;
-
- // Bail if we would divide INT_MIN by -1.
- if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
- return false;
-
- APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
- if (IsSigned)
- APInt::sdivrem(C1, C2, Quotient, Remainder);
- else
- APInt::udivrem(C1, C2, Quotient, Remainder);
-
- return Remainder.isMinValue();
-}
-
-/// \brief A helper routine of InstCombiner::visitMul().
+/// A helper routine of InstCombiner::visitMul().
///
-/// If C is a vector of known powers of 2, then this function returns
-/// a new vector obtained from C replacing each element with its logBase2.
+/// If C is a scalar/vector of known powers of 2, then this function returns
+/// a new scalar/vector obtained by taking logBase2 of each element of C.
/// Return a null pointer otherwise.
-static Constant *getLogBase2Vector(ConstantDataVector *CV) {
+static Constant *getLogBase2(Type *Ty, Constant *C) {
const APInt *IVal;
- SmallVector<Constant *, 4> Elts;
+ if (match(C, m_APInt(IVal)) && IVal->isPowerOf2())
+ return ConstantInt::get(Ty, IVal->logBase2());
+
+ if (!Ty->isVectorTy())
+ return nullptr;
- for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
- Constant *Elt = CV->getElementAsConstant(I);
+ SmallVector<Constant *, 4> Elts;
+ for (unsigned I = 0, E = Ty->getVectorNumElements(); I != E; ++I) {
+ Constant *Elt = C->getAggregateElement(I);
+ if (!Elt)
+ return nullptr;
+ if (isa<UndefValue>(Elt)) {
+ Elts.push_back(UndefValue::get(Ty->getScalarType()));
+ continue;
+ }
if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2())
return nullptr;
- Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2()));
+ Elts.push_back(ConstantInt::get(Ty->getScalarType(), IVal->logBase2()));
}
return ConstantVector::get(Elts);
}
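
The rewrite this helper enables, checked standalone (plain C++, illustration only): multiplying by a power-of-two constant equals shifting left by its logBase2.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 8;     // a known power of two
      const uint32_t Log2C = 3; // what getLogBase2 computes per element
      for (uint32_t X = 0; X < 1000; ++X)
        assert(X * C == (X << Log2C));
    }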
-/// \brief Return true if we can prove that:
-/// (mul LHS, RHS) === (mul nsw LHS, RHS)
-bool InstCombiner::willNotOverflowSignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction &CxtI) const {
- // Multiplying n * m significant bits yields a result of n + m significant
- // bits. If the total number of significant bits does not exceed the
- // result bit width (minus 1), there is no overflow.
- // This means if we have enough leading sign bits in the operands
- // we can guarantee that the result does not overflow.
- // Ref: "Hacker's Delight" by Henry Warren
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
-
- // Note that underestimating the number of sign bits gives a more
- // conservative answer.
- unsigned SignBits =
- ComputeNumSignBits(LHS, 0, &CxtI) + ComputeNumSignBits(RHS, 0, &CxtI);
-
- // First handle the easy case: if we have enough sign bits there's
- // definitely no overflow.
- if (SignBits > BitWidth + 1)
- return true;
-
- // There are two ambiguous cases where there can be no overflow:
- // SignBits == BitWidth + 1 and
- // SignBits == BitWidth
- // The second case is difficult to check, therefore we only handle the
- // first case.
- if (SignBits == BitWidth + 1) {
- // It overflows only when both arguments are negative and the true
- // product is exactly the minimum negative number.
- // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
- // For simplicity we just check if at least one side is not negative.
- KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
- KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
- return true;
- }
- return false;
-}
-
Instruction *InstCombiner::visitMul(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyMulInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
// X * -1 == 0 - X
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (match(Op1, m_AllOnes())) {
BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
if (I.hasNoSignedWrap())
@@ -231,16 +169,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
- Constant *NewCst = nullptr;
- if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2())
- // Replace X*(2^C) with X << C, where C is either a scalar or a splat.
- NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2());
- else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1))
- // Replace X*(2^C) with X << C, where C is a vector of known
- // constant powers of 2.
- NewCst = getLogBase2Vector(CV);
-
- if (NewCst) {
+ // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
+ if (Constant *NewCst = getLogBase2(NewOp->getType(), C1)) {
unsigned Width = NewCst->getType()->getPrimitiveSizeInBits();
BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
@@ -282,34 +212,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
// Simplify mul instructions with a constant RHS.
if (isa<Constant>(Op1)) {
- if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
- return FoldedMul;
-
// Canonicalize (X+C1)*CI -> X*CI+C1*CI.
- {
- Value *X;
- Constant *C1;
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Mul = Builder.CreateMul(C1, Op1);
- // Only go forward with the transform if C1*CI simplifies to a tidier
- // constant.
- if (!match(Mul, m_Mul(m_Value(), m_Value())))
- return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
- }
+ Value *X;
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
+ Value *Mul = Builder.CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
}
}
- if (Value *Op0v = dyn_castNegVal(Op0)) { // -X * -Y = X*Y
- if (Value *Op1v = dyn_castNegVal(Op1)) {
- BinaryOperator *BO = BinaryOperator::CreateMul(Op0v, Op1v);
- if (I.hasNoSignedWrap() &&
- match(Op0, m_NSWSub(m_Value(), m_Value())) &&
- match(Op1, m_NSWSub(m_Value(), m_Value())))
- BO->setHasNoSignedWrap();
- return BO;
- }
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *Op1C;
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
+
+ // -X * -Y --> X * Y
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
+ auto *NewMul = BinaryOperator::CreateMul(X, Y);
+ if (I.hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
+ NewMul->setHasNoSignedWrap();
+ return NewMul;
}
// (X / Y) * Y = X - (X % Y)
@@ -371,28 +304,24 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
- // If one of the operands of the multiply is a cast from a boolean value, then
- // we know the bool is either zero or one, so this is a 'masking' multiply.
- // X * Y (where Y is 0 or 1) -> X & (0-Y)
- if (!I.getType()->isVectorTy()) {
- // -2 is "-1 << 1" so it is all bits set except the low one.
- APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
-
- Value *BoolCast = nullptr, *OtherOp = nullptr;
- if (MaskedValueIsZero(Op0, Negative2, 0, &I)) {
- BoolCast = Op0;
- OtherOp = Op1;
- } else if (MaskedValueIsZero(Op1, Negative2, 0, &I)) {
- BoolCast = Op1;
- OtherOp = Op0;
- }
-
- if (BoolCast) {
- Value *V = Builder.CreateSub(Constant::getNullValue(I.getType()),
- BoolCast);
- return BinaryOperator::CreateAnd(V, OtherOp);
- }
- }
+ // (bool X) * Y --> X ? Y : 0
+ // Y * (bool X) --> X ? Y : 0
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+
+ // (lshr X, 31) * Y --> (ashr X, 31) & Y
+ // Y * (lshr X, 31) --> (ashr X, 31) & Y
+ // TODO: We are not checking one-use because the elimination of the multiply
+ // is better for analysis?
+ // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
+ // more similar to what we're doing above.
+ const APInt *C;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
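
Both folds can be spot-checked standalone (plain C++, illustration only): a zero-extended bool times Y behaves as a select, and multiplying by the sign bit equals masking with the sign-smeared value.

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t Y : {-5, 0, 7}) {
        for (bool B : {false, true})
          assert(static_cast<int32_t>(B) * Y == (B ? Y : 0)); // (bool X) * Y
        for (uint32_t X : {0u, 1u, 0x80000000u, 0xdeadbeefu}) {
          uint32_t SignBit = X >> 31; // lshr X, 31
          // ashr X, 31 (arithmetic shift; guaranteed since C++20)
          uint32_t Smear =
              static_cast<uint32_t>(static_cast<int32_t>(X) >> 31);
          assert(SignBit * static_cast<uint32_t>(Y) ==
                 (Smear & static_cast<uint32_t>(Y)));
        }
      }
    }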
  // Check for (mul (sext x), y); see if we can merge this into an
  // integer mul followed by a sext.
@@ -466,6 +395,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
@@ -479,286 +409,103 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
-/// Detect pattern log2(Y * 0.5) with corresponding fast math flags.
-static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
- if (!Op->hasOneUse())
- return;
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
- if (!II)
- return;
- if (II->getIntrinsicID() != Intrinsic::log2 || !II->isFast())
- return;
- Log2 = II;
-
- Value *OpLog2Of = II->getArgOperand(0);
- if (!OpLog2Of->hasOneUse())
- return;
-
- Instruction *I = dyn_cast<Instruction>(OpLog2Of);
- if (!I)
- return;
-
- if (I->getOpcode() != Instruction::FMul || !I->isFast())
- return;
-
- if (match(I->getOperand(0), m_SpecificFP(0.5)))
- Y = I->getOperand(1);
- else if (match(I->getOperand(1), m_SpecificFP(0.5)))
- Y = I->getOperand(0);
-}
-
-static bool isFiniteNonZeroFp(Constant *C) {
- if (C->getType()->isVectorTy()) {
- for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
- ++I) {
- ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
- if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
- return false;
- }
- return true;
- }
-
- return isa<ConstantFP>(C) &&
- cast<ConstantFP>(C)->getValueAPF().isFiniteNonZero();
-}
-
-static bool isNormalFp(Constant *C) {
- if (C->getType()->isVectorTy()) {
- for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
- ++I) {
- ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
- if (!CFP || !CFP->getValueAPF().isNormal())
- return false;
- }
- return true;
- }
-
- return isa<ConstantFP>(C) && cast<ConstantFP>(C)->getValueAPF().isNormal();
-}
-
-/// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns
-/// true iff the given value is FMul or FDiv with one and only one operand
-/// being a normal constant (i.e. not Zero/NaN/Infinity).
-static bool isFMulOrFDivWithConstant(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || (I->getOpcode() != Instruction::FMul &&
- I->getOpcode() != Instruction::FDiv))
- return false;
-
- Constant *C0 = dyn_cast<Constant>(I->getOperand(0));
- Constant *C1 = dyn_cast<Constant>(I->getOperand(1));
-
- if (C0 && C1)
- return false;
-
- return (C0 && isFiniteNonZeroFp(C0)) || (C1 && isFiniteNonZeroFp(C1));
-}
+Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
+ if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
-/// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
-/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
-/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
-/// This function is to simplify "FMulOrDiv * C" and returns the
-/// resulting expression. Note that this function could return NULL in
-/// case the constants cannot be folded into a normal floating-point.
-Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C,
- Instruction *InsertBefore) {
- assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
-
- Value *Opnd0 = FMulOrDiv->getOperand(0);
- Value *Opnd1 = FMulOrDiv->getOperand(1);
-
- Constant *C0 = dyn_cast<Constant>(Opnd0);
- Constant *C1 = dyn_cast<Constant>(Opnd1);
-
- BinaryOperator *R = nullptr;
-
- // (X * C0) * C => X * (C0*C)
- if (FMulOrDiv->getOpcode() == Instruction::FMul) {
- Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F);
- } else {
- if (C0) {
- // (C0 / X) * C => (C0 * C) / X
- if (FMulOrDiv->hasOneUse()) {
- // It would otherwise introduce another div.
- Constant *F = ConstantExpr::getFMul(C0, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFDiv(F, Opnd1);
- }
- } else {
- // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
- Constant *F = ConstantExpr::getFDiv(C, C1);
- if (isNormalFp(F)) {
- R = BinaryOperator::CreateFMul(Opnd0, F);
- } else {
- // (X / C1) * C => X / (C1/C)
- Constant *F = ConstantExpr::getFDiv(C1, C);
- if (isNormalFp(F))
- R = BinaryOperator::CreateFDiv(Opnd0, F);
- }
- }
- }
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
- if (R) {
- R->setFast(true);
- InsertNewInstWith(R, *InsertBefore);
- }
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
- return R;
-}
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
-Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
+ // X * -1.0 --> -X
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (match(Op1, m_SpecificFP(-1.0)))
+ return BinaryOperator::CreateFNegFMF(Op0, &I);
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
+ // -X * -Y --> X * Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFMulFMF(X, Y, &I);
- if (isa<Constant>(Op0))
- std::swap(Op0, Op1);
+ // -X * C --> X * -C
+ Constant *C;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
- if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ // Sink negation: -X * Y --> -(X * Y)
+ if (match(Op0, m_OneUse(m_FNeg(m_Value(X)))))
+ return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op1, &I), &I);
- bool AllowReassociate = I.isFast();
+ // Sink negation: Y * -X --> -(X * Y)
+ if (match(Op1, m_OneUse(m_FNeg(m_Value(X)))))
+ return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op0, &I), &I);
- // Simplify mul instructions with a constant RHS.
- if (isa<Constant>(Op1)) {
- if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
- return FoldedMul;
-
- // (fmul X, -1.0) --> (fsub -0.0, X)
- if (match(Op1, m_SpecificFP(-1.0))) {
- Constant *NegZero = ConstantFP::getNegativeZero(Op1->getType());
- Instruction *RI = BinaryOperator::CreateFSub(NegZero, Op0);
- RI->copyFastMathFlags(&I);
- return RI;
- }
-
- Constant *C = cast<Constant>(Op1);
- if (AllowReassociate && isFiniteNonZeroFp(C)) {
- // Let MDC denote an expression in one of these forms:
- // X * C, C/X, X/C, where C is a constant.
- //
- // Try to simplify "MDC * Constant"
- if (isFMulOrFDivWithConstant(Op0))
- if (Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I))
- return replaceInstUsesWith(I, V);
-
- // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
- Instruction *FAddSub = dyn_cast<Instruction>(Op0);
- if (FAddSub &&
- (FAddSub->getOpcode() == Instruction::FAdd ||
- FAddSub->getOpcode() == Instruction::FSub)) {
- Value *Opnd0 = FAddSub->getOperand(0);
- Value *Opnd1 = FAddSub->getOperand(1);
- Constant *C0 = dyn_cast<Constant>(Opnd0);
- Constant *C1 = dyn_cast<Constant>(Opnd1);
- bool Swap = false;
- if (C0) {
- std::swap(C0, C1);
- std::swap(Opnd0, Opnd1);
- Swap = true;
- }
+ // fabs(X) * fabs(X) -> X * X
+ if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
+ return BinaryOperator::CreateFMulFMF(X, X, &I);
- if (C1 && isFiniteNonZeroFp(C1) && isFMulOrFDivWithConstant(Opnd0)) {
- Value *M1 = ConstantExpr::getFMul(C1, C);
- Value *M0 = isNormalFp(cast<Constant>(M1)) ?
- foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
- nullptr;
- if (M0 && M1) {
- if (Swap && FAddSub->getOpcode() == Instruction::FSub)
- std::swap(M0, M1);
-
- Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
- ? BinaryOperator::CreateFAdd(M0, M1)
- : BinaryOperator::CreateFSub(M0, M1);
- RI->copyFastMathFlags(&I);
- return RI;
- }
- }
- }
- }
- }
+ // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
- if (Op0 == Op1) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
- // sqrt(X) * sqrt(X) -> X
- if (AllowReassociate && II->getIntrinsicID() == Intrinsic::sqrt)
- return replaceInstUsesWith(I, II->getOperand(0));
-
- // fabs(X) * fabs(X) -> X * X
- if (II->getIntrinsicID() == Intrinsic::fabs) {
- Instruction *FMulVal = BinaryOperator::CreateFMul(II->getOperand(0),
- II->getOperand(0),
- I.getName());
- FMulVal->copyFastMathFlags(&I);
- return FMulVal;
+ if (I.hasAllowReassoc()) {
+ // Reassociate constant RHS with another constant to form constant
+ // expression.
+ if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
+ // (C1 / X) * C --> (C * C1) / X
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ if (CC1->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(CC1, X, &I);
}
- }
- }
-
- // Under unsafe algebra do:
- // X * log2(0.5*Y) = X*log2(Y) - X
- if (AllowReassociate) {
- Value *OpX = nullptr;
- Value *OpY = nullptr;
- IntrinsicInst *Log2;
- detectLog2OfHalf(Op0, OpY, Log2);
- if (OpY) {
- OpX = Op1;
- } else {
- detectLog2OfHalf(Op1, OpY, Log2);
- if (OpY) {
- OpX = Op0;
+ if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ // (X / C1) * C --> X * (C / C1)
+ Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
+ if (CDivC1->isNormalFP())
+ return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+
+ // If the constant was a denormal, try reassociating differently.
+ // (X / C1) * C --> X / (C1 / C)
+ Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
+ if (Op0->hasOneUse() && C1DivC->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
}
- }
- // if pattern detected emit alternate sequence
- if (OpX && OpY) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(Log2->getFastMathFlags());
- Log2->setArgOperand(0, OpY);
- Value *FMulVal = Builder.CreateFMul(OpX, Log2);
- Value *FSub = Builder.CreateFSub(FMulVal, OpX);
- FSub->takeName(&I);
- return replaceInstUsesWith(I, FSub);
- }
- }
- // Handle symmetric situation in a 2-iteration loop
- Value *Opnd0 = Op0;
- Value *Opnd1 = Op1;
- for (int i = 0; i < 2; i++) {
- bool IgnoreZeroSign = I.hasNoSignedZeros();
- if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
-
- Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
- Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
-
- // -X * -Y => X*Y
- if (N1) {
- Value *FMul = Builder.CreateFMul(N0, N1);
- FMul->takeName(&I);
- return replaceInstUsesWith(I, FMul);
+ // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
+ // canonicalized to 'fadd X, C'. Distributing the multiply may allow
+ // further folds and (X * C) + C2 is 'fma'.
+ if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
+ // (X + C1) * C --> (X * C) + (C * C1)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
}
-
- if (Opnd0->hasOneUse()) {
- // -X * Y => -(X*Y) (Promote negation as high as possible)
- Value *T = Builder.CreateFMul(N0, Opnd1);
- Value *Neg = Builder.CreateFNeg(T);
- Neg->takeName(&I);
- return replaceInstUsesWith(I, Neg);
+ if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
+ // (C1 - X) * C --> (C * C1) - (X * C)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
}
}
- // Handle specials cases for FMul with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
+ // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
+ // nnan disallows the possibility of returning a number if both operands are
+ // negative (in that case, we should return NaN).
+ if (I.hasNoNaNs() &&
+ match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
+ match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ Value *Sqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, { XY }, &I);
+ return replaceInstUsesWith(I, Sqrt);
+ }
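
Why nnan is required, shown with a standalone numeric counter-example (plain C++, illustration only): with two negative operands the original expression is NaN, but the rewritten one is not.

    #include <cassert>
    #include <cmath>

    int main() {
      double X = -4.0, Y = -9.0;
      assert(std::isnan(std::sqrt(X) * std::sqrt(Y))); // NaN * NaN
      assert(std::sqrt(X * Y) == 6.0); // sqrt(36): the fold changes the result
    }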
// (X*Y) * X => (X*X) * Y where Y != X
// The purpose is two-fold:
@@ -767,34 +514,40 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
// latency of the instruction Y is amortized by the expression of X*X,
// and therefore Y is in a "less critical" position compared to what it
// was before the transformation.
- if (AllowReassociate) {
- Value *Opnd0_0, *Opnd0_1;
- if (Opnd0->hasOneUse() &&
- match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) {
- Value *Y = nullptr;
- if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1)
- Y = Opnd0_1;
- else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1)
- Y = Opnd0_0;
-
- if (Y) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- Value *T = Builder.CreateFMul(Opnd1, Opnd1);
- Value *R = Builder.CreateFMul(T, Y);
- R->takeName(&I);
- return replaceInstUsesWith(I, R);
- }
- }
+ if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
+ Op1 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
+ Op0 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
}
+ }
- if (!isa<Constant>(Op1))
- std::swap(Opnd0, Opnd1);
- else
- break;
+ // log2(X * 0.5) * Y = log2(X) * Y - Y
+ if (I.isFast()) {
+ IntrinsicInst *Log2 = nullptr;
+ if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op0);
+ Y = Op1;
+ }
+ if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op1);
+ Y = Op0;
+ }
+ if (Log2) {
+ Log2->setArgOperand(0, X);
+ Log2->copyFastMathFlags(&I);
+ Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
+ return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
+ }
}
- return Changed ? &I : nullptr;
+ return nullptr;
}
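
The identity behind the log2 fold is log2(X * 0.5) == log2(X) - 1, hence log2(X * 0.5) * Y == log2(X) * Y - Y. A standalone numeric sanity check (plain C++, illustration only; a tolerance is used since libm results need not be bit-exact):

    #include <cassert>
    #include <cmath>

    int main() {
      const double Y = 3.0;
      for (double X : {1.0, 2.0, 8.0, 1024.0}) {
        double Lhs = std::log2(X * 0.5) * Y;
        double Rhs = std::log2(X) * Y - Y;
        assert(std::fabs(Lhs - Rhs) < 1e-9);
      }
    }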
/// Fold a divide or remainder with a select instruction divisor when one of the
@@ -835,9 +588,9 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
Type *CondTy = SelectCond->getType();
while (BBI != BBFront) {
--BBI;
- // If we found a call to a function, we can't assume it will return, so
+    // If we found an instruction that we can't assume will return, then
// information from below it cannot be propagated above it.
- if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
break;
// Replace uses of the select or its condition with the known values.
@@ -867,12 +620,44 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
return true;
}
+/// True if the multiply cannot be expressed in an int this size.
+static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+ bool IsSigned) {
+ bool Overflow;
+ Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
+ return Overflow;
+}
+
+/// True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+ bool IsSigned) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
+
+ // Bail if we will divide by zero.
+ if (C2.isNullValue())
+ return false;
+
+ // Bail if we would divide INT_MIN by -1.
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ return false;
+
+ APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (IsSigned)
+ APInt::sdivrem(C1, C2, Quotient, Remainder);
+ else
+ APInt::udivrem(C1, C2, Quotient, Remainder);
+
+ return Remainder.isMinValue();
+}
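
A plain-integer analogue of these helpers (standalone C++, illustration only; the real code uses APInt to handle arbitrary widths): an exact-multiple check is what licenses rewrites such as (X * C1) / C2 -> X * (C1 / C2).

    #include <cassert>
    #include <cstdint>

    static bool isMultipleOf(int64_t C1, int64_t C2, int64_t &Quotient) {
      if (C2 == 0)
        return false; // would divide by zero
      if (C1 == INT64_MIN && C2 == -1)
        return false; // INT64_MIN / -1 overflows
      Quotient = C1 / C2;
      return C1 % C2 == 0;
    }

    int main() {
      int64_t Q = 0;
      assert(isMultipleOf(12, 4, Q) && Q == 3); // (X * 12) / 4 -> X * 3
      assert(!isMultipleOf(10, 4, Q));          // not exact: no rewrite
    }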
+
/// This function implements the transforms common to both integer division
/// instructions (udiv and sdiv). It is called by the visitors to those integer
/// division instructions.
-/// @brief Common integer divide transforms
+/// Common integer divide transforms
Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ bool IsSigned = I.getOpcode() == Instruction::SDiv;
+ Type *Ty = I.getType();
// The RHS is known non-zero.
if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
@@ -885,94 +670,87 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
if (simplifyDivRemOfSelectWithZeroOp(I))
return &I;
- if (Instruction *LHS = dyn_cast<Instruction>(Op0)) {
- const APInt *C2;
- if (match(Op1, m_APInt(C2))) {
- Value *X;
- const APInt *C1;
- bool IsSigned = I.getOpcode() == Instruction::SDiv;
-
- // (X / C1) / C2 -> X / (C1*C2)
- if ((IsSigned && match(LHS, m_SDiv(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(LHS, m_UDiv(m_Value(X), m_APInt(C1))))) {
- APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- if (!MultiplyOverflows(*C1, *C2, Product, IsSigned))
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(I.getType(), Product));
- }
-
- if ((IsSigned && match(LHS, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(LHS, m_NUWMul(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ const APInt *C2;
+ if (match(Op1, m_APInt(C2))) {
+ Value *X;
+ const APInt *C1;
+
+ // (X / C1) / C2 -> X / (C1*C2)
+ if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
+ APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Product));
+ }
- // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
- if (IsMultiple(*C2, *C1, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
+ if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
- if (IsMultiple(*C1, *C2, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
- BO->setHasNoUnsignedWrap(
- !IsSigned &&
- cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(
- cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
- return BO;
- }
+ // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+ if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
+ auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ NewDiv->setIsExact(I.isExact());
+ return NewDiv;
}
- if ((IsSigned && match(LHS, m_NSWShl(m_Value(X), m_APInt(C1))) &&
- *C1 != C1->getBitWidth() - 1) ||
- (!IsSigned && match(LHS, m_NUWShl(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
- APInt C1Shifted = APInt::getOneBitSet(
- C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
-
- // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of C1.
- if (IsMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
+ // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+ if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
- // (X << C1) / C2 -> X * (C2 >> C1) if C1 is a multiple of C2.
- if (IsMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
- BinaryOperator *BO = BinaryOperator::Create(
- Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
- BO->setHasNoUnsignedWrap(
- !IsSigned &&
- cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(
- cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
- return BO;
- }
+ if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+ *C1 != C1->getBitWidth() - 1) ||
+ (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ APInt C1Shifted = APInt::getOneBitSet(
+ C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+ // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
+ if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+ auto *BO = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
}
- if (!C2->isNullValue()) // avoid X udiv 0
- if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I))
- return FoldedDiv;
+ // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
+ if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
}
+
+ if (!C2->isNullValue()) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
+ return FoldedDiv;
}
if (match(Op0, m_One())) {
- assert(!I.getType()->isIntOrIntVectorTy(1) && "i1 divide not removed?");
- if (I.getOpcode() == Instruction::SDiv) {
+ assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (IsSigned) {
// If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
// result is one, if Op1 is -1 then the result is minus one, otherwise
// it's zero.
Value *Inc = Builder.CreateAdd(Op1, Op0);
- Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(I.getType(), 3));
- return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0));
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
} else {
// If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
// result is one, otherwise it's zero.
- return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), I.getType());
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
}
}
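
The `1 / X` trick above is compact but non-obvious: adding 1 to the divisor maps the signed divisors 1 and -1 to the unsigned values 2 and 0, so a single unsigned compare against 3 picks out both special cases. A standalone illustration (not from the patch):

    #include <cassert>
    #include <cstdint>

    // Computes 1 sdiv X the way the fold above does: (X + 1) <u 3 holds only
    // for X in {-1, 0, 1}. X == 1 yields 1, X == -1 yields -1, and X == 0 is
    // undefined behaviour for the original division, so any result is fine.
    static int32_t oneSDiv(int32_t X) {
      uint32_t Inc = static_cast<uint32_t>(X) + 1;
      return Inc < 3 ? X : 0;
    }

    int main() {
      assert(oneSDiv(1) == 1);   // Inc == 2
      assert(oneSDiv(-1) == -1); // Inc == 0
      assert(oneSDiv(5) == 0);   // Inc == 6
      assert(oneSDiv(-7) == 0);  // Inc wraps to a large unsigned value
      return 0;
    }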
@@ -981,12 +759,28 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
return &I;
// (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
- Value *X = nullptr, *Z = nullptr;
- if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1
- bool isSigned = I.getOpcode() == Instruction::SDiv;
- if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
- (!isSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+ Value *X, *Z;
+ if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
+ if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+ (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
return BinaryOperator::Create(I.getOpcode(), X, Op1);
+
+ // (X << Y) / X -> 1 << Y
+ Value *Y;
+ if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
+ if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
+
+ // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
+ if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
+ bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
+ bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
+ if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
+ I.setOperand(0, ConstantInt::get(Ty, 1));
+ I.setOperand(1, Y);
+ return &I;
+ }
}
return nullptr;
@@ -1000,7 +794,7 @@ using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
const BinaryOperator &I,
InstCombiner &IC);
-/// \brief Used to maintain state for visitUDivOperand().
+/// Used to maintain state for visitUDivOperand().
struct UDivFoldAction {
/// Informs visitUDiv() how to fold this operand. This can be zero if this
/// action joins two actions together.
@@ -1028,23 +822,15 @@ struct UDivFoldAction {
// X udiv 2^C -> X >> C
static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
const BinaryOperator &I, InstCombiner &IC) {
- const APInt &C = cast<Constant>(Op1)->getUniqueInteger();
- BinaryOperator *LShr = BinaryOperator::CreateLShr(
- Op0, ConstantInt::get(Op0->getType(), C.logBase2()));
+ Constant *C1 = getLogBase2(Op0->getType(), cast<Constant>(Op1));
+ if (!C1)
+ llvm_unreachable("Failed to constant fold udiv -> logbase2");
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
if (I.isExact())
LShr->setIsExact();
return LShr;
}
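
For reference, the transform this helper performs is the classic strength reduction of an unsigned divide by a power of two into a logical right shift (illustrative snippet, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 1000;
      // X udiv 8 --> X >> 3, because 8 == 1 << 3 (log base 2 of 8 is 3).
      assert(X / 8 == X >> 3);
      return 0;
    }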
-// X udiv C, where C >= signbit
-static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
- const BinaryOperator &I, InstCombiner &IC) {
- Value *ICI = IC.Builder.CreateICmpULT(Op0, cast<ConstantInt>(Op1));
-
- return SelectInst::Create(ICI, Constant::getNullValue(I.getType()),
- ConstantInt::get(I.getType(), 1));
-}
-
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
@@ -1053,12 +839,14 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
ShiftLeft = Op1;
- const APInt *CI;
+ Constant *CI;
Value *N;
- if (!match(ShiftLeft, m_Shl(m_APInt(CI), m_Value(N))))
+ if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
llvm_unreachable("match should never fail here!");
- if (*CI != 1)
- N = IC.Builder.CreateAdd(N, ConstantInt::get(N->getType(), CI->logBase2()));
+ Constant *Log2Base = getLogBase2(N->getType(), CI);
+ if (!Log2Base)
+ llvm_unreachable("getLogBase2 should never fail here!");
+ N = IC.Builder.CreateAdd(N, Log2Base);
if (Op1 != ShiftLeft)
N = IC.Builder.CreateZExt(N, Op1->getType());
BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
@@ -1067,7 +855,7 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
return LShr;
}
-// \brief Recursively visits the possible right hand operands of a udiv
+// Recursively visits the possible right hand operands of a udiv
// instruction, seeing through select instructions, to determine if we can
// replace the udiv with something simpler. If we find that an operand is not
// able to simplify the udiv, we abort the entire transformation.
@@ -1081,13 +869,6 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
return Actions.size();
}
- if (ConstantInt *C = dyn_cast<ConstantInt>(Op1))
- // X udiv C, where C >= signbit
- if (C->getValue().isNegative()) {
- Actions.push_back(UDivFoldAction(foldUDivNegCst, C));
- return Actions.size();
- }
-
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
@@ -1148,40 +929,65 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
}
Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyUDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
return Common;
- // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
- {
- Value *X;
- const APInt *C1, *C2;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) &&
- match(Op1, m_APInt(C2))) {
- bool Overflow;
- APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
- if (!Overflow) {
- bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
- BinaryOperator *BO = BinaryOperator::CreateUDiv(
- X, ConstantInt::get(X->getType(), C2ShlC1));
- if (IsExact)
- BO->setIsExact();
- return BO;
- }
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
+ // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
+ bool Overflow;
+ APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+ if (!Overflow) {
+ bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+ BinaryOperator *BO = BinaryOperator::CreateUDiv(
+ X, ConstantInt::get(X->getType(), C2ShlC1));
+ if (IsExact)
+ BO->setIsExact();
+ return BO;
}
}
+ // Op0 / C where C is large (negative) --> zext (Op0 >= C)
+ // TODO: Could use isKnownNegative() to handle non-constant values.
+ Type *Ty = I.getType();
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+ // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
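Both zext folds above rest on the same observation: an unsigned divisor with its sign bit set is larger than half the range, so the quotient can only be 0 or 1. A standalone check of the first fold (illustrative, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // If C has its sign bit set, X udiv C is either 0 or 1, so the whole
      // division reduces to zext(X uge C).
      const uint32_t C = 0x80000005u; // "negative" when reinterpreted as signed
      const uint32_t Vals[] = {0u, 0x7FFFFFFFu, 0x80000005u, 0xFFFFFFFFu};
      for (uint32_t X : Vals)
        assert(X / C == static_cast<uint32_t>(X >= C));
      return 0;
    }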
if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
return NarrowDiv;
+ // If the udiv operands are non-overflowing multiplies with a common operand,
+ // then eliminate the common factor:
+ // (A * B) / (A * X) --> B / X (and commuted variants)
+ // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
+ // TODO: If -reassociation handled this generally, we could remove this.
+ Value *A, *B;
+ if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
+ if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
+ return BinaryOperator::CreateUDiv(B, X);
+ if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateUDiv(A, X);
+ }
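
The no-wrap requirement here is not just pedantry; once a multiply wraps, the common factor no longer cancels. A small demonstration (standalone sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // With non-wrapping multiplies the common factor cancels:
      // (A * B) / (A * X) == B / X.
      uint32_t A = 12, B = 34, X = 5;
      assert((A * B) / (A * X) == B / X);

      // Without nuw the identity breaks: in 8 bits, 16 * 17 wraps to 16,
      // so (A * B) / (A * X) becomes 16 / 16 == 1, not 17.
      uint8_t A8 = 16, B8 = 17, X8 = 1;
      uint8_t AB = static_cast<uint8_t>(A8 * B8);
      uint8_t AX = static_cast<uint8_t>(A8 * X8);
      assert(AB / AX != B8 / X8);
      return 0;
    }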
+
// (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
SmallVector<UDivFoldAction, 6> UDivActions;
if (visitUDivOperand(Op0, Op1, I, UDivActions))
@@ -1217,24 +1023,27 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
}
Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
return Common;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ // sdiv Op0, -1 --> -Op0
+ // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
+ if (match(Op1, m_AllOnes()) ||
+ (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
+ return BinaryOperator::CreateNeg(Op0);
+
const APInt *Op1C;
if (match(Op1, m_APInt(Op1C))) {
- // sdiv X, -1 == -X
- if (Op1C->isAllOnesValue())
- return BinaryOperator::CreateNeg(Op0);
-
// sdiv exact X, C --> ashr exact X, log2(C)
if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) {
Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2());
@@ -1298,166 +1107,148 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
return nullptr;
}
-/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special
-/// FP value and:
-/// 1) 1/C is exact, or
-/// 2) reciprocal is allowed.
-/// If the conversion was successful, the simplified expression "X * 1/C" is
-/// returned; otherwise, nullptr is returned.
-static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Constant *Divisor,
- bool AllowReciprocal) {
- if (!isa<ConstantFP>(Divisor)) // TODO: handle vectors.
+/// Remove negation and try to convert division into multiplication.
+static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(1), m_Constant(C)))
return nullptr;
- const APFloat &FpVal = cast<ConstantFP>(Divisor)->getValueAPF();
- APFloat Reciprocal(FpVal.getSemantics());
- bool Cvt = FpVal.getExactInverse(&Reciprocal);
+ // -X / C --> X / -C
+ Value *X;
+ if (match(I.getOperand(0), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
- if (!Cvt && AllowReciprocal && FpVal.isFiniteNonZero()) {
- Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
- (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
- Cvt = !Reciprocal.isDenormal();
- }
+ // If the constant divisor has an exact inverse, this is always safe. If not,
+ // then we can still create a reciprocal if fast-math-flags allow it and the
+ // constant is a regular number (not zero, infinite, or denormal).
+ if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
+ return nullptr;
- if (!Cvt)
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
+ if (!RecipC->isNormalFP())
return nullptr;
- ConstantFP *R;
- R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal);
- return BinaryOperator::CreateFMul(Dividend, R);
+ // X / C --> X * (1 / C)
+ return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
}
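
Concretely: dividing by 4.0 and multiplying by 0.25 are the same exact operation in IEEE arithmetic (both scale by a power of two), so that rewrite needs no fast-math flags, while X / 10.0 --> X * 0.1 perturbs the result and is gated on 'arcp'. A standalone illustration:

    #include <cassert>

    int main() {
      double X = 10.0;
      // 4.0 has an exact reciprocal, so X / 4.0 == X * 0.25 bit-for-bit.
      assert(X / 4.0 == X * 0.25);
      // 0.1 is not the exact reciprocal of 10.0, so X / 10.0 --> X * 0.1 is
      // only legal when the instruction carries the allow-reciprocal flag.
      return 0;
    }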
-Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+/// Remove negation and try to reassociate constant math.
+static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(0), m_Constant(C)))
+ return nullptr;
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
+ // C / -X --> -C / X
+ Value *X;
+ if (match(I.getOperand(1), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
+ return nullptr;
+
+ // Try to reassociate C / X expressions where X includes another constant.
+ Constant *C2, *NewC = nullptr;
+ if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
+ // C / (X * C2) --> (C / C2) / X
+ NewC = ConstantExpr::getFDiv(C, C2);
+ } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
+ // C / (X / C2) --> (C * C2) / X
+ NewC = ConstantExpr::getFMul(C, C2);
+ }
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ if (!NewC || !NewC->isNormalFP())
+ return nullptr;
+
+ return BinaryOperator::CreateFDivFMF(NewC, X, &I);
+}
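
Under reassoc+arcp the constants are combined so that the remaining division has the constant on top. With power-of-two constants the rewrite happens to be exact, which makes it easy to check (illustrative snippet, not from the patch):

    #include <cassert>

    int main() {
      double X = 3.0;
      // C / (X * C2) --> (C / C2) / X
      assert(12.0 / (X * 2.0) == (12.0 / 2.0) / X);
      // C / (X / C2) --> (C * C2) / X
      assert(12.0 / (X / 2.0) == (12.0 * 2.0) / X);
      return 0;
    }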
- if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+ if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
+ if (Instruction *R = foldFDivConstantDivisor(I))
+ return R;
+
+ if (Instruction *R = foldFDivConstantDividend(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- bool AllowReassociate = I.isFast();
- bool AllowReciprocal = I.hasAllowReciprocal();
-
- if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ if (isa<Constant>(Op1))
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- if (AllowReassociate) {
- Constant *C1 = nullptr;
- Constant *C2 = Op1C;
- Value *X;
- Instruction *Res = nullptr;
-
- if (match(Op0, m_FMul(m_Value(X), m_Constant(C1)))) {
- // (X*C1)/C2 => X * (C1/C2)
- //
- Constant *C = ConstantExpr::getFDiv(C1, C2);
- if (isNormalFp(C))
- Res = BinaryOperator::CreateFMul(X, C);
- } else if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
- // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
- Constant *C = ConstantExpr::getFMul(C1, C2);
- if (isNormalFp(C)) {
- Res = CvtFDivConstToReciprocal(X, C, AllowReciprocal);
- if (!Res)
- Res = BinaryOperator::CreateFDiv(X, C);
- }
- }
-
- if (Res) {
- Res->setFastMathFlags(I.getFastMathFlags());
- return Res;
- }
+ if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
+ Value *X, *Y;
+ if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
+ // (X / Y) / Z => X / (Y * Z)
+ Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFDivFMF(X, YZ, &I);
}
-
- // X / C => X * 1/C
- if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) {
- T->copyFastMathFlags(&I);
- return T;
+ if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
+ // Z / (X / Y) => (Y * Z) / X
+ Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
+ return BinaryOperator::CreateFDivFMF(YZ, X, &I);
}
-
- return nullptr;
}
- if (AllowReassociate && isa<Constant>(Op0)) {
- Constant *C1 = cast<Constant>(Op0), *C2;
- Constant *Fold = nullptr;
+ if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
+ // sin(X) / cos(X) -> tan(X)
+ // cos(X) / sin(X) -> 1/tan(X) (cotangent)
Value *X;
- bool CreateDiv = true;
-
- // C1 / (X*C2) => (C1/C2) / X
- if (match(Op1, m_FMul(m_Value(X), m_Constant(C2))))
- Fold = ConstantExpr::getFDiv(C1, C2);
- else if (match(Op1, m_FDiv(m_Value(X), m_Constant(C2)))) {
- // C1 / (X/C2) => (C1*C2) / X
- Fold = ConstantExpr::getFMul(C1, C2);
- } else if (match(Op1, m_FDiv(m_Constant(C2), m_Value(X)))) {
- // C1 / (C2/X) => (C1/C2) * X
- Fold = ConstantExpr::getFDiv(C1, C2);
- CreateDiv = false;
- }
-
- if (Fold && isNormalFp(Fold)) {
- Instruction *R = CreateDiv ? BinaryOperator::CreateFDiv(Fold, X)
- : BinaryOperator::CreateFMul(X, Fold);
- R->setFastMathFlags(I.getFastMathFlags());
- return R;
+ bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
+ bool IsCot =
+ !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
+
+ if ((IsTan || IsCot) && hasUnaryFloatFn(&TLI, I.getType(), LibFunc_tan,
+ LibFunc_tanf, LibFunc_tanl)) {
+ IRBuilder<> B(&I);
+ IRBuilder<>::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(I.getFastMathFlags());
+ AttributeList Attrs = CallSite(Op0).getCalledFunction()->getAttributes();
+ Value *Res = emitUnaryFloatFnCall(X, TLI.getName(LibFunc_tan), B, Attrs);
+ if (IsCot)
+ Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
+ return replaceInstUsesWith(I, Res);
}
- return nullptr;
}
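
The library-call fold relies only on the identity tan(x) = sin(x)/cos(x) (and its reciprocal for cotangent); since the values involved are approximations, the reassoc flag is what licenses it. A rough numerical check (not from the patch):

    #include <cassert>
    #include <cmath>

    int main() {
      // sin(x)/cos(x) --> tan(x); exact bit-equality is not guaranteed, which
      // is why the fold is only done under fast-math (reassoc).
      double x = 0.5;
      assert(std::fabs(std::sin(x) / std::cos(x) - std::tan(x)) < 1e-12);
      return 0;
    }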
- if (AllowReassociate) {
- Value *X, *Y;
- Value *NewInst = nullptr;
- Instruction *SimpR = nullptr;
-
- if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) {
- // (X/Y) / Z => X / (Y*Z)
- if (!isa<Constant>(Y) || !isa<Constant>(Op1)) {
- NewInst = Builder.CreateFMul(Y, Op1);
- if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= cast<Instruction>(Op0)->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- }
- SimpR = BinaryOperator::CreateFDiv(X, NewInst);
- }
- } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) {
- // Z / (X/Y) => Z*Y / X
- if (!isa<Constant>(Y) || !isa<Constant>(Op0)) {
- NewInst = Builder.CreateFMul(Op0, Y);
- if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= cast<Instruction>(Op1)->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- }
- SimpR = BinaryOperator::CreateFDiv(NewInst, X);
- }
- }
-
- if (NewInst) {
- if (Instruction *T = dyn_cast<Instruction>(NewInst))
- T->setDebugLoc(I.getDebugLoc());
- SimpR->setFastMathFlags(I.getFastMathFlags());
- return SimpR;
- }
+ // -X / -Y -> X / Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) {
+ I.setOperand(0, X);
+ I.setOperand(1, Y);
+ return &I;
}
- Value *LHS;
- Value *RHS;
-
- // -x / -y -> x / y
- if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) {
- I.setOperand(0, LHS);
- I.setOperand(1, RHS);
+ // X / (X * Y) --> 1.0 / Y
+ // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
+ // We can ignore the possibility that X is infinity because INF/INF is NaN.
+ if (I.hasNoNaNs() && I.hasAllowReassoc() &&
+ match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
+ I.setOperand(0, ConstantFP::get(I.getType(), 1.0));
+ I.setOperand(1, Y);
return &I;
}
@@ -1467,7 +1258,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
/// This function implements the transforms common to both integer remainder
/// instructions (urem and srem). It is called by the visitors to those integer
/// remainder instructions.
-/// @brief Common integer remainder transforms
+/// Common integer remainder transforms
Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -1509,13 +1300,12 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
}
Instruction *InstCombiner::visitURem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyURemInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *common = commonIRemTransforms(I))
return common;
@@ -1524,47 +1314,55 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
return NarrowRem;
  // X urem Y -> X & (Y - 1), where Y is a power of 2.
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- Constant *N1 = Constant::getAllOnesValue(I.getType());
+ Constant *N1 = Constant::getAllOnesValue(Ty);
Value *Add = Builder.CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
}
// 1 urem X -> zext(X != 1)
- if (match(Op0, m_One())) {
- Value *Cmp = Builder.CreateICmpNE(Op1, Op0);
- Value *Ext = Builder.CreateZExt(Cmp, I.getType());
- return replaceInstUsesWith(I, Ext);
- }
+ if (match(Op0, m_One()))
+ return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty);
// X urem C -> X < C ? X : X - C, where C >= signbit.
- const APInt *DivisorC;
- if (match(Op1, m_APInt(DivisorC)) && DivisorC->isNegative()) {
+ if (match(Op1, m_Negative())) {
Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
Value *Sub = Builder.CreateSub(Op0, Op1);
return SelectInst::Create(Cmp, Op0, Sub);
}
+ // If the divisor is a sext of a boolean, then the divisor must be max
+ // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
+ // max unsigned value. In that case, the remainder is 0:
+ // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
+ Value *X;
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
+ }
+
return nullptr;
}
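
Both urem folds above are branch-free identities; as plain C++ they look like this (standalone sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // X urem Y --> X & (Y - 1) when Y is a power of 2.
      uint32_t X = 1234;
      assert(X % 8 == (X & (8 - 1)));

      // X urem C --> (X <u C) ? X : X - C when C has its sign bit set:
      // the quotient is 0 or 1, so at most one subtraction is needed.
      const uint32_t C = 0x80000001u;
      const uint32_t Vals[] = {5u, 0x80000001u, 0xFFFFFFFFu};
      for (uint32_t V : Vals)
        assert(V % C == (V < C ? V : V - C));
      return 0;
    }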
Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySRemInst(Op0, Op1, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
// Handle the integer rem common cases
if (Instruction *Common = commonIRemTransforms(I))
return Common;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
{
const APInt *Y;
// X % -Y -> X % Y
- if (match(Op1, m_APInt(Y)) && Y->isNegative() && !Y->isMinSignedValue()) {
+ if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue()) {
Worklist.AddValue(I.getOperand(1));
I.setOperand(1, ConstantInt::get(I.getType(), -*Y));
return &I;
@@ -1622,14 +1420,13 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
}
Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (Value *V = SimplifyVectorOp(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
+ if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
+
return nullptr;
}
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 7ee018dbc49b..e54a1dd05a24 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -15,14 +15,18 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+static cl::opt<unsigned>
+MaxNumPhis("instcombine-max-num-phis", cl::init(512),
+           cl::desc("Maximum number of phis to handle in intptr/ptrint folding"));
+
/// The PHI arguments will be folded into a single operation with a PHI node
/// as input. The debug location of the single operation will be the merged
/// locations of the original PHI node arguments.
@@ -176,8 +180,12 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
"Not enough available ptr typed incoming values");
PHINode *MatchingPtrPHI = nullptr;
+ unsigned NumPhis = 0;
for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI());
- II != EI; II++) {
+ II != EI; II++, NumPhis++) {
+ // FIXME: consider handling this in AggressiveInstCombine
+ if (NumPhis > MaxNumPhis)
+ return nullptr;
PHINode *PtrPHI = dyn_cast<PHINode>(II);
if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
continue;
@@ -1008,10 +1016,9 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// extracted out of it. First, sort the users by their offset and size.
array_pod_sort(PHIUsers.begin(), PHIUsers.end());
- DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
- );
+ LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
+ << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
// PredValues - This is a temporary used when rewriting PHI nodes. It is
// hoisted out here to avoid construction/destruction thrashing.
@@ -1092,8 +1099,8 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
}
PredValues.clear();
- DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
- << *EltPHI << '\n');
+ LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
}
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 6f26f7f5cd19..4867808478a3 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -47,93 +47,51 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static SelectPatternFlavor
-getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) {
- switch (SPF) {
- default:
- llvm_unreachable("unhandled!");
-
- case SPF_SMIN:
- return SPF_SMAX;
- case SPF_UMIN:
- return SPF_UMAX;
- case SPF_SMAX:
- return SPF_SMIN;
- case SPF_UMAX:
- return SPF_UMIN;
- }
-}
-
-static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF,
- bool Ordered=false) {
- switch (SPF) {
- default:
- llvm_unreachable("unhandled!");
-
- case SPF_SMIN:
- return ICmpInst::ICMP_SLT;
- case SPF_UMIN:
- return ICmpInst::ICMP_ULT;
- case SPF_SMAX:
- return ICmpInst::ICMP_SGT;
- case SPF_UMAX:
- return ICmpInst::ICMP_UGT;
- case SPF_FMINNUM:
- return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT;
- case SPF_FMAXNUM:
- return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT;
- }
-}
-
-static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy &Builder,
- SelectPatternFlavor SPF, Value *A,
- Value *B) {
- CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF);
- assert(CmpInst::isIntPredicate(Pred));
+static Value *createMinMax(InstCombiner::BuilderTy &Builder,
+ SelectPatternFlavor SPF, Value *A, Value *B) {
+ CmpInst::Predicate Pred = getMinMaxPred(SPF);
+ assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
}
-/// If one of the constants is zero (we know they can't both be) and we have an
-/// icmp instruction with zero, and we have an 'and' with the non-constant value
-/// and a power of two we can turn the select into a shift on the result of the
-/// 'and'.
/// This folds:
-/// select (icmp eq (and X, C1)), C2, C3
-/// iff C1 is a power 2 and the difference between C2 and C3 is a power of 2.
+/// select (icmp eq (and X, C1)), TC, FC
+/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
/// To something like:
-/// (shr (and (X, C1)), (log2(C1) - log2(C2-C3))) + C3
+/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
/// Or:
-/// (shl (and (X, C1)), (log2(C2-C3) - log2(C1))) + C3
-/// With some variations depending if C3 is larger than C2, or the shift
+/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
+/// With some variations depending on whether FC is larger than TC, or the shift
/// isn't needed, or the bit widths don't match.
-static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
- APInt TrueVal, APInt FalseVal,
+static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
InstCombiner::BuilderTy &Builder) {
- assert(SelType->isIntOrIntVectorTy() && "Not an integer select?");
+ const APInt *SelTC, *SelFC;
+ if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
+ !match(Sel.getFalseValue(), m_APInt(SelFC)))
+ return nullptr;
// If this is a vector select, we need a vector compare.
- if (SelType->isVectorTy() != IC->getType()->isVectorTy())
+ Type *SelType = Sel.getType();
+ if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
return nullptr;
Value *V;
APInt AndMask;
bool CreateAnd = false;
- ICmpInst::Predicate Pred = IC->getPredicate();
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
if (ICmpInst::isEquality(Pred)) {
- if (!match(IC->getOperand(1), m_Zero()))
+ if (!match(Cmp->getOperand(1), m_Zero()))
return nullptr;
- V = IC->getOperand(0);
-
+ V = Cmp->getOperand(0);
const APInt *AndRHS;
if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
return nullptr;
AndMask = *AndRHS;
- } else if (decomposeBitTestICmp(IC->getOperand(0), IC->getOperand(1),
+ } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
Pred, V, AndMask)) {
assert(ICmpInst::isEquality(Pred) && "Not equality test?");
-
if (!AndMask.isPowerOf2())
return nullptr;
@@ -142,39 +100,58 @@ static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
return nullptr;
}
- // If both select arms are non-zero see if we have a select of the form
- // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
- // for 'x ? 2^n : 0' and fix the thing up at the end.
- APInt Offset(TrueVal.getBitWidth(), 0);
- if (!TrueVal.isNullValue() && !FalseVal.isNullValue()) {
- if ((TrueVal - FalseVal).isPowerOf2())
- Offset = FalseVal;
- else if ((FalseVal - TrueVal).isPowerOf2())
- Offset = TrueVal;
- else
+ // In general, when both constants are non-zero, we would need an offset to
+ // replace the select. This would require more instructions than we started
+  // with. But there's one special case that we handle here because it can
+  // simplify/reduce the instructions.
+ APInt TC = *SelTC;
+ APInt FC = *SelFC;
+ if (!TC.isNullValue() && !FC.isNullValue()) {
+ // If the select constants differ by exactly one bit and that's the same
+ // bit that is masked and checked by the select condition, the select can
+ // be replaced by bitwise logic to set/clear one bit of the constant result.
+ if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
return nullptr;
-
- // Adjust TrueVal and FalseVal to the offset.
- TrueVal -= Offset;
- FalseVal -= Offset;
+ if (CreateAnd) {
+ // If we have to create an 'and', then we must kill the cmp to not
+ // increase the instruction count.
+ if (!Cmp->hasOneUse())
+ return nullptr;
+ V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
+ }
+ bool ExtraBitInTC = TC.ugt(FC);
+ if (Pred == ICmpInst::ICMP_EQ) {
+ // If the masked bit in V is clear, clear or set the bit in the result:
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
+ Constant *C = ConstantInt::get(SelType, TC);
+ return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
+ }
+ if (Pred == ICmpInst::ICMP_NE) {
+ // If the masked bit in V is set, set or clear the bit in the result:
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
+ Constant *C = ConstantInt::get(SelType, FC);
+ return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
+ }
+ llvm_unreachable("Only expecting equality predicates");
}
- // Make sure one of the select arms is a power of 2.
- if (!TrueVal.isPowerOf2() && !FalseVal.isPowerOf2())
+ // Make sure one of the select arms is a power-of-2.
+ if (!TC.isPowerOf2() && !FC.isPowerOf2())
return nullptr;
// Determine which shift is needed to transform result of the 'and' into the
// desired result.
- const APInt &ValC = !TrueVal.isNullValue() ? TrueVal : FalseVal;
+ const APInt &ValC = !TC.isNullValue() ? TC : FC;
unsigned ValZeros = ValC.logBase2();
unsigned AndZeros = AndMask.logBase2();
- if (CreateAnd) {
- // Insert the AND instruction on the input to the truncate.
+ // Insert the 'and' instruction on the input to the truncate.
+ if (CreateAnd)
V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
- }
- // If types don't match we can still convert the select by introducing a zext
+ // If types don't match, we can still convert the select by introducing a zext
// or a trunc of the 'and'.
if (ValZeros > AndZeros) {
V = Builder.CreateZExtOrTrunc(V, SelType);
@@ -182,19 +159,17 @@ static Value *foldSelectICmpAnd(Type *SelType, const ICmpInst *IC,
} else if (ValZeros < AndZeros) {
V = Builder.CreateLShr(V, AndZeros - ValZeros);
V = Builder.CreateZExtOrTrunc(V, SelType);
- } else
+ } else {
V = Builder.CreateZExtOrTrunc(V, SelType);
+ }
// Okay, now we know that everything is set up, we just don't know whether we
// have a icmp_ne or icmp_eq and whether the true or false val is the zero.
- bool ShouldNotVal = !TrueVal.isNullValue();
+ bool ShouldNotVal = !TC.isNullValue();
ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
if (ShouldNotVal)
V = Builder.CreateXor(V, ValC);
- // Apply an offset if needed.
- if (!Offset.isNullValue())
- V = Builder.CreateAdd(V, ConstantInt::get(V->getType(), Offset));
return V;
}
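
The non-zero/non-zero special case above deserves a concrete example: when the two select constants differ in exactly the tested bit, the whole select collapses to one bitwise op on the masked value (standalone check, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // (V & 4) == 0 ? 12 : 8 --> (V & 4) ^ 12, because 12 ^ 8 == 4 (the
      // tested mask) and 12 > 8, so TC carries the extra bit (the xor form).
      const uint32_t Vals[] = {0u, 4u, 7u, 123u};
      for (uint32_t V : Vals) {
        uint32_t Sel = ((V & 4) == 0) ? 12u : 8u;
        assert(Sel == ((V & 4) ^ 12u));
      }
      return 0;
    }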
@@ -300,12 +275,13 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
TI->getType());
}
- // Only handle binary operators with one-use here. As with the cast case
- // above, it may be possible to relax the one-use constraint, but that needs
- // be examined carefully since it may not reduce the total number of
- // instructions.
- BinaryOperator *BO = dyn_cast<BinaryOperator>(TI);
- if (!BO || !TI->hasOneUse() || !FI->hasOneUse())
+ // Only handle binary operators (including two-operand getelementptr) with
+ // one-use here. As with the cast case above, it may be possible to relax the
+  // one-use constraint, but that needs to be examined carefully since it may not
+ // reduce the total number of instructions.
+ if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
+ (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
+ !TI->hasOneUse() || !FI->hasOneUse())
return nullptr;
// Figure out if the operations have any operands in common.
@@ -342,7 +318,18 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
SI.getName() + ".v", &SI);
Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
- return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
+ return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ }
+ if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
+ auto *FGEP = cast<GetElementPtrInst>(FI);
+ Type *ElementType = TGEP->getResultElementType();
+ return TGEP->isInBounds() && FGEP->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
+ : GetElementPtrInst::Create(ElementType, Op0, {Op1});
+ }
+ llvm_unreachable("Expected BinaryOperator or GEP");
+ return nullptr;
}
static bool isSelect01(const APInt &C1I, const APInt &C2I) {
@@ -424,6 +411,47 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
}
/// We want to turn:
+/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
+/// into:
+/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
+/// Note:
+/// Z may be 0 if lshr is missing.
+/// Worst-case scenario is that we will replace 5 instructions with 5 different
+/// instructions, but we get rid of the select.
+static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
+ Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
+ Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
+ match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
+ return nullptr;
+
+ // The TrueVal has general form of: and %B, 1
+ Value *B;
+ if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
+ return nullptr;
+
+ // Where %B may be optionally shifted: lshr %X, %Z.
+ Value *X, *Z;
+ const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
+ if (!HasShift)
+ X = B;
+
+ Value *Y;
+ if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
+ return nullptr;
+
+ // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
+ // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
+ Constant *One = ConstantInt::get(SelType, 1);
+ Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
+ Value *FullMask = Builder.CreateOr(Y, MaskB);
+ Value *MaskedX = Builder.CreateAnd(X, FullMask);
+ Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
+ return new ZExtInst(ICmpNeZero, SelType);
+}
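
The trick is that the 'and with Y' test and the 'bit Z of X' fallback fold into a single mask: when X & Y is zero, only the 1 << Z bit of the combined mask can fire. A standalone check of the identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 == zext((X & (Y | (1 << Z))) != 0)
      const uint32_t Y = 2, Z = 0;
      const uint32_t Vals[] = {0u, 1u, 2u, 3u, 6u, 0xF0u};
      for (uint32_t X : Vals) {
        uint32_t Sel = ((X & Y) == 0) ? ((X >> Z) & 1) : 1;
        uint32_t Folded = (X & (Y | (1u << Z))) != 0 ? 1 : 0;
        assert(Sel == Folded);
      }
      return 0;
    }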
+
+/// We want to turn:
/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
/// into:
/// (or (shl (and X, C1), C3), Y)
@@ -526,6 +554,59 @@ static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
return Builder.CreateOr(V, Y);
}
+/// Transform patterns such as: (a > b) ? a - b : 0
+/// into: ((a > b) ? a : b) - b
+/// This produces a canonical max pattern that is more easily recognized by the
+/// backend and converted into saturated subtraction instructions if those
+/// exist.
+/// There are 8 commuted/swapped variants of this pattern.
+/// TODO: Also support a - UMIN(a,b) patterns.
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
+ const Value *TrueVal,
+ const Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (!ICmpInst::isUnsigned(Pred))
+ return nullptr;
+
+ // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
+ if (match(TrueVal, m_Zero())) {
+ Pred = ICmpInst::getInversePredicate(Pred);
+ std::swap(TrueVal, FalseVal);
+ }
+ if (!match(FalseVal, m_Zero()))
+ return nullptr;
+
+ Value *A = ICI->getOperand(0);
+ Value *B = ICI->getOperand(1);
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
+ // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
+ std::swap(A, B);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
+ "Unexpected isUnsigned predicate!");
+
+ // Account for swapped form of subtraction: ((a > b) ? b - a : 0).
+ bool IsNegative = false;
+ if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))))
+ IsNegative = true;
+ else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))))
+ return nullptr;
+
+ // If sub is used anywhere else, we wouldn't be able to eliminate it
+ // afterwards.
+ if (!TrueVal->hasOneUse())
+ return nullptr;
+
+ // All checks passed, convert to canonical unsigned saturated subtraction
+ // form: sub(max()).
+  // (a > b) ? a - b : 0 -> ((a > b) ? a : b) - b
+ Value *Max = Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
+ return IsNegative ? Builder.CreateSub(B, Max) : Builder.CreateSub(Max, B);
+}
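
The rewrite trades the zero arm for a max, which is the shape backends pattern-match into a saturating subtract. The equivalence is simple to verify over a few values (illustrative snippet, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // (a > b) ? a - b : 0 --> max(a, b) - b
      const uint32_t Vals[] = {0u, 3u, 10u};
      for (uint32_t A : Vals)
        for (uint32_t B : Vals) {
          uint32_t Sel = (A > B) ? A - B : 0;
          uint32_t Max = (A > B) ? A : B;
          assert(Sel == Max - B);
        }
      return 0;
    }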
+
/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
///
@@ -687,23 +768,18 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
// Canonicalize the compare predicate based on whether we have min or max.
Value *LHS, *RHS;
- ICmpInst::Predicate NewPred;
SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
- switch (SPR.Flavor) {
- case SPF_SMIN: NewPred = ICmpInst::ICMP_SLT; break;
- case SPF_UMIN: NewPred = ICmpInst::ICMP_ULT; break;
- case SPF_SMAX: NewPred = ICmpInst::ICMP_SGT; break;
- case SPF_UMAX: NewPred = ICmpInst::ICMP_UGT; break;
- default: return nullptr;
- }
+ if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
+ return nullptr;
// Is this already canonical?
+ ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
- Cmp.getPredicate() == NewPred)
+ Cmp.getPredicate() == CanonicalPred)
return nullptr;
// Create the canonical compare and plug it into the select.
- Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
+ Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS));
// If the select operands did not change, we're done.
if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
@@ -718,6 +794,89 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
return &Sel;
}
+/// There are many select variants for each of ABS/NABS.
+/// In matchSelectPattern(), there are different compare constants, compare
+/// predicates/operands and select operands.
+/// In isKnownNegation(), there are different formats of negated operands.
+/// Canonicalize all these variants to 1 pattern.
+/// This makes CSE more likely.
+static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ // Choose a sign-bit check for the compare (likely simpler for codegen).
+ // ABS: (X <s 0) ? -X : X
+ // NABS: (X <s 0) ? X : -X
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
+ if (SPF != SelectPatternFlavor::SPF_ABS &&
+ SPF != SelectPatternFlavor::SPF_NABS)
+ return nullptr;
+
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ assert(isKnownNegation(TVal, FVal) &&
+ "Unexpected result from matchSelectPattern");
+
+ // The compare may use the negated abs()/nabs() operand, or it may use
+ // negation in non-canonical form such as: sub A, B.
+ bool CmpUsesNegatedOp = match(Cmp.getOperand(0), m_Neg(m_Specific(TVal))) ||
+ match(Cmp.getOperand(0), m_Neg(m_Specific(FVal)));
+
+ bool CmpCanonicalized = !CmpUsesNegatedOp &&
+ match(Cmp.getOperand(1), m_ZeroInt()) &&
+ Cmp.getPredicate() == ICmpInst::ICMP_SLT;
+ bool RHSCanonicalized = match(RHS, m_Neg(m_Specific(LHS)));
+
+ // Is this already canonical?
+ if (CmpCanonicalized && RHSCanonicalized)
+ return nullptr;
+
+  // If RHS is used by instructions other than the compare and select, don't
+  // canonicalize it, so as not to increase the instruction count.
+ if (!(RHS->hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp)))
+ return nullptr;
+
+ // Create the canonical compare: icmp slt LHS 0.
+ if (!CmpCanonicalized) {
+ Cmp.setPredicate(ICmpInst::ICMP_SLT);
+ Cmp.setOperand(1, ConstantInt::getNullValue(Cmp.getOperand(0)->getType()));
+ if (CmpUsesNegatedOp)
+ Cmp.setOperand(0, LHS);
+ }
+
+ // Create the canonical RHS: RHS = sub (0, LHS).
+ if (!RHSCanonicalized) {
+ assert(RHS->hasOneUse() && "RHS use number is not right");
+ RHS = Builder.CreateNeg(LHS);
+ if (TVal == LHS) {
+ Sel.setFalseValue(RHS);
+ FVal = RHS;
+ } else {
+ Sel.setTrueValue(RHS);
+ TVal = RHS;
+ }
+ }
+
+ // If the select operands do not change, we're done.
+ if (SPF == SelectPatternFlavor::SPF_NABS) {
+ if (TVal == LHS)
+ return &Sel;
+ assert(FVal == LHS && "Unexpected results from matchSelectPattern");
+ } else {
+ if (FVal == LHS)
+ return &Sel;
+ assert(TVal == LHS && "Unexpected results from matchSelectPattern");
+ }
+
+ // We are swapping the select operands, so swap the metadata too.
+ Sel.setTrueValue(FVal);
+ Sel.setFalseValue(TVal);
+ Sel.swapProfMetadata();
+ return &Sel;
+}
+
/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
@@ -727,59 +886,18 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder))
return NewSel;
+ if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder))
+ return NewAbs;
+
bool Changed = adjustMinMax(SI, *ICI);
+ if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ // NOTE: if we wanted to, this is where to detect integer MIN/MAX
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *CmpLHS = ICI->getOperand(0);
Value *CmpRHS = ICI->getOperand(1);
-
- // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
- // and (X <s 0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
- // FIXME: Type and constness constraints could be lifted, but we have to
- // watch code size carefully. We should consider xor instead of
- // sub/add when we decide to do that.
- // TODO: Merge this with foldSelectICmpAnd somehow.
- if (CmpLHS->getType()->isIntOrIntVectorTy() &&
- CmpLHS->getType() == TrueVal->getType()) {
- const APInt *C1, *C2;
- if (match(TrueVal, m_APInt(C1)) && match(FalseVal, m_APInt(C2))) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *X;
- APInt Mask;
- if (decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask, false)) {
- if (Mask.isSignMask()) {
- assert(X == CmpLHS && "Expected to use the compare input directly");
- assert(ICmpInst::isEquality(Pred) && "Expected equality predicate");
-
- if (Pred == ICmpInst::ICMP_NE)
- std::swap(C1, C2);
-
- // This shift results in either -1 or 0.
- Value *AShr = Builder.CreateAShr(X, Mask.getBitWidth() - 1);
-
- // Check if we can express the operation with a single or.
- if (C2->isAllOnesValue())
- return replaceInstUsesWith(SI, Builder.CreateOr(AShr, *C1));
-
- Value *And = Builder.CreateAnd(AShr, *C2 - *C1);
- return replaceInstUsesWith(SI, Builder.CreateAdd(And,
- ConstantInt::get(And->getType(), *C1)));
- }
- }
- }
- }
-
- {
- const APInt *TrueValC, *FalseValC;
- if (match(TrueVal, m_APInt(TrueValC)) &&
- match(FalseVal, m_APInt(FalseValC)))
- if (Value *V = foldSelectICmpAnd(SI.getType(), ICI, *TrueValC,
- *FalseValC, Builder))
- return replaceInstUsesWith(SI, V);
- }
-
- // NOTE: if we wanted to, this is where to detect integer MIN/MAX
-
if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
// Transform (X == C) ? X : Y -> (X == C) ? C : Y
@@ -842,16 +960,22 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
}
}
+ if (Instruction *V =
+ foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
+ return V;
+
if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
return replaceInstUsesWith(SI, V);
if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
return replaceInstUsesWith(SI, V);
+ if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
return Changed ? &SI : nullptr;
}
-
/// SI is a select whose condition is a PHI node (but the two may be in
/// different blocks). See if the true/false values (V) are live in all of the
/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
@@ -900,7 +1024,7 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
if (C == A || C == B) {
// MAX(MAX(A, B), B) -> MAX(A, B)
// MIN(MIN(a, b), a) -> MIN(a, b)
- if (SPF1 == SPF2)
+ if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
return replaceInstUsesWith(Outer, Inner);
// MAX(MIN(a, b), a) -> a
@@ -992,10 +1116,10 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
if (!NotC)
NotC = Builder.CreateNot(C);
- Value *NewInner = generateMinMaxSelectPattern(
- Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB);
- Value *NewOuter = Builder.CreateNot(generateMinMaxSelectPattern(
- Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC));
+ Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
+ NotB);
+ Value *NewOuter = Builder.CreateNot(
+ createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
return replaceInstUsesWith(Outer, NewOuter);
}
@@ -1075,6 +1199,11 @@ static Instruction *foldAddSubSelect(SelectInst &SI,
}
Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
Instruction *ExtInst;
if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
!match(Sel.getFalseValue(), m_Instruction(ExtInst)))
@@ -1084,20 +1213,18 @@ Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
return nullptr;
- // TODO: Handle larger types? That requires adjusting FoldOpIntoSelect too.
+ // If we are extending from a boolean type or if we can create a select that
+ // has the same size operands as its condition, try to narrow the select.
Value *X = ExtInst->getOperand(0);
Type *SmallType = X->getType();
- if (!SmallType->isIntOrIntVectorTy(1))
- return nullptr;
-
- Constant *C;
- if (!match(Sel.getTrueValue(), m_Constant(C)) &&
- !match(Sel.getFalseValue(), m_Constant(C)))
+ Value *Cond = Sel.getCondition();
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!SmallType->isIntOrIntVectorTy(1) &&
+ (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
return nullptr;
// If the constant is the same after truncation to the smaller type and
// extension to the original type, we can narrow the select.
- Value *Cond = Sel.getCondition();
Type *SelType = Sel.getType();
Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
@@ -1289,6 +1416,63 @@ static Instruction *foldSelectCmpXchg(SelectInst &SI) {
return nullptr;
}
+/// Reduce a sequence of min/max with a common operand.
+static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
+ Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
+ // TODO: Allow FP min/max with nnan/nsz.
+ if (!LHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+ Value *A, *B, *C, *D;
+ SelectPatternResult L = matchSelectPattern(LHS, A, B);
+ SelectPatternResult R = matchSelectPattern(RHS, C, D);
+ if (SPF != L.Flavor || L.Flavor != R.Flavor)
+ return nullptr;
+
+ // Look for a common operand. The use checks are different than usual because
+ // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
+ // the select.
+ Value *MinMaxOp = nullptr;
+ Value *ThirdOp = nullptr;
+ if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
+ // If the LHS is only used in this chain and the RHS is used outside of it,
+ // reuse the RHS min/max because that will eliminate the LHS.
+ if (D == A || C == A) {
+ // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+ // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+ MinMaxOp = RHS;
+ ThirdOp = B;
+ } else if (D == B || C == B) {
+ // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+ // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+ MinMaxOp = RHS;
+ ThirdOp = A;
+ }
+ } else if (!RHS->hasNUsesOrMore(3)) {
+ // Reuse the LHS. This will eliminate the RHS.
+ if (D == A || D == B) {
+ // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+ // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+ MinMaxOp = LHS;
+ ThirdOp = C;
+ } else if (C == A || C == B) {
+ // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+ MinMaxOp = LHS;
+ ThirdOp = D;
+ }
+ }
+ if (!MinMaxOp || !ThirdOp)
+ return nullptr;
+
+ CmpInst::Predicate P = getMinMaxPred(SPF);
+ Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
+ return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
+}
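
In scalar terms the factorization is just associativity/commutativity of min and max; the value of the code above lies in choosing which existing node to reuse based on use counts. For example (standalone sketch):

    #include <algorithm>
    #include <cassert>

    int main() {
      // min(min(a, b), min(c, a)) --> min(min(c, a), b): the common operand
      // 'a' lets one inner min be reused and the other be dropped.
      int a = 4, b = 9, c = 1;
      assert(std::min(std::min(a, b), std::min(c, a)) ==
             std::min(std::min(c, a), b));
      return 0;
    }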
+
Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
@@ -1489,7 +1673,37 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// NOTE: if we wanted to, this is where to detect MIN/MAX
}
- // NOTE: if we wanted to, this is where to detect ABS
+
+ // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
+ // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
+ // also require nnan because we do not want to unintentionally change the
+ // sign of a NaN value.
+ Value *X = FCI->getOperand(0);
+ FCmpInst::Predicate Pred = FCI->getPredicate();
+ if (match(FCI->getOperand(1), m_AnyZeroFP()) && FCI->hasNoNaNs()) {
+ // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
+ // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
+ if ((X == FalseVal && Pred == FCmpInst::FCMP_OLE &&
+ match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(X)))) ||
+ (X == TrueVal && Pred == FCmpInst::FCMP_OGT &&
+ match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(X))))) {
+ Value *Fabs = Builder.CreateIntrinsic(Intrinsic::fabs, { X }, FCI);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nsz:
+ // (X < +/-0.0) ? -X : X --> fabs(X)
+ // (X <= +/-0.0) ? -X : X --> fabs(X)
+ // (X > +/-0.0) ? X : -X --> fabs(X)
+ // (X >= +/-0.0) ? X : -X --> fabs(X)
+ if (FCI->hasNoSignedZeros() &&
+ ((X == FalseVal && match(TrueVal, m_FNeg(m_Specific(X))) &&
+ (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE)) ||
+ (X == TrueVal && match(FalseVal, m_FNeg(m_Specific(X))) &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE)))) {
+ Value *Fabs = Builder.CreateIntrinsic(Intrinsic::fabs, { X }, FCI);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ }
}
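
Note how the first fabs form dodges the -0.0 problem without nsz: it uses fsub from +0.0 rather than a true negation, and 0.0 - (-0.0) is +0.0. A standalone check (not from the patch):

    #include <cassert>
    #include <cmath>

    int main() {
      // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X), including for X == -0.0.
      const double Vals[] = {-3.5, -0.0, 0.0, 2.0};
      for (double X : Vals) {
        double Sel = (X <= 0.0) ? (0.0 - X) : X;
        assert(Sel == std::fabs(X));
      }
      return 0;
    }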
// See if we are selecting two values based on a comparison of the two values.
@@ -1532,7 +1746,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
(LHS->getType()->isFPOrFPVectorTy() &&
((CmpLHS != LHS && CmpLHS != RHS) ||
(CmpRHS != LHS && CmpRHS != RHS)))) {
- CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered);
+ CmpInst::Predicate Pred = getMinMaxPred(SPF, SPR.Ordered);
Value *Cmp;
if (CmpInst::isIntPredicate(Pred)) {
@@ -1551,6 +1765,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
return replaceInstUsesWith(SI, NewCast);
}
+
+ // MAX(~a, ~b) -> ~MIN(a, b)
+ // MIN(~a, ~b) -> ~MAX(a, b)
+ Value *A, *B;
+ if (match(LHS, m_Not(m_Value(A))) && match(RHS, m_Not(m_Value(B))) &&
+ (LHS->getNumUses() <= 2 || RHS->getNumUses() <= 2)) {
+ CmpInst::Predicate InvertedPred = getInverseMinMaxPred(SPF);
+ Value *InvertedCmp = Builder.CreateICmp(InvertedPred, A, B);
+ Value *NewSel = Builder.CreateSelect(InvertedCmp, A, B);
+ return BinaryOperator::CreateNot(NewSel);
+ }
+
+ if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
+ return I;
}
if (SPF) {
@@ -1570,28 +1798,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return R;
}
- // MAX(~a, ~b) -> ~MIN(a, b)
- if ((SPF == SPF_SMAX || SPF == SPF_UMAX) &&
- IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
- IsFreeToInvert(RHS, RHS->hasNUses(2))) {
- // For this transform to be profitable, we need to eliminate at least two
- // 'not' instructions if we're going to add one 'not' instruction.
- int NumberOfNots =
- (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) +
- (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) +
- (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
-
- if (NumberOfNots >= 2) {
- Value *NewLHS = Builder.CreateNot(LHS);
- Value *NewRHS = Builder.CreateNot(RHS);
- Value *NewCmp = SPF == SPF_SMAX ? Builder.CreateICmpSLT(NewLHS, NewRHS)
- : Builder.CreateICmpULT(NewLHS, NewRHS);
- Value *NewSI =
- Builder.CreateNot(Builder.CreateSelect(NewCmp, NewLHS, NewRHS));
- return replaceInstUsesWith(SI, NewSI);
- }
- }
-
// TODO.
// ABS(-X) -> ABS(X)
}
@@ -1643,11 +1849,25 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
}
+ auto canMergeSelectThroughBinop = [](BinaryOperator *BO) {
+ // The select might be preventing a division by 0.
+ switch (BO->getOpcode()) {
+ default:
+ return true;
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ return false;
+ }
+ };
+
// Try to simplify a binop sandwiched between 2 selects with the same
// condition.
// select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
BinaryOperator *TrueBO;
- if (match(TrueVal, m_OneUse(m_BinOp(TrueBO)))) {
+ if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
+ canMergeSelectThroughBinop(TrueBO)) {
if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
if (TrueBOSI->getCondition() == CondVal) {
TrueBO->setOperand(0, TrueBOSI->getTrueValue());
@@ -1666,7 +1886,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
BinaryOperator *FalseBO;
- if (match(FalseVal, m_OneUse(m_BinOp(FalseBO)))) {
+ if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
+ canMergeSelectThroughBinop(FalseBO)) {
if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
if (FalseBOSI->getCondition() == CondVal) {
FalseBO->setOperand(0, FalseBOSI->getFalseValue());
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 44bbb84686ab..34f8037e519f 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -87,8 +87,7 @@ static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
// Equal shift amounts in opposite directions become bitwise 'and':
// lshr (shl X, C), C --> and X, C'
// shl (lshr X, C), C --> and X, C'
- unsigned InnerShAmt = InnerShiftConst->getZExtValue();
- if (InnerShAmt == OuterShAmt)
+ if (*InnerShiftConst == OuterShAmt)
return true;
// If the 2nd shift is bigger than the 1st, we can fold:
@@ -98,7 +97,8 @@ static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
// Also, check that the inner shift is valid (less than the type width) or
// we'll crash trying to produce the bit mask for the 'and'.
unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
- if (InnerShAmt > OuterShAmt && InnerShAmt < TypeWidth) {
+ if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
unsigned MaskShift =
IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
@@ -135,7 +135,7 @@ static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
ConstantInt *CI = nullptr;
if ((IsLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
(!IsLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
- if (CI->getZExtValue() == NumBits) {
+ if (CI->getValue() == NumBits) {
// TODO: Check that the input bits are already zero with MaskedValueIsZero
#if 0
// If this is a truncate of a logical shr, we can truncate it to a smaller
@@ -356,8 +356,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
if (I.getOpcode() != Instruction::AShr &&
canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
- DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
- " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
+ LLVM_DEBUG(
+ dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+ " to eliminate shift:\n IN: "
+ << *Op0 << "\n SH: " << I << "\n");
return replaceInstUsesWith(
I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
@@ -370,7 +372,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
assert(!Op1C->uge(TypeBits) &&
"Shift over the type width should have been removed already");
- if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
+ if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
return FoldedShift;
// Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
@@ -586,23 +588,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
}
Instruction *InstCombiner::visitShl(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyShlInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *V = commonShiftTransforms(I))
return V;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
const APInt *ShAmtAPInt;
if (match(Op1, m_APInt(ShAmtAPInt))) {
unsigned ShAmt = ShAmtAPInt->getZExtValue();
- unsigned BitWidth = I.getType()->getScalarSizeInBits();
- Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
// shl (zext X), ShAmt --> zext (shl X, ShAmt)
// This is only valid if X would have zeros shifted out.
@@ -620,11 +622,8 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- // Be careful about hiding shl instructions behind bit masks. They are used
- // to represent multiplies by a constant, and it is important that simple
- // arithmetic expressions are still recognizable by scalar evolution.
- // The inexact versions are deferred to DAGCombine, so we don't hide shl
- // behind a bit mask.
+ // FIXME: we do not yet transform non-exact shr's. The backend (DAGCombine)
+ // needs a few fixes for the rotate pattern recognition first.
const APInt *ShOp1;
if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) {
unsigned ShrAmt = ShOp1->getZExtValue();
@@ -668,6 +667,15 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
}
}
+ // Transform (x >> y) << y to x & (-1 << y)
+ // Valid for any type of right-shift.
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateShl(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
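
Both this fold and its lshr dual later in the file rest on the same mask identity, which is easy to verify exhaustively for one bit width (standalone sketch, not from the patch):

    // shl_shr_mask_demo.cpp -- (x >> y) << y == x & (-1 << y), and the dual
    // (x << y) >> y == x & (-1 >> y) for logical right shift.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
        for (unsigned Y = 0; Y < 32; ++Y) {
          assert(((X >> Y) << Y) == (X & (0xFFFFFFFFu << Y)));
          assert(((X << Y) >> Y) == (X & (0xFFFFFFFFu >> Y)));
        }
      return 0;
    }
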
Constant *C1;
if (match(Op1, m_Constant(C1))) {
Constant *C2;
@@ -685,17 +693,17 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
}
Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyLShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *R = commonShiftTransforms(I))
return R;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
const APInt *ShAmtAPInt;
if (match(Op1, m_APInt(ShAmtAPInt))) {
@@ -800,25 +808,34 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
return &I;
}
}
+
+ // Transform (x << y) >> y to x & (-1 >> y)
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateLShr(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
return nullptr;
}
Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
- if (Value *V = SimplifyVectorOp(I))
+ if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyAShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
+ if (Instruction *X = foldShuffledBinop(I))
+ return X;
if (Instruction *R = commonShiftTransforms(I))
return R;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
+ if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
unsigned ShAmt = ShAmtAPInt->getZExtValue();
// If the shift amount equals the difference in width of the destination
@@ -832,7 +849,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
// We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
// we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
const APInt *ShOp1;
- if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
unsigned ShlAmt = ShOp1->getZExtValue();
if (ShlAmt < ShAmt) {
// (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
@@ -850,7 +868,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
}
}
- if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
// Oversized arithmetic shifts replicate the sign bit.
AmtSum = std::min(AmtSum, BitWidth - 1);
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a2e757cb4273..425f5ce384be 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+ unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
@@ -333,7 +344,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
- Known = Known.zextOrTrunc(BitWidth);
+ Known = InputKnown.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
if (BitWidth > SrcBitWidth)
Known.Zero.setBitsFrom(SrcBitWidth);
@@ -545,6 +556,27 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
break;
}
+ case Instruction::UDiv: {
+ // UDiv doesn't demand low bits that are zero in the divisor.
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ // If the division is exact, then it does demand the low bits.
+ if (cast<UDivOperator>(I)->isExact())
+ break;
+
+ // FIXME: Take the demanded mask of the result into account.
+ unsigned RHSTrailingZeros = SA->countTrailingZeros();
+ APInt DemandedMaskIn =
+ APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
+ return I;
+
+ // Propagate zero bits from the input.
+ Known.Zero.setHighBits(std::min(
+ BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
+ }
+ break;
+ }
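
The demanded-bits fact used by this case is that an inexact udiv whose divisor has K trailing zero bits cannot observe the low K bits of its dividend. A brute-force check (illustrative values):

    // udiv_lowbits_demo.cpp -- clearing undemanded low bits never changes
    // the quotient: x / (d << K) == (x & ~((1 << K) - 1)) / (d << K).
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Divisor = 24;            // 24 == 3 << 3, K = 3
      const uint32_t LowMask = (1u << 3) - 1; // low 3 bits are not demanded
      for (uint32_t X = 0; X < 4096; ++X)
        assert(X / Divisor == (X & ~LowMask) / Divisor);
      return 0;
    }
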
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
// X % -1 demands all the bits because we don't want to introduce
@@ -888,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
return nullptr;
}
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DMaskIdx) {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ if (VWidth == 1)
+ return nullptr;
+
+ ConstantInt *NewDMask = nullptr;
+
+ if (DMaskIdx < 0) {
+ // Pretend that a prefix of elements is demanded to simplify the code
+ // below.
+ DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+ } else {
+ ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
+ if (!DMask)
+ return nullptr; // non-constant dmask is not supported by codegen
+
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ // Mask off values that are undefined because the dmask doesn't cover them
+ DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+ unsigned NewDMaskVal = 0;
+ unsigned OrigLoadIdx = 0;
+ for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+ const unsigned Bit = 1 << SrcIdx;
+ if (!!(DMaskVal & Bit)) {
+ if (!!DemandedElts[OrigLoadIdx])
+ NewDMaskVal |= Bit;
+ OrigLoadIdx++;
+ }
+ }
+
+ if (DMaskVal != NewDMaskVal)
+ NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ }
+
+ // TODO: Handle 3 vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+ if (!NewNumElts)
+ return UndefValue::get(II->getType());
+
+ if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+ if (NewDMask)
+ II->setArgOperand(DMaskIdx, NewDMask);
+ return nullptr;
+ }
+
+ // Determine the overload types of the original intrinsic.
+ auto IID = II->getIntrinsicID();
+ SmallVector<Intrinsic::IITDescriptor, 16> Table;
+ getIntrinsicInfoTableEntries(IID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ FunctionType *FTy = II->getCalledFunction()->getFunctionType();
+ SmallVector<Type *, 6> OverloadTys;
+ Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+
+ // Get the new return type overload of the intrinsic.
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = II->getType()->getVectorElementType();
+ Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+
+ OverloadTys[0] = NewTy;
+ Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
+
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ if (NewDMask)
+ Args[DMaskIdx] = NewDMask;
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+
+ if (NewNumElts == 1) {
+ return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
+ DemandedElts.countTrailingZeros());
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!DemandedElts[OrigLoadIdx])
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ Value *Shuffle =
+ Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
+
+ return Shuffle;
+}
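
The dmask-narrowing loop above is self-contained enough to lift into a standalone sketch (the helper name and the sample masks below are hypothetical; the real code operates on APInt and ConstantInt):

    // dmask_demo.cpp -- each set bit of the 4-bit dmask produces one loaded
    // element; a bit is kept only if that element is still demanded.
    #include <cassert>

    unsigned narrowDMask(unsigned DMaskVal, unsigned DemandedElts) {
      unsigned NewDMaskVal = 0, OrigLoadIdx = 0;
      for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
        const unsigned Bit = 1u << SrcIdx;
        if (DMaskVal & Bit) {
          if (DemandedElts & (1u << OrigLoadIdx))
            NewDMaskVal |= Bit;
          ++OrigLoadIdx;
        }
      }
      return NewDMaskVal;
    }

    int main() {
      // dmask 0b1011 loads channels {0,1,3}; if only result element 0 is
      // demanded, the dmask shrinks to 0b0001.
      assert(narrowDMask(0b1011, 0b001) == 0b0001);
      // Demanding elements 0 and 2 keeps channels 0 and 3: 0b1001.
      assert(narrowDMask(0b1011, 0b101) == 0b1001);
      return 0;
    }
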
+
/// The specified value produces a vector with any number of elements.
/// DemandedElts contains the set of elements that are actually used by the
/// caller. This method analyzes which elements of the operand are undef and
@@ -1187,7 +1323,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
- // div/rem demand all inputs, because they don't want divide by zero.
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
UndefElts2, Depth + 1);
if (TmpV) {
@@ -1247,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
- default: break;
-
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are speced to zero upper bits not
@@ -1273,8 +1406,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// Unary scalar-as-vector operations that work column-wise.
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
- case Intrinsic::x86_sse_sqrt_ss:
- case Intrinsic::x86_sse2_sqrt_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1366,18 +1497,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::x86_avx512_mask_sub_sd_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
- case Intrinsic::x86_avx512_mask_vfmadd_ss:
- case Intrinsic::x86_avx512_mask_vfmadd_sd:
- case Intrinsic::x86_avx512_maskz_vfmadd_ss:
- case Intrinsic::x86_avx512_maskz_vfmadd_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1404,68 +1523,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
- case Intrinsic::x86_avx512_mask3_vfmadd_ss:
- case Intrinsic::x86_avx512_mask3_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfmsub_sd:
- case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
- // These intrinsics get the passthru bits from operand 2.
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
- UndefElts, Depth + 1);
- if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
-
- // If lowest element of a scalar op isn't used then use Arg2.
- if (!DemandedElts[0]) {
- Worklist.Add(II);
- return II->getArgOperand(2);
- }
-
- // Only lower element is used for operand 0 and 1.
- DemandedElts = 1;
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
- UndefElts2, Depth + 1);
- if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
- TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
- UndefElts3, Depth + 1);
- if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
-
- // Lower element is undefined if all three lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0] || !UndefElts3[0])
- UndefElts.clearBit(0);
-
- break;
-
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- case Intrinsic::x86_avx512_pmul_dq_512:
- case Intrinsic::x86_avx512_pmulu_dq_512: {
- Value *Op0 = II->getArgOperand(0);
- Value *Op1 = II->getArgOperand(1);
- unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
- assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
-
- APInt InnerDemandedElts(InnerVWidth, 0);
- for (unsigned i = 0; i != VWidth; ++i)
- if (DemandedElts[i])
- InnerDemandedElts.setBit(i * 2);
-
- UndefElts2 = APInt(InnerVWidth, 0);
- TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
- Depth + 1);
- if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
-
- UndefElts3 = APInt(InnerVWidth, 0);
- TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
- Depth + 1);
- if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
-
- break;
- }
-
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
@@ -1554,124 +1611,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- case Intrinsic::amdgcn_image_getlod: {
- if (VWidth == 1 || !DemandedElts.isMask())
- return nullptr;
-
- // TODO: Handle 3 vectors when supported in code gen.
- unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
- if (NewNumElts == VWidth)
- return nullptr;
-
- Module *M = II->getParent()->getParent()->getParent();
- Type *EltTy = V->getType()->getVectorElementType();
-
- Type *NewTy = (NewNumElts == 1) ? EltTy :
- VectorType::get(EltTy, NewNumElts);
-
- auto IID = II->getIntrinsicID();
-
- bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
- IID == Intrinsic::amdgcn_buffer_load_format;
-
- Function *NewIntrin = IsBuffer ?
- Intrinsic::getDeclaration(M, IID, NewTy) :
- // Samplers have 3 mangled types.
- Intrinsic::getDeclaration(M, IID,
- { NewTy, II->getArgOperand(0)->getType(),
- II->getArgOperand(1)->getType()});
-
- SmallVector<Value *, 5> Args;
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
- Args.push_back(II->getArgOperand(I));
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(II);
-
- CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
- NewCall->takeName(II);
- NewCall->copyMetadata(*II);
-
- if (!IsBuffer) {
- ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
- if (DMask) {
- unsigned DMaskVal = DMask->getZExtValue() & 0xf;
-
- unsigned PopCnt = 0;
- unsigned NewDMask = 0;
- for (unsigned I = 0; I < 4; ++I) {
- const unsigned Bit = 1 << I;
- if (!!(DMaskVal & Bit)) {
- if (++PopCnt > NewNumElts)
- break;
- NewDMask |= Bit;
- }
- }
-
- NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
- }
- }
-
-
- if (NewNumElts == 1) {
- return Builder.CreateInsertElement(UndefValue::get(V->getType()),
- NewCall, static_cast<uint64_t>(0));
- }
-
- SmallVector<uint32_t, 8> EltMask;
- for (unsigned I = 0; I < VWidth; ++I)
- EltMask.push_back(I);
-
- Value *Shuffle = Builder.CreateShuffleVector(
- NewCall, UndefValue::get(NewTy), EltMask);
-
- MadeChange = true;
- return Shuffle;
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
+ default: {
+ if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
+ break;
}
}
break;
diff --git a/lib/Transforms/InstCombine/InstCombineTables.td b/lib/Transforms/InstCombine/InstCombineTables.td
new file mode 100644
index 000000000000..98b2adc442fa
--- /dev/null
+++ b/lib/Transforms/InstCombine/InstCombineTables.td
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
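
This table definition makes TableGen emit a sorted AMDGPUImageDMaskIntrinsicTable plus the getAMDGPUImageDMaskIntrinsic lookup (named via PrimaryKeyName) into InstCombineTables.inc, which the SimplifyDemanded change above includes. A rough standalone approximation of the generated lookup's shape (entry values invented; the real table holds AMDGPU image intrinsic IDs):

    // table_lookup_demo.cpp -- approximate shape of the SearchableTable output.
    #include <algorithm>
    #include <cassert>

    struct AMDGPUImageDMaskIntrinsic { unsigned Intr; };

    // Sorted by the primary key, as the generated table is.
    static const AMDGPUImageDMaskIntrinsic Table[] = {{101}, {205}, {310}};

    const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr) {
      // PrimaryKeyEarlyOut = 1: cheap range check before the binary search.
      if (Intr < Table[0].Intr || Intr > Table[2].Intr)
        return nullptr;
      auto It = std::lower_bound(std::begin(Table), std::end(Table), Intr,
                                 [](const AMDGPUImageDMaskIntrinsic &E,
                                    unsigned Key) { return E.Intr < Key; });
      return (It != std::end(Table) && It->Intr == Intr) ? It : nullptr;
    }

    int main() {
      assert(getAMDGPUImageDMaskIntrinsic(205));
      assert(!getAMDGPUImageDMaskIntrinsic(999));
      return 0;
    }
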
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index aeac8910af6b..2560feb37d66 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1140,6 +1140,216 @@ static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
return true;
}
+/// These are the ingredients in an alternate form binary operator as described
+/// below.
+struct BinopElts {
+ BinaryOperator::BinaryOps Opcode;
+ Value *Op0;
+ Value *Op1;
+ BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
+ Value *V0 = nullptr, Value *V1 = nullptr) :
+ Opcode(Opc), Op0(V0), Op1(V1) {}
+ operator bool() const { return Opcode != 0; }
+};
+
+/// Binops may be transformed into binops with different opcodes and operands.
+/// Reverse the usual canonicalization to enable folds with the non-canonical
+/// form of the binop. If a transform is possible, return the elements of the
+/// new binop. If not, return invalid elements.
+static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
+ Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
+ Type *Ty = BO->getType();
+ switch (BO->getOpcode()) {
+ case Instruction::Shl: {
+ // shl X, C --> mul X, (1 << C)
+ Constant *C;
+ if (match(BO1, m_Constant(C))) {
+ Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
+ return { Instruction::Mul, BO0, ShlOne };
+ }
+ break;
+ }
+ case Instruction::Or: {
+ // or X, C --> add X, C (when X and C have no common bits set)
+ const APInt *C;
+ if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
+ return { Instruction::Add, BO0, BO1 };
+ break;
+ }
+ default:
+ break;
+ }
+ return {};
+}
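
Both alternate forms returned above are cheap to sanity-check numerically (standalone sketch; unsigned wraparound semantics assumed):

    // alt_binop_demo.cpp -- the two rewrites used by getAlternateBinop:
    //   shl X, C  ==  mul X, (1 << C)
    //   or  X, C  ==  add X, C   when X and C share no set bits.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X : {0u, 3u, 0x12345678u})
        for (unsigned C = 0; C < 32; ++C)
          assert((X << C) == X * (1u << C));

      uint32_t X = 0xF0F0u, C = 0x0A0Au; // disjoint bit patterns
      assert((X & C) == 0 && (X | C) == X + C);
      return 0;
    }
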
+
+static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
+ assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
+
+ // Are we shuffling together some value and that same value after it has been
+ // modified by a binop with a constant?
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ Constant *C;
+ bool Op0IsBinop;
+ if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
+ Op0IsBinop = true;
+ else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
+ Op0IsBinop = false;
+ else
+ return nullptr;
+
+ // The identity constant for a binop leaves a variable operand unchanged. For
+ // a vector, this is a splat of something like 0, -1, or 1.
+ // If there's no identity constant for this binop, we're done.
+ auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
+ BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
+ if (!IdC)
+ return nullptr;
+
+ // Shuffle identity constants into the lanes that return the original value.
+ // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
+ // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
+ // The existing binop constant vector remains in the same operand position.
+ Constant *Mask = Shuf.getMask();
+ Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
+ ConstantExpr::getShuffleVector(IdC, C, Mask);
+
+ bool MightCreatePoisonOrUB =
+ Mask->containsUndefElement() &&
+ (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
+ if (MightCreatePoisonOrUB)
+ NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true);
+
+ // shuf (bop X, C), X, M --> bop X, C'
+ // shuf X, (bop X, C), M --> bop X, C'
+ Value *X = Op0IsBinop ? Op1 : Op0;
+ Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
+ NewBO->copyIRFlags(BO);
+
+ // An undef shuffle mask element may propagate as an undef constant element in
+ // the new binop. That would produce poison where the original code might not.
+ // If we already made a safe constant, then there's no danger.
+ if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
+
+/// Try to fold shuffles that are the equivalent of a vector select.
+static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ if (!Shuf.isSelect())
+ return nullptr;
+
+ if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
+ return I;
+
+ BinaryOperator *B0, *B1;
+ if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
+ !match(Shuf.getOperand(1), m_BinOp(B1)))
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *C0, *C1;
+ bool ConstantsAreOp1;
+ if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
+ match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
+ ConstantsAreOp1 = true;
+ else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
+ match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
+ ConstantsAreOp1 = false;
+ else
+ return nullptr;
+
+ // We need matching binops to fold the lanes together.
+ BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
+ BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
+ bool DropNSW = false;
+ if (ConstantsAreOp1 && Opc0 != Opc1) {
+ // TODO: We drop "nsw" if shift is converted into multiply because it may
+ // not be correct when the shift amount is BitWidth - 1. We could examine
+ // each vector element to determine if it is safe to keep that flag.
+ if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
+ DropNSW = true;
+ if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
+ assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
+ Opc0 = AltB0.Opcode;
+ C0 = cast<Constant>(AltB0.Op1);
+ } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
+ assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
+ Opc1 = AltB1.Opcode;
+ C1 = cast<Constant>(AltB1.Op1);
+ }
+ }
+
+ if (Opc0 != Opc1)
+ return nullptr;
+
+ // The opcodes must be the same. Use a new name to make that clear.
+ BinaryOperator::BinaryOps BOpc = Opc0;
+
+ // Select the constant elements needed for the single binop.
+ Constant *Mask = Shuf.getMask();
+ Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
+
+ // We are moving a binop after a shuffle. When a shuffle has an undefined
+ // mask element, the result is undefined, but it is not poison or undefined
+ // behavior. That is not necessarily true for div/rem/shift.
+ bool MightCreatePoisonOrUB =
+ Mask->containsUndefElement() &&
+ (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
+ if (MightCreatePoisonOrUB)
+ NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1);
+
+ Value *V;
+ if (X == Y) {
+ // Remove a binop and the shuffle by rearranging the constant:
+ // shuffle (op V, C0), (op V, C1), M --> op V, C'
+ // shuffle (op C0, V), (op C1, V), M --> op C', V
+ V = X;
+ } else {
+ // If there are 2 different variable operands, we must create a new shuffle
+ // (select) first, so check uses to ensure that we don't end up with more
+ // instructions than we started with.
+ if (!B0->hasOneUse() && !B1->hasOneUse())
+ return nullptr;
+
+ // If we use the original shuffle mask and op1 is *variable*, we would be
+ // putting an undef into operand 1 of div/rem/shift. This is either UB or
+ // poison. We do not have to guard against UB when *constants* are op1
+ // because safe constants guarantee that we do not overflow sdiv/srem (and
+ // there's no danger for other opcodes).
+ // TODO: To allow this case, create a new shuffle mask with no undefs.
+ if (MightCreatePoisonOrUB && !ConstantsAreOp1)
+ return nullptr;
+
+ // Note: In general, we do not create new shuffles in InstCombine because we
+ // do not know if a target can lower an arbitrary shuffle optimally. In this
+ // case, the shuffle uses the existing mask, so there is no additional risk.
+
+ // Select the variable vectors first, then perform the binop:
+ // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
+ // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
+ V = Builder.CreateShuffleVector(X, Y, Mask);
+ }
+
+ Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
+ BinaryOperator::Create(BOpc, NewC, V);
+
+ // Flags are intersected from the 2 source binops. But there are 2 exceptions:
+ // 1. If we changed an opcode, poison conditions might have changed.
+ // 2. If the shuffle had undef mask elements, the new binop might have undefs
+ // where the original code did not. But if we already made a safe constant,
+ // then there's no danger.
+ NewBO->copyIRFlags(B0);
+ NewBO->andIRFlags(B1);
+ if (DropNSW)
+ NewBO->setHasNoSignedWrap(false);
+ if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
+
Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *LHS = SVI.getOperand(0);
Value *RHS = SVI.getOperand(1);
@@ -1150,6 +1360,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
return replaceInstUsesWith(SVI, V);
+ if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
+ return I;
+
bool MadeChange = false;
unsigned VWidth = SVI.getType()->getVectorNumElements();
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index b332e75c7feb..12fcc8752ea9 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -34,6 +34,8 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/InstCombine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -55,6 +57,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -72,6 +75,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
@@ -93,8 +97,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -144,12 +146,20 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. A width of '1' is always treated as a legal type
/// because i1 is a fundamental type in IR, and there are many specialized
-/// optimizations for i1 types.
+/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
+/// legal to convert to, in order to open up more combining opportunities.
+/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
+/// from frontend languages.
bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
+ // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
+ // shrink types, to prevent infinite loops.
+ if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
+ return true;
+
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
if (FromLegal && !ToLegal)
@@ -396,28 +406,23 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
// if C1 and C2 are constants.
+ Value *A, *B;
+ Constant *C1, *C2;
if (Op0 && Op1 &&
Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
- isa<Constant>(Op0->getOperand(1)) &&
- isa<Constant>(Op1->getOperand(1)) &&
- Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *A = Op0->getOperand(0);
- Constant *C1 = cast<Constant>(Op0->getOperand(1));
- Value *B = Op1->getOperand(0);
- Constant *C2 = cast<Constant>(Op1->getOperand(1));
-
- Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
- BinaryOperator *New = BinaryOperator::Create(Opcode, A, B);
- if (isa<FPMathOperator>(New)) {
+ match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
+ match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
+ BinaryOperator *NewBO = BinaryOperator::Create(Opcode, A, B);
+ if (isa<FPMathOperator>(NewBO)) {
FastMathFlags Flags = I.getFastMathFlags();
Flags &= Op0->getFastMathFlags();
Flags &= Op1->getFastMathFlags();
- New->setFastMathFlags(Flags);
+ NewBO->setFastMathFlags(Flags);
}
- InsertNewInstWith(New, I);
- New->takeName(Op1);
- I.setOperand(0, New);
- I.setOperand(1, Folded);
+ InsertNewInstWith(NewBO, I);
+ NewBO->takeName(Op1);
+ I.setOperand(0, NewBO);
+ I.setOperand(1, ConstantExpr::get(Opcode, C1, C2));
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -434,72 +439,38 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
/// Return whether "X LOp (Y ROp Z)" is always equal to
/// "(X LOp Y) ROp (X LOp Z)".
-static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,
+static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
- switch (LOp) {
- default:
- return false;
+ // X & (Y | Z) <--> (X & Y) | (X & Z)
+ // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
+ if (LOp == Instruction::And)
+ return ROp == Instruction::Or || ROp == Instruction::Xor;
- case Instruction::And:
- // And distributes over Or and Xor.
- switch (ROp) {
- default:
- return false;
- case Instruction::Or:
- case Instruction::Xor:
- return true;
- }
+ // X | (Y & Z) <--> (X | Y) & (X | Z)
+ if (LOp == Instruction::Or)
+ return ROp == Instruction::And;
- case Instruction::Mul:
- // Multiplication distributes over addition and subtraction.
- switch (ROp) {
- default:
- return false;
- case Instruction::Add:
- case Instruction::Sub:
- return true;
- }
+ // X * (Y + Z) <--> (X * Y) + (X * Z)
+ // X * (Y - Z) <--> (X * Y) - (X * Z)
+ if (LOp == Instruction::Mul)
+ return ROp == Instruction::Add || ROp == Instruction::Sub;
- case Instruction::Or:
- // Or distributes over And.
- switch (ROp) {
- default:
- return false;
- case Instruction::And:
- return true;
- }
- }
+ return false;
}
/// Return whether "(X LOp Y) ROp Z" is always equal to
/// "(X ROp Z) LOp (Y ROp Z)".
-static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
+static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
Instruction::BinaryOps ROp) {
if (Instruction::isCommutative(ROp))
- return LeftDistributesOverRight(ROp, LOp);
+ return leftDistributesOverRight(ROp, LOp);
+
+ // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
+ return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
- switch (LOp) {
- default:
- return false;
- // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
- // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
- // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts.
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- switch (ROp) {
- default:
- return false;
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- return true;
- }
- }
// TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
// but this requires knowing that the addition does not overflow and other
// such subtleties.
- return false;
}
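
The distributive facts encoded by leftDistributesOverRight and rightDistributesOverLeft can be spot-checked exhaustively at a small bit width (illustrative):

    // distribute_demo.cpp -- exhaustive 8-bit spot checks of the identities.
    #include <cassert>

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y)
          for (unsigned Z = 0; Z < 8; ++Z) {
            // Bitwise logic distributes over a common shift amount.
            assert(((X & Y) >> Z) == ((X >> Z) & (Y >> Z)));
            assert(((X | Y) >> Z) == ((X >> Z) | (Y >> Z)));
            assert(((X ^ Y) >> Z) == ((X >> Z) ^ (Y >> Z)));
            // And distributes over or/xor; or distributes over and.
            assert((X & (Y | Z)) == ((X & Y) | (X & Z)));
            assert((X & (Y ^ Z)) == ((X & Y) ^ (X & Z)));
            assert((X | (Y & Z)) == ((X | Y) & (X | Z)));
          }
      return 0;
    }
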
/// This function returns identity value for given opcode, which can be used to
@@ -511,37 +482,27 @@ static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
-/// This function factors binary ops which can be combined using distributive
-/// laws. This function tries to transform 'Op' based TopLevelOpcode to enable
-/// factorization e.g for ADD(SHL(X , 2), MUL(X, 5)), When this function called
-/// with TopLevelOpcode == Instruction::Add and Op = SHL(X, 2), transforms
-/// SHL(X, 2) to MUL(X, 4) i.e. returns Instruction::Mul with LHS set to 'X' and
-/// RHS to 4.
+/// This function predicates factorization using distributive laws. By default,
+/// it just returns the 'Op' inputs. But for special-cases like
+/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
+/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
+/// allow more factorization opportunities.
static Instruction::BinaryOps
-getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
- BinaryOperator *Op, Value *&LHS, Value *&RHS) {
+getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
+ Value *&LHS, Value *&RHS) {
assert(Op && "Expected a binary operator");
-
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
-
- switch (TopLevelOpcode) {
- default:
- return Op->getOpcode();
-
- case Instruction::Add:
- case Instruction::Sub:
- if (Op->getOpcode() == Instruction::Shl) {
- if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
- // The multiplier is really 1 << CST.
- RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
- return Instruction::Mul;
- }
+ if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
+ Constant *C;
+ if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
+ // X << C --> X * (1 << C)
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
+ return Instruction::Mul;
}
- return Op->getOpcode();
+ // TODO: We can add other conversions e.g. shr => div etc.
}
-
- // TODO: We can add other conversions e.g. shr => div etc.
+ return Op->getOpcode();
}
/// This tries to simplify binary operations by factorizing out common terms
@@ -560,7 +521,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I,
bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
// Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
- if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
// Does the instruction have the form "(A op' B) op (A op' D)" or, in the
// commutative case, "(A op' B) op (C op' A)"?
if (A == C || (InnerCommutative && A == D)) {
@@ -579,7 +540,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I,
}
// Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
- if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
// Does the instruction have the form "(A op' B) op (C op' B)" or, in the
// commutative case, "(A op' B) op (B op' D)"?
if (B == D || (InnerCommutative && B == C)) {
@@ -664,21 +625,19 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
// term.
if (Op0)
if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
- if (Value *V =
- tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
return V;
// The instruction has the form "(B) op (C op' D)". Try to factorize common
// term.
if (Op1)
if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
- if (Value *V =
- tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
return V;
}
// Expansion.
- if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+ if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
// The instruction has the form "(A op' B) op C". See if expanding it out
// to "(A op C) op' (B op C)" results in simplifications.
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
@@ -715,7 +674,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
}
}
- if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+ if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
// The instruction has the form "A op (B op' C)". See if expanding it out
// to "(A op B) op' (A op C)" results in simplifications.
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
@@ -817,23 +776,6 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
return nullptr;
}
-/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is
-/// a constant negative zero (which is the 'negate' form).
-Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
- if (BinaryOperator::isFNeg(V, IgnoreZeroSign))
- return BinaryOperator::getFNegArgument(V);
-
- // Constants can be considered to be negated values if they can be folded.
- if (ConstantFP *C = dyn_cast<ConstantFP>(V))
- return ConstantExpr::getFNeg(C);
-
- if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
- if (C->getType()->getElementType()->isFloatingPointTy())
- return ConstantExpr::getFNeg(C);
-
- return nullptr;
-}
-
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
@@ -1081,8 +1023,9 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return replaceInstUsesWith(I, NewPN);
}
-Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
- assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
+Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
+ if (!isa<Constant>(I.getOperand(1)))
+ return nullptr;
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
@@ -1107,7 +1050,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
- Type *IntPtrTy = DL.getIntPtrType(PtrTy);
+ Type *IndexTy = DL.getIndexType(PtrTy);
int64_t FirstIdx = 0;
if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
@@ -1122,7 +1065,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
}
- NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+ NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
// Index into the types. If we fail, set OrigBase to null.
while (Offset) {
@@ -1144,7 +1087,7 @@ Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
- NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+ NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
Offset %= EltSize;
Ty = AT->getElementType();
} else {
@@ -1408,22 +1351,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
} while (true);
}
-/// \brief Creates node of binary operation with the same attributes as the
-/// specified one but with other operands.
-static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &B) {
- Value *BO = B.CreateBinOp(Inst.getOpcode(), LHS, RHS);
- // If LHS and RHS are constant, BO won't be a binary operator.
- if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO))
- NewBO->copyIRFlags(&Inst);
- return BO;
-}
-
-/// \brief Makes transformation of binary operation specific for vector types.
-/// \param Inst Binary operator to transform.
-/// \return Pointer to node that must replace the original binary operator, or
-/// null pointer if no transformation was made.
-Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
+Instruction *InstCombiner::foldShuffledBinop(BinaryOperator &Inst) {
if (!Inst.getType()->isVectorTy()) return nullptr;
// It may not be safe to reorder shuffles and things like div, urem, etc.
@@ -1437,58 +1365,71 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
+ auto createBinOpShuffle = [&](Value *X, Value *Y, Constant *M) {
+ Value *XY = Builder.CreateBinOp(Inst.getOpcode(), X, Y);
+ if (auto *BO = dyn_cast<BinaryOperator>(XY))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
+ };
+
// If both arguments of the binary operation are shuffles that use the same
- // mask and shuffle within a single vector, move the shuffle after the binop:
- // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
- auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
- auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
- if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
- isa<UndefValue>(LShuf->getOperand(1)) &&
- isa<UndefValue>(RShuf->getOperand(1)) &&
- LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
- Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
- RShuf->getOperand(0), Builder);
- return Builder.CreateShuffleVector(
- NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
+ // mask and shuffle within a single vector, move the shuffle after the binop.
+ Value *V1, *V2;
+ Constant *Mask;
+ if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))) &&
+ match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(Mask))) &&
+ V1->getType() == V2->getType() &&
+ (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
+ // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
+ return createBinOpShuffle(V1, V2, Mask);
}
- // If one argument is a shuffle within one vector, the other is a constant,
- // try moving the shuffle after the binary operation.
- ShuffleVectorInst *Shuffle = nullptr;
- Constant *C1 = nullptr;
- if (isa<ShuffleVectorInst>(LHS)) Shuffle = cast<ShuffleVectorInst>(LHS);
- if (isa<ShuffleVectorInst>(RHS)) Shuffle = cast<ShuffleVectorInst>(RHS);
- if (isa<Constant>(LHS)) C1 = cast<Constant>(LHS);
- if (isa<Constant>(RHS)) C1 = cast<Constant>(RHS);
- if (Shuffle && C1 &&
- (isa<ConstantVector>(C1) || isa<ConstantDataVector>(C1)) &&
- isa<UndefValue>(Shuffle->getOperand(1)) &&
- Shuffle->getType() == Shuffle->getOperand(0)->getType()) {
- SmallVector<int, 16> ShMask = Shuffle->getShuffleMask();
- // Find constant C2 that has property:
- // shuffle(C2, ShMask) = C1
- // If such constant does not exist (example: ShMask=<0,0> and C1=<1,2>)
- // reorder is not possible.
- SmallVector<Constant*, 16> C2M(VWidth,
- UndefValue::get(C1->getType()->getScalarType()));
+ // If one argument is a shuffle within one vector and the other is a constant,
+ // try moving the shuffle after the binary operation. This canonicalization
+ // intends to move shuffles closer to other shuffles and binops closer to
+ // other binops, so they can be folded. It may also enable demanded elements
+ // transforms.
+ Constant *C;
+ if (match(&Inst, m_c_BinOp(
+ m_OneUse(m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))),
+ m_Constant(C))) &&
+ V1->getType() == Inst.getType()) {
+ // Find constant NewC that has property:
+ // shuffle(NewC, ShMask) = C
+ // If such constant does not exist (example: ShMask=<0,0> and C=<1,2>)
+ // reorder is not possible. A 1-to-1 mapping is not required. Example:
+ // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
+ SmallVector<int, 16> ShMask;
+ ShuffleVectorInst::getShuffleMask(Mask, ShMask);
+ SmallVector<Constant *, 16>
+ NewVecC(VWidth, UndefValue::get(C->getType()->getScalarType()));
bool MayChange = true;
for (unsigned I = 0; I < VWidth; ++I) {
if (ShMask[I] >= 0) {
assert(ShMask[I] < (int)VWidth);
- if (!isa<UndefValue>(C2M[ShMask[I]])) {
+ Constant *CElt = C->getAggregateElement(I);
+ Constant *NewCElt = NewVecC[ShMask[I]];
+ if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt)) {
MayChange = false;
break;
}
- C2M[ShMask[I]] = C1->getAggregateElement(I);
+ NewVecC[ShMask[I]] = CElt;
}
}
if (MayChange) {
- Constant *C2 = ConstantVector::get(C2M);
- Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0);
- Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2;
- Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder);
- return Builder.CreateShuffleVector(NewBO,
- UndefValue::get(Inst.getType()), Shuffle->getMask());
+ Constant *NewC = ConstantVector::get(NewVecC);
+ // It may not be safe to execute a binop on a vector with undef elements
+ // because the entire instruction can be folded to undef or create poison
+ // that did not exist in the original code.
+ bool ConstOp1 = isa<Constant>(Inst.getOperand(1));
+ if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
+ NewC = getSafeVectorConstantForBinop(Inst.getOpcode(), NewC, ConstOp1);
+
+ // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
+ // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
+ Value *NewLHS = isa<Constant>(LHS) ? NewC : V1;
+ Value *NewRHS = isa<Constant>(LHS) ? V1 : NewC;
+ return createBinOpShuffle(NewLHS, NewRHS, Mask);
}
}
@@ -1497,9 +1438,9 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
-
- if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops,
- SQ.getWithInstruction(&GEP)))
+ Type *GEPType = GEP.getType();
+ Type *GEPEltType = GEP.getSourceElementType();
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -1507,8 +1448,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
- Type *IntPtrTy =
- DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType());
+
+ // Index width may not be the same width as pointer width.
+ // Data layout chooses the right type based on supported integer types.
+ Type *NewScalarIndexTy =
+ DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
@@ -1517,10 +1461,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GTI.isStruct())
continue;
- // Index type should have the same width as IntPtr
Type *IndexTy = (*I)->getType();
- Type *NewIndexType = IndexTy->isVectorTy() ?
- VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy;
+ Type *NewIndexType =
+ IndexTy->isVectorTy()
+ ? VectorType::get(NewScalarIndexTy, IndexTy->getVectorNumElements())
+ : NewScalarIndexTy;
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
@@ -1543,8 +1488,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return &GEP;
// Check to see if the inputs to the PHI node are getelementptr instructions.
- if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
- GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
+ auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
if (!Op1)
return nullptr;
@@ -1560,7 +1505,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
int DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
- GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
return nullptr;
@@ -1602,7 +1547,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (J > 0) {
if (J == 1) {
CurTy = Op1->getSourceElementType();
- } else if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
+ } else if (auto *CT = dyn_cast<CompositeType>(CurTy)) {
CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
} else {
CurTy = nullptr;
@@ -1617,7 +1562,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (DI != -1 && !PN->hasOneUse())
return nullptr;
- GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
if (DI == -1) {
// All the GEPs feeding the PHI are identical. Clone one down into our
// BB so that it can be merged with the current GEP.
@@ -1652,15 +1597,64 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
- if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) {
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return nullptr;
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+        // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+        //    => Swapping idx & idx2 turns %src into a scalar type.
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEPType) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(PtrOp));
+ auto *NewSrc = cast<GetElementPtrInst>(
+ Builder.CreateGEP(SO0, GO1, Src->getName()));
+ NewSrc->setIsInBounds(Src->isInBounds());
+ auto *NewGEP = GetElementPtrInst::Create(nullptr, NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+
// Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
- if (GEPOperator *SrcGEP =
- dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
return nullptr; // Wait until our source is folded to completion.
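For illustration, a hypothetical IR shape for the safe scalar case (case 3), with %inv loop-invariant and %iv not; since both single-index GEPs step over the same element type, the offsets commute and only their association changes:

    // before:  %src = getelementptr i32, i32* %base, i64 %iv
    //          %gep = getelementptr i32, i32* %src,  i64 %inv
    // after:   %src = getelementptr i32, i32* %base, i64 %inv  ; now hoistable
    //          %gep = getelementptr i32, i32* %src,  i64 %iv
    // The net address is unchanged: base + iv + inv == base + inv + iv.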
@@ -1723,9 +1717,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GEP.getNumIndices() == 1) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
- DL.getPointerSizeInBits(AS)) {
- Type *Ty = GEP.getSourceElementType();
- uint64_t TyAllocSize = DL.getTypeAllocSize(Ty);
+ DL.getIndexSizeInBits(AS)) {
+ uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType);
bool Matched = false;
uint64_t C;
@@ -1752,22 +1745,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Operator *Index = cast<Operator>(V);
Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
- return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
}
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
// to (bitcast Y)
Value *Y;
if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
- m_PtrToInt(m_Specific(GEP.getOperand(0)))))) {
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y,
- GEP.getType());
- }
+ m_PtrToInt(m_Specific(GEP.getOperand(0))))))
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
}
}
}
// We do not handle pointer-vector geps here.
- if (GEP.getType()->isVectorTy())
+ if (GEPType->isVectorTy())
return nullptr;
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
@@ -1776,7 +1767,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
- if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+ if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
// Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
@@ -1787,8 +1778,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
//
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
- if (ArrayType *CATy =
- dyn_cast<ArrayType>(GEP.getSourceElementType())) {
+ if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
// -> GEP i8* X, ...
@@ -1804,11 +1794,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// ->
// %0 = GEP i8 addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
- return new AddrSpaceCastInst(Builder.Insert(Res), GEP.getType());
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
}
- if (ArrayType *XATy =
- dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
+ if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())) {
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == XATy->getElementType()) {
// -> GEP [10 x i8]* X, i32 0, ...
@@ -1836,7 +1825,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
nullptr, StrippedPtr, Idx, GEP.getName())
: Builder.CreateGEP(nullptr, StrippedPtr, Idx,
GEP.getName());
- return new AddrSpaceCastInst(NewGEP, GEP.getType());
+ return new AddrSpaceCastInst(NewGEP, GEPType);
}
}
}
@@ -1844,12 +1833,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Transform things like:
// %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
- Type *SrcElTy = StrippedPtrTy->getElementType();
- Type *ResElTy = GEP.getSourceElementType();
- if (SrcElTy->isArrayTy() &&
- DL.getTypeAllocSize(SrcElTy->getArrayElementType()) ==
- DL.getTypeAllocSize(ResElTy)) {
- Type *IdxType = DL.getIntPtrType(GEP.getType());
+ Type *SrcEltTy = StrippedPtrTy->getElementType();
+ if (SrcEltTy->isArrayTy() &&
+ DL.getTypeAllocSize(SrcEltTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(GEPEltType)) {
+ Type *IdxType = DL.getIndexType(GEPType);
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP =
GEP.isInBounds()
@@ -1858,28 +1846,28 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
: Builder.CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName());
// V and GEP are both pointer types --> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
}
// Transform things like:
// %V = mul i64 %N, 4
// %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
// into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
- if (ResElTy->isSized() && SrcElTy->isSized()) {
+ if (GEPEltType->isSized() && SrcEltTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
- uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
- uint64_t SrcSize = DL.getTypeAllocSize(SrcElTy);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
+ uint64_t SrcSize = DL.getTypeAllocSize(SrcEltTy);
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = SrcSize / ResSize;
- // Earlier transforms ensure that the index has type IntPtrType, which
- // considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
- "Index not cast to pointer width?");
+ // Earlier transforms ensure that the index has the right type
+ // according to Data Layout, which considerably simplifies the
+ // logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
@@ -1895,7 +1883,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ GEPType);
}
}
}
@@ -1904,39 +1892,40 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
- if (ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) {
+ if (GEPEltType->isSized() && SrcEltTy->isSized() &&
+ SrcEltTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
- uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
uint64_t ArrayEltSize =
- DL.getTypeAllocSize(SrcElTy->getArrayElementType());
+ DL.getTypeAllocSize(SrcEltTy->getArrayElementType());
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = ArrayEltSize / ResSize;
- // Earlier transforms ensure that the index has type IntPtrType, which
- // considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
- "Index not cast to pointer width?");
+ // Earlier transforms ensure that the index has the right type
+ // according to the Data Layout, which considerably simplifies
+ // the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
- Value *Off[2] = {
- Constant::getNullValue(DL.getIntPtrType(GEP.getType())),
- NewIdx};
+ Type *IndTy = DL.getIndexType(GEPType);
+ Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
Value *NewGEP = GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(
- SrcElTy, StrippedPtr, Off, GEP.getName())
- : Builder.CreateGEP(SrcElTy, StrippedPtr, Off,
+ SrcEltTy, StrippedPtr, Off, GEP.getName())
+ : Builder.CreateGEP(SrcEltTy, StrippedPtr, Off,
GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEP.getType());
+ GEPType);
}
}
}
@@ -1946,34 +1935,53 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// addrspacecast between types is canonicalized as a bitcast, then an
// addrspacecast. To take advantage of the below bitcast + struct GEP, look
// through the addrspacecast.
- if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+ Value *ASCStrippedPtrOp = PtrOp;
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
// X = bitcast A addrspace(1)* to B addrspace(1)*
// Y = addrspacecast A addrspace(1)* to B addrspace(2)*
// Z = gep Y, <...constant indices...>
// Into an addrspacecasted GEP of the struct.
- if (BitCastInst *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
- PtrOp = BC;
+ if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+ ASCStrippedPtrOp = BC;
}
- /// See if we can simplify:
- /// X = bitcast A* to B*
- /// Y = gep X, <...constant indices...>
- /// into a gep of the original struct. This is important for SROA and alias
- /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
- Value *Operand = BCI->getOperand(0);
- PointerType *OpType = cast<PointerType>(Operand->getType());
- unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType());
- APInt Offset(OffsetBits, 0);
- if (!isa<BitCastInst>(Operand) &&
- GEP.accumulateConstantOffset(DL, Offset)) {
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
+ Value *SrcOp = BCI->getOperand(0);
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ Type *SrcEltType = SrcType->getElementType();
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) {
+ return ArrTy->getArrayElementType() == VecTy->getVectorElementType() &&
+ ArrTy->getArrayNumElements() == VecTy->getVectorNumElements();
+ };
+ if (GEP.getNumOperands() == 3 &&
+ ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() &&
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) ||
+ (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() &&
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) {
+ GEP.setOperand(0, SrcOp);
+ GEP.setSourceElementType(SrcEltType);
+ return &GEP;
+ }
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, &TLI)) {
+ if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1985,43 +1993,43 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
- if (Operand->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(Operand, GEP.getType());
- return new BitCastInst(Operand, GEP.getType());
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEPType);
+ return new BitCastInst(SrcOp, GEPType);
}
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
+ if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP =
GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(nullptr, Operand, NewIndices)
- : Builder.CreateGEP(nullptr, Operand, NewIndices);
+ ? Builder.CreateInBoundsGEP(nullptr, SrcOp, NewIndices)
+ : Builder.CreateGEP(nullptr, SrcOp, NewIndices);
- if (NGEP->getType() == GEP.getType())
+ if (NGEP->getType() == GEPType)
return replaceInstUsesWith(GEP, NGEP);
NGEP->takeName(&GEP);
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEP.getType());
- return new BitCastInst(NGEP, GEP.getType());
+ return new AddrSpaceCastInst(NGEP, GEPType);
+ return new BitCastInst(NGEP, GEPType);
}
}
}
if (!GEP.isInBounds()) {
- unsigned PtrWidth =
- DL.getPointerSizeInBits(PtrOp->getType()->getPointerAddressSpace());
- APInt BasePtrOffset(PtrWidth, 0);
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(IdxWidth, 0);
Value *UnderlyingPtrOp =
PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
BasePtrOffset);
if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
BasePtrOffset.isNonNegative()) {
- APInt AllocSize(PtrWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
+ APInt AllocSize(IdxWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
PtrOp, makeArrayRef(Ops).slice(1), GEP.getName());
@@ -2198,7 +2206,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
return nullptr;
}
-/// \brief Move the call to free before a NULL test.
+/// Move the call to free before a NULL test.
///
/// Check if this free is accessed after its argument has been tested
/// against NULL (property 0).
@@ -2562,6 +2570,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
case EHPersonality::MSVC_Win64SEH:
case EHPersonality::MSVC_CXX:
case EHPersonality::CoreCLR:
+ case EHPersonality::Wasm_CXX:
return TypeInfo->isNullValue();
}
llvm_unreachable("invalid enum");
@@ -2889,6 +2898,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
assert(I->hasOneUse() && "Invariants didn't hold!");
+ BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
@@ -2918,10 +2928,20 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
if (Scan->mayWriteToMemory())
return false;
}
-
BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
I->moveBefore(&*InsertPos);
++NumSunkInst;
+
+ // Also sink all related debug uses from the source basic block. Otherwise we
+  // get a debug use before the def.
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+ for (auto *DII : DbgUsers) {
+ if (DII->getParent() == SrcBlock) {
+ DII->moveBefore(&*InsertPos);
+ LLVM_DEBUG(dbgs() << "SINK: " << *DII << '\n');
+ }
+ }
return true;
}
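A hedged illustration (hypothetical names) of the ordering bug the new loop prevents when only the defining instruction is sunk:

    // srcbb:
    //   call void @llvm.dbg.value(metadata i32 %x, ...) ; use of %x remains...
    // destbb:
    //   %x = add i32 %a, %b                             ; ...above its def
    // findDbgUsers() (llvm/Transforms/Utils/Local.h) collects those intrinsics
    // so they move to the same insertion point as %x.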
@@ -2932,7 +2952,7 @@ bool InstCombiner::run() {
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, &TLI)) {
- DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
eraseInstFromFunction(*I);
++NumDeadInst;
MadeIRChange = true;
@@ -2946,7 +2966,8 @@ bool InstCombiner::run() {
if (!I->use_empty() &&
(I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
- DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
+ << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
@@ -2965,8 +2986,8 @@ bool InstCombiner::run() {
KnownBits Known = computeKnownBits(I, /*Depth*/0, I);
if (Known.isConstant()) {
Constant *C = ConstantInt::get(Ty, Known.getConstant());
- DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
- " from: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C
+ << " from: " << *I << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
@@ -3005,7 +3026,7 @@ bool InstCombiner::run() {
if (UserIsSuccessor && UserParent->getUniquePredecessor()) {
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
- DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since sinking
// can expose opportunities for its *operands* add them to the
@@ -3025,15 +3046,15 @@ bool InstCombiner::run() {
#ifndef NDEBUG
std::string OrigI;
#endif
- DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
- DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+ LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+ LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
if (Instruction *Result = visit(*I)) {
++NumCombined;
// Should we replace the old instruction with a new one?
if (Result != I) {
- DEBUG(dbgs() << "IC: Old = " << *I << '\n'
- << " New = " << *Result << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+ << " New = " << *Result << '\n');
if (I->getDebugLoc())
Result->setDebugLoc(I->getDebugLoc());
@@ -3060,8 +3081,8 @@ bool InstCombiner::run() {
eraseInstFromFunction(*I);
} else {
- DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
- << " New = " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+ << " New = " << *I << '\n');
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
@@ -3112,7 +3133,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
// DCE instruction if trivially dead.
if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
- DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
salvageDebugInfo(*Inst);
Inst->eraseFromParent();
MadeIRChange = true;
@@ -3123,8 +3144,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
if (!Inst->use_empty() &&
(Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
- DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
- << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
+ << '\n');
Inst->replaceAllUsesWith(C);
++NumConstProp;
if (isInstructionTriviallyDead(Inst, TLI))
@@ -3146,9 +3167,9 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
FoldRes = C;
if (FoldRes != C) {
- DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
- << "\n Old = " << *C
- << "\n New = " << *FoldRes << '\n');
+ LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
U = FoldRes;
MadeIRChange = true;
}
@@ -3191,7 +3212,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
return MadeIRChange;
}
-/// \brief Populate the IC worklist from a function, and prune any dead basic
+/// Populate the IC worklist from a function, and prune any dead basic
/// blocks discovered in the process.
///
/// This also does basic constant propagation and other forward fixing to make
@@ -3251,8 +3272,8 @@ static bool combineInstructionsOverFunction(
int Iteration = 0;
while (true) {
++Iteration;
- DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
- << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getName() << "\n");
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
@@ -3348,3 +3369,7 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) {
return new InstructionCombiningPass(ExpensiveCombines);
}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
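A minimal usage sketch for the new C binding; the surrounding calls are the standard C API pass-manager entry points, and Mod is a hypothetical LLVMModuleRef built elsewhere:

    LLVMPassManagerRef PM = LLVMCreatePassManager();
    LLVMAddInstructionCombiningPass(PM);
    LLVMRunPassManager(PM, Mod); /* run instcombine over Mod */
    LLVMDisposePassManager(PM);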
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8e39f24d819c..b3f659194558 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -25,6 +25,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/IR/Argument.h"
@@ -71,7 +72,6 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
@@ -107,10 +107,18 @@ static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+static const uint64_t kMyriadShadowScale = 5;
+static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
+static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
+static const uint64_t kMyriadTagShift = 29;
+static const uint64_t kMyriadDDRTag = 4;
+static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
+
// The shadow memory space is dynamically allocated.
static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
@@ -145,7 +153,7 @@ static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
static const int kMaxAsanStackMallocSizeClass = 10;
static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
-static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kAsanGenPrefix = "___asan_gen_";
static const char *const kODRGenPrefix = "__odr_asan_gen_";
static const char *const kSanCovGenPrefix = "__sancov_gen_";
static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_";
@@ -485,18 +493,17 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
bool IsX86 = TargetTriple.getArch() == Triple::x86;
bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS32 = TargetTriple.getArch() == Triple::mips ||
- TargetTriple.getArch() == Triple::mipsel;
- bool IsMIPS64 = TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el;
+ bool IsMIPS32 = TargetTriple.isMIPS32();
+ bool IsMIPS64 = TargetTriple.isMIPS64();
bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
bool IsWindows = TargetTriple.isOSWindows();
bool IsFuchsia = TargetTriple.isOSFuchsia();
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
ShadowMapping Mapping;
- Mapping.Scale = kDefaultShadowScale;
+ Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
if (ClMappingScale.getNumOccurrences() > 0) {
Mapping.Scale = ClMappingScale;
}
@@ -508,11 +515,18 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
Mapping.Offset = kMIPS32_ShadowOffset32;
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset32;
+ else if (IsNetBSD)
+ Mapping.Offset = kNetBSD_ShadowOffset32;
else if (IsIOS)
// If we're targeting iOS and x86, the binary is built for iOS simulator.
Mapping.Offset = IsX86 ? kIOSSimShadowOffset32 : kIOSShadowOffset32;
else if (IsWindows)
Mapping.Offset = kWindowsShadowOffset32;
+ else if (IsMyriad) {
+ uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
+ (kMyriadMemorySize32 >> Mapping.Scale));
+ Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
+ }
else
Mapping.Offset = kDefaultShadowOffset32;
} else { // LongSize == 64
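Worked arithmetic for the Myriad case, using the constants defined earlier in this file (Scale = kMyriadShadowScale = 5); this is only a reading of the formula, not code from the patch:

    // ShadowOffset   = 0x80000000 + 0x20000000 - (0x20000000 >> 5) = 0x9F000000
    // Mapping.Offset = 0x9F000000 - (0x80000000 >> 5)              = 0x9B000000
    // => shadow(addr) = (addr >> 5) + 0x9B000000, so the DDR base maps to the
    //    start of the shadow region at the top of DDR:
    static_assert((0x80000000u >> 5) + 0x9B000000u == 0x9F000000u,
                  "DDR base maps to the start of the shadow region");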
@@ -589,9 +603,10 @@ struct AddressSanitizer : public FunctionPass {
explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
bool UseAfterScope = false)
- : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover),
- UseAfterScope(UseAfterScope || ClUseAfterScope) {
+ : FunctionPass(ID), UseAfterScope(UseAfterScope || ClUseAfterScope) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
+ ClEnableKasan : CompileKernel;
initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
}
@@ -717,8 +732,7 @@ public:
explicit AddressSanitizerModule(bool CompileKernel = false,
bool Recover = false,
bool UseGlobalsGC = true)
- : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover),
+ : ModulePass(ID),
UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
// Not a typo: ClWithComdat is almost completely pointless without
// ClUseGlobalsGC (because then it only works on modules without
@@ -727,7 +741,12 @@ public:
// argument is designed as workaround. Therefore, disable both
// ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
// do globals-gc.
- UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ?
+ ClRecover : Recover;
+ this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
+ ClEnableKasan : CompileKernel;
+ }
bool runOnModule(Module &M) override;
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -869,7 +888,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
processStaticAllocas();
if (ClDebugStack) {
- DEBUG(dbgs() << F);
+ LLVM_DEBUG(dbgs() << F);
}
return true;
}
@@ -888,13 +907,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
void createDynamicAllocasInitStorage();
// ----------------------- Visitors.
- /// \brief Collect all Ret instructions.
+ /// Collect all Ret instructions.
void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }
- /// \brief Collect all Resume instructions.
+ /// Collect all Resume instructions.
void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
- /// \brief Collect all CatchReturnInst instructions.
+  /// Collect all CleanupReturnInst instructions.
void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
@@ -942,7 +961,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
// requested memory, but also left, partial and right redzones.
void handleDynamicAllocaCall(AllocaInst *AI);
- /// \brief Collect Alloca instructions we want (and can) handle.
+ /// Collect Alloca instructions we want (and can) handle.
void visitAllocaInst(AllocaInst &AI) {
if (!ASan.isInterestingAlloca(AI)) {
if (AI.isStaticAlloca()) {
@@ -963,7 +982,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
AllocaVec.push_back(&AI);
}
- /// \brief Collect lifetime intrinsic calls to check for use-after-scope
+ /// Collect lifetime intrinsic calls to check for use-after-scope
/// errors.
void visitIntrinsicInst(IntrinsicInst &II) {
Intrinsic::ID ID = II.getIntrinsicID();
@@ -1081,7 +1100,7 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
return Res;
}
-// \brief Create a constant for Str so that we can pass it to the run-time lib.
+// Create a constant for Str so that we can pass it to the run-time lib.
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
bool AllowMerging) {
Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
@@ -1095,7 +1114,7 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
return GV;
}
-/// \brief Create a global describing a source location.
+/// Create a global describing a source location.
static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
LocationMetadata MD) {
Constant *LocData[] = {
@@ -1111,7 +1130,7 @@ static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
return GV;
}
-/// \brief Check if \p G has been created by a trusted compiler pass.
+/// Check if \p G has been created by a trusted compiler pass.
static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
// Do not instrument asan globals.
if (G->getName().startswith(kAsanGenPrefix) ||
@@ -1487,6 +1506,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
uint32_t TypeSize, bool IsWrite,
Value *SizeArgument, bool UseCalls,
uint32_t Exp) {
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+
IRBuilder<> IRB(InsertBefore);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
@@ -1501,6 +1522,23 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
return;
}
+ if (IsMyriad) {
+ // Strip the cache bit and do range check.
+ // AddrLong &= ~kMyriadCacheBitMask32
+ AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
+ // Tag = AddrLong >> kMyriadTagShift
+ Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
+ // Tag == kMyriadDDRTag
+ Value *TagCheck =
+ IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
+
+ TerminatorInst *TagCheckTerm = SplitBlockAndInsertIfThen(
+ TagCheck, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
+ IRB.SetInsertPoint(TagCheckTerm);
+ InsertBefore = TagCheckTerm;
+ }
+
Type *ShadowTy =
IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
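A hedged reading of the constants behind the new guard:

    // Clearing kMyriadCacheBitMask32 (bit 30) folds the cached DDR alias back
    // onto the uncached range; addr >> kMyriadTagShift (29) then yields
    // kMyriadDDRTag (4) exactly for addr in [0x80000000, 0xA0000000), i.e. DDR.
    // Non-DDR accesses branch around the shadow check entirely.
    static_assert((0x80000000u >> 29) == 4 && (0x9FFFFFFFu >> 29) == 4,
                  "the DDR range carries tag 4");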
@@ -1609,7 +1647,7 @@ void AddressSanitizerModule::createInitializerPoisonCalls(
bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
Type *Ty = G->getValueType();
- DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
if (GlobalsMD.get(G).IsBlacklisted) return false;
if (!Ty->isSized()) return false;
@@ -1646,12 +1684,17 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
return false;
}
- // Callbacks put into the CRT initializer/terminator sections
- // should not be instrumented.
+ // On COFF, if the section name contains '$', it is highly likely that the
+ // user is using section sorting to create an array of globals similar to
+ // the way initialization callbacks are registered in .init_array and
+ // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
+ // to such globals is counterproductive, because the intent is that they
+ // will form an array, and out-of-bounds accesses are expected.
// See https://github.com/google/sanitizers/issues/305
// and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
- if (Section.startswith(".CRT")) {
- DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n");
+ if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
+ LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
+ << *G << "\n");
return false;
}
@@ -1668,7 +1711,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
// them.
if (ParsedSegment == "__OBJC" ||
(ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
- DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
return false;
}
// See https://github.com/google/sanitizers/issues/32
@@ -1680,13 +1723,13 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
// Therefore there's no point in placing redzones into __DATA,__cfstring.
// Moreover, it causes the linker to crash on OS X 10.7
if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
- DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
return false;
}
// The linker merges the contents of cstring_literals and removes the
// trailing zeroes.
if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
- DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+ LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
return false;
}
}
@@ -2153,11 +2196,21 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
- DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+ LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
Initializers[i] = Initializer;
}
+  // Add instrumented globals to the llvm.compiler.used list to keep LTO's
+  // ConstantMerge from folding them together.
+ SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = NewGlobals[i];
+ if (G->getName().empty()) continue;
+ GlobalsToAddToUsedList.push_back(G);
+ }
+ appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
+
std::string ELFUniqueModuleId =
(UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
: "";
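For reference, appendToCompilerUsed (llvm/Transforms/Utils/ModuleUtils.h) appends the globals to an array that lowers roughly to the shape below; members survive compiler-level merging (here, LTO's ConstantMerge) without the linker-level retention that llvm.used implies:

    // @llvm.compiler.used = appending global [N x i8*]
    //     [i8* bitcast (... @g1 to i8*), ...], section "llvm.metadata"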
@@ -2177,7 +2230,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
if (HasDynamicallyInitializedGlobals)
createInitializerPoisonCalls(M, ModuleName);
- DEBUG(dbgs() << M);
+ LLVM_DEBUG(dbgs() << M);
return true;
}
@@ -2247,7 +2300,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
const std::string TypeStr = AccessIsWrite ? "store" : "load";
const std::string ExpStr = Exp ? "exp_" : "";
- const std::string SuffixStr = CompileKernel ? "N" : "_n";
const std::string EndingStr = Recover ? "_noabort" : "";
SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
@@ -2259,8 +2311,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
}
AsanErrorCallbackSized[AccessIsWrite][Exp] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
- EndingStr,
+ kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
FunctionType::get(IRB.getVoidTy(), Args2, false)));
AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
@@ -2420,7 +2471,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// Leave if the function doesn't need instrumentation.
if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
- DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+ LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
initializeCallbacks(*F.getParent());
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -2435,7 +2486,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
- SmallSet<Value *, 16> TempsToInstrument;
+ SmallPtrSet<Value *, 16> TempsToInstrument;
SmallVector<Instruction *, 16> ToInstrument;
SmallVector<Instruction *, 8> NoReturnCalls;
SmallVector<BasicBlock *, 16> AllBlocks;
@@ -2494,7 +2545,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
}
bool UseCalls =
- CompileKernel ||
(ClInstrumentationWithCallsThreshold >= 0 &&
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -2534,8 +2584,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
FunctionModified = true;
- DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
- << F << "\n");
+ LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
return FunctionModified;
}
@@ -2710,7 +2760,7 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
Arg.replaceAllUsesWith(AI);
uint64_t AllocSize = DL.getTypeAllocSize(Ty);
- IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
+ IRB.CreateMemCpy(AI, Align, &Arg, Align, AllocSize);
}
}
}
@@ -2851,7 +2901,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
}
auto DescriptionString = ComputeASanStackFrameDescription(SVD);
- DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+ LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
uint64_t LocalStackSize = L.FrameSize;
bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
LocalStackSize <= kMaxStackMallocSize;
@@ -3086,7 +3136,8 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
} else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) {
Res = findAllocaForValue(EP->getPointerOperand());
} else {
- DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V
+ << "\n");
}
if (Res) AllocaForValue[V] = Res;
return Res;
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index be9a22a8681b..e13db08e263c 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
@@ -59,11 +60,11 @@ template <typename GetTrapBBT>
static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
const DataLayout &DL, TargetLibraryInfo &TLI,
ObjectSizeOffsetEvaluator &ObjSizeEval,
- BuilderTy &IRB,
- GetTrapBBT GetTrapBB) {
+ BuilderTy &IRB, GetTrapBBT GetTrapBB,
+ ScalarEvolution &SE) {
uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
- DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
- << " bytes\n");
+ LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
@@ -79,6 +80,10 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
Type *IntTy = DL.getIntPtrType(Ptr->getType());
Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+ auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
+ auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
+ auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
+
// three checks are required to ensure safety:
// . Offset >= 0 (since the offset is given from the base ptr)
// . Size >= Offset (unsigned)
@@ -87,10 +92,17 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
// optimization: if Size >= 0 (signed), skip 1st check
// FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
Value *ObjSize = IRB.CreateSub(Size, Offset);
- Value *Cmp2 = IRB.CreateICmpULT(Size, Offset);
- Value *Cmp3 = IRB.CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(Size, Offset);
+ Value *Cmp3 = SizeRange.sub(OffsetRange)
+ .getUnsignedMin()
+ .uge(NeededSizeRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
Value *Or = IRB.CreateOr(Cmp2, Cmp3);
- if (!SizeCI || SizeCI->getValue().slt(0)) {
+ if ((!SizeCI || SizeCI->getValue().slt(0)) &&
+ !SizeRange.getSignedMin().isNonNegative()) {
Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
Or = IRB.CreateOr(Cmp1, Or);
}
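A hedged numeric illustration of the new range-based folding: suppose ScalarEvolution proves Size in [64, 128] and Offset in [0, 16] for an access with NeededSize 4; then:

    // SizeRange.getUnsignedMin() = 64 >= OffsetRange.getUnsignedMax() = 16
    //   => Cmp2 (Size u< Offset) is statically false
    // SizeRange.sub(OffsetRange).getUnsignedMin() >= 48 >= 4
    //   => Cmp3 (Size - Offset u< NeededSize) is statically false
    // Both compares become i1 false, and the trap branch folds away.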
@@ -123,7 +135,8 @@ static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
return true;
}
-static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
+static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
+ ScalarEvolution &SE) {
const DataLayout &DL = F.getParent()->getDataLayout();
ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(),
/*RoundToAlign=*/true);
@@ -168,19 +181,19 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
MadeChange |= instrumentMemAccess(LI->getPointerOperand(), LI, DL, TLI,
- ObjSizeEval, IRB, GetTrapBB);
+ ObjSizeEval, IRB, GetTrapBB, SE);
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
MadeChange |=
instrumentMemAccess(SI->getPointerOperand(), SI->getValueOperand(),
- DL, TLI, ObjSizeEval, IRB, GetTrapBB);
+ DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
MadeChange |=
instrumentMemAccess(AI->getPointerOperand(), AI->getCompareOperand(),
- DL, TLI, ObjSizeEval, IRB, GetTrapBB);
+ DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
MadeChange |=
instrumentMemAccess(AI->getPointerOperand(), AI->getValOperand(), DL,
- TLI, ObjSizeEval, IRB, GetTrapBB);
+ TLI, ObjSizeEval, IRB, GetTrapBB, SE);
} else {
llvm_unreachable("unknown Instruction type");
}
@@ -190,8 +203,9 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI) {
PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- if (!addBoundsChecking(F, TLI))
+ if (!addBoundsChecking(F, TLI, SE))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -207,11 +221,13 @@ struct BoundsCheckingLegacyPass : public FunctionPass {
bool runOnFunction(Function &F) override {
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return addBoundsChecking(F, TLI);
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return addBoundsChecking(F, TLI, SE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
}
};
} // namespace
diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h
index 075e5672cff8..cc9b149d0b6a 100644
--- a/lib/Transforms/Instrumentation/CFGMST.h
+++ b/lib/Transforms/Instrumentation/CFGMST.h
@@ -31,7 +31,7 @@
namespace llvm {
-/// \brief An union-find based Minimum Spanning Tree for CFG
+/// A union-find based Minimum Spanning Tree for CFG
///
/// Implements a Union-find algorithm to compute Minimum Spanning Tree
/// for a given CFG.
@@ -97,7 +97,7 @@ public:
// Edges with large weight will be put into MST first so they are less likely
// to be instrumented.
void buildEdges() {
- DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
const BasicBlock *Entry = &(F.getEntryBlock());
uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
@@ -107,8 +107,8 @@ public:
// Add a fake edge to the entry.
EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
- DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
- << " w = " << EntryWeight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
+ << " w = " << EntryWeight << "\n");
// Special handling for single BB functions.
if (succ_empty(Entry)) {
@@ -138,8 +138,8 @@ public:
Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
auto *E = &addEdge(&*BB, TargetBB, Weight);
E->IsCritical = Critical;
- DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+ << TargetBB->getName() << " w=" << Weight << "\n");
// Keep track of entry/exit edges:
if (&*BB == Entry) {
@@ -164,8 +164,8 @@ public:
MaxExitOutWeight = BBWeight;
ExitOutgoing = ExitO;
}
- DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
- << " w = " << BBWeight << "\n");
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
+ << " w = " << BBWeight << "\n");
}
}
diff --git a/lib/Transforms/Instrumentation/CGProfile.cpp b/lib/Transforms/Instrumentation/CGProfile.cpp
new file mode 100644
index 000000000000..9606b3da2475
--- /dev/null
+++ b/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -0,0 +1,100 @@
+//===-- CGProfile.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Transforms/Instrumentation.h"
+
+#include <array>
+
+using namespace llvm;
+
+PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ InstrProfSymtab Symtab;
+ auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
+ Function *CalledF, uint64_t NewCount) {
+ if (!CalledF || !TTI.isLoweredToCall(CalledF))
+ return;
+ uint64_t &Count = Counts[std::make_pair(F, CalledF)];
+ Count = SaturatingAdd(Count, NewCount);
+ };
+ // Ignore error here. Indirect calls are ignored if this fails.
+ (void)(bool)Symtab.create(M);
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ if (BFI.getEntryFreq() == 0)
+ continue;
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ for (auto &BB : F) {
+ Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
+ if (!BBCount)
+ continue;
+ for (auto &I : BB) {
+ CallSite CS(&I);
+ if (!CS)
+ continue;
+ if (CS.isIndirectCall()) {
+ InstrProfValueData ValueData[8];
+ uint32_t ActualNumValueData;
+ uint64_t TotalC;
+ if (!getValueProfDataFromInst(*CS.getInstruction(),
+ IPVK_IndirectCallTarget, 8, ValueData,
+ ActualNumValueData, TotalC))
+ continue;
+ for (const auto &VD :
+ ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
+ UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
+ }
+ continue;
+ }
+ UpdateCounts(TTI, &F, CS.getCalledFunction(), *BBCount);
+ }
+ }
+ }
+
+ addModuleFlags(M, Counts);
+
+ return PreservedAnalyses::all();
+}
+
+void CGProfilePass::addModuleFlags(
+ Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) const {
+ if (Counts.empty())
+ return;
+
+ LLVMContext &Context = M.getContext();
+ MDBuilder MDB(Context);
+ std::vector<Metadata *> Nodes;
+
+ for (auto E : Counts) {
+ SmallVector<Metadata *, 3> Vals;
+ Vals.push_back(ValueAsMetadata::get(E.first.first));
+ Vals.push_back(ValueAsMetadata::get(E.first.second));
+ Vals.push_back(MDB.createConstant(
+ ConstantInt::get(Type::getInt64Ty(Context), E.second)));
+ Nodes.push_back(MDNode::get(Context, Vals));
+ }
+
+ M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+}
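The emitted flag has roughly the shape below in the final module (sketch; counts and function names are hypothetical):

    // !llvm.module.flags = !{..., !0}
    // !0 = !{i32 5, !"CG Profile", !1}     ; i32 5 == Module::Append
    // !1 = !{!2, ...}                      ; one node per (caller, callee) edge
    // !2 = !{void ()* @caller, void ()* @callee, i64 42}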
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 66fdcb3ccc49..5d0084823190 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMInstrumentation
AddressSanitizer.cpp
BoundsChecking.cpp
+ CGProfile.cpp
DataFlowSanitizer.cpp
GCOVProfiling.cpp
MemorySanitizer.cpp
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 09bcbb282653..bb0e4379d1a8 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -56,6 +56,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -90,7 +91,6 @@
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -211,6 +211,72 @@ class DFSanABIList {
}
};
+/// TransformedFunction is used to express the result of transforming one
+/// function type into another. This struct is immutable. It holds metadata
+/// useful for updating calls of the old function to the new type.
+struct TransformedFunction {
+ TransformedFunction(FunctionType* OriginalType,
+ FunctionType* TransformedType,
+ std::vector<unsigned> ArgumentIndexMapping)
+ : OriginalType(OriginalType),
+ TransformedType(TransformedType),
+ ArgumentIndexMapping(ArgumentIndexMapping) {}
+
+ // Disallow copies.
+ TransformedFunction(const TransformedFunction&) = delete;
+ TransformedFunction& operator=(const TransformedFunction&) = delete;
+
+ // Allow moves.
+ TransformedFunction(TransformedFunction&&) = default;
+ TransformedFunction& operator=(TransformedFunction&&) = default;
+
+ /// Type of the function before the transformation.
+ FunctionType* const OriginalType;
+
+ /// Type of the function after the transformation.
+ FunctionType* const TransformedType;
+
+ /// Transforming a function may change the position of arguments. This
+ /// member records the mapping from each argument's old position to its new
+ /// position. Argument positions are zero-indexed. If the transformation
+ /// from F to F' made the first argument of F into the third argument of F',
+ /// then ArgumentIndexMapping[0] will equal 2.
+ const std::vector<unsigned> ArgumentIndexMapping;
+};
+
+/// Given function attributes from a call site for the original function,
+/// return function attributes appropriate for a call to the transformed
+/// function.
+AttributeList TransformFunctionAttributes(
+ const TransformedFunction& TransformedFunction,
+ LLVMContext& Ctx, AttributeList CallSiteAttrs) {
+
+ // Construct a vector of AttributeSet for each function argument.
+ std::vector<llvm::AttributeSet> ArgumentAttributes(
+ TransformedFunction.TransformedType->getNumParams());
+
+ // Copy attributes from the parameter of the original function to the
+ // transformed version. 'ArgumentIndexMapping' holds the mapping from
+ // old argument position to new.
+ for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
+ i < ie; ++i) {
+ unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
+ ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
+ }
+
+ // Copy annotations on varargs arguments.
+ for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
+ ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
+ ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
+ }
+
+ return AttributeList::get(
+ Ctx,
+ CallSiteAttrs.getFnAttributes(),
+ CallSiteAttrs.getRetAttributes(),
+ llvm::makeArrayRef(ArgumentAttributes));
+}
+
class DataFlowSanitizer : public ModulePass {
friend struct DFSanFunction;
friend class DFSanVisitor;
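A worked example of the mapping for a hypothetical signature void f(void (*cb)(i8*), int x): getCustomFunctionType (below) expands the function-pointer parameter into a trampoline pointer plus an i8* context, so x shifts right by one slot:

    // Original params:     0: cb           1: x
    // Transformed params:  0: trampoline*  1: i8* ctx  2: x  3...: shadow args
    // ArgumentIndexMapping = {0, 2}
    // TransformFunctionAttributes() then copies x's call-site attributes from
    // slot 1 of the original call to slot 2 of the __dfsw_ call.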
@@ -294,7 +360,7 @@ class DataFlowSanitizer : public ModulePass {
bool isInstrumented(const GlobalAlias *GA);
FunctionType *getArgsFunctionType(FunctionType *T);
FunctionType *getTrampolineFunctionType(FunctionType *T);
- FunctionType *getCustomFunctionType(FunctionType *T);
+ TransformedFunction getCustomFunctionType(FunctionType *T);
InstrumentedABI getInstrumentedABI();
WrapperKind getWrapperKind(Function *F);
void addGlobalNamePrefix(GlobalValue *GV);
@@ -437,17 +503,25 @@ FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
return FunctionType::get(T->getReturnType(), ArgTypes, false);
}
-FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
SmallVector<Type *, 4> ArgTypes;
- for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
- i != e; ++i) {
+
+ // Some parameters of the custom function being constructed are
+ // parameters of T. Record the mapping from parameters of T to
+ // parameters of the custom function, so that parameter attributes
+ // at call sites can be updated.
+ std::vector<unsigned> ArgumentIndexMapping;
+ for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
+ Type* param_type = T->getParamType(i);
FunctionType *FT;
- if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
- *i)->getElementType()))) {
+ if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
+ cast<PointerType>(param_type)->getElementType()))) {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
} else {
- ArgTypes.push_back(*i);
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(param_type);
}
}
for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
@@ -457,14 +531,15 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
ArgTypes.push_back(ShadowPtrTy);
- return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg());
+ return TransformedFunction(
+ T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
+ ArgumentIndexMapping);
}
bool DataFlowSanitizer::doInitialization(Module &M) {
Triple TargetTriple(M.getTargetTriple());
bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS64 = TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el;
+ bool IsMIPS64 = TargetTriple.isMIPS64();
bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
TargetTriple.getArch() == Triple::aarch64_be;
@@ -783,9 +858,17 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
FunctionType *NewFT = getInstrumentedABI() == IA_Args
? getArgsFunctionType(FT)
: FT;
+
+ // If the function being wrapped has local linkage, then preserve the
+ // function's linkage in the wrapper function.
+ GlobalValue::LinkageTypes wrapperLinkage =
+ F.hasLocalLinkage()
+ ? F.getLinkage()
+ : GlobalValue::LinkOnceODRLinkage;
+
Function *NewF = buildWrapperFunction(
&F, std::string("dfsw$") + std::string(F.getName()),
- GlobalValue::LinkOnceODRLinkage, NewFT);
+ wrapperLinkage, NewFT);
if (getInstrumentedABI() == IA_TLS)
NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
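
(Editor's note on the linkage choice, inferred from the code above.)

  // The dfsw$ wrapper for an external function may be emitted by every module
  // that calls it, so LinkOnceODR lets the linker deduplicate the copies.
  // A wrapper around an internal/private function must keep local linkage:
  // two translation units can define distinct statics with the same name,
  // and exporting their wrappers would collide at link time.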
@@ -1382,20 +1465,19 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
Value *LenShadow = IRB.CreateMul(
I.getLength(),
ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
- Value *AlignShadow;
- if (ClPreserveAlignment) {
- AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
- ConstantInt::get(I.getAlignmentCst()->getType(),
- DFSF.DFS.ShadowWidth / 8));
- } else {
- AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
- DFSF.DFS.ShadowWidth / 8);
- }
Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
- IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow,
- AlignShadow, I.getVolatileCst()});
+ auto *MTI = cast<MemTransferInst>(
+ IRB.CreateCall(I.getCalledValue(),
+ {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
+ if (ClPreserveAlignment) {
+ MTI->setDestAlignment(I.getDestAlignment() * (DFSF.DFS.ShadowWidth / 8));
+ MTI->setSourceAlignment(I.getSourceAlignment() * (DFSF.DFS.ShadowWidth / 8));
+ } else {
+ MTI->setDestAlignment(DFSF.DFS.ShadowWidth / 8);
+ MTI->setSourceAlignment(DFSF.DFS.ShadowWidth / 8);
+ }
}
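
(Editor's sketch of the alignment math, assuming DFSan's default 16-bit
shadow labels, i.e. ShadowWidth / 8 == 2.)

  // A memcpy of N bytes becomes a shadow memcpy of LenShadow == 2*N bytes.
  // With ClPreserveAlignment, dest/src alignment A scales to 2*A, matching
  // the 2x expansion of addresses in shadow space; otherwise the shadow copy
  // only claims the minimal 2-byte alignment of a label.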
void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
@@ -1460,11 +1542,11 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
// wrapper.
if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
FunctionType *FT = F->getFunctionType();
- FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+ TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
std::string CustomFName = "__dfsw_";
CustomFName += F->getName();
- Constant *CustomF =
- DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+ Constant *CustomF = DFSF.DFS.Mod->getOrInsertFunction(
+ CustomFName, CustomFn.TransformedType);
if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
CustomFn->copyAttributesFrom(F);
@@ -1532,7 +1614,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
CustomCI->setCallingConv(CI->getCallingConv());
- CustomCI->setAttributes(CI->getAttributes());
+ CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
+ CI->getContext(), CI->getAttributes()));
// Update the parameter attributes of the custom call instruction to
// zero extend the shadow parameters. This is required for targets
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 6864d295525c..33f220a893df 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -33,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -537,7 +537,7 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
bool EfficiencySanitizer::initOnModule(Module &M) {
Triple TargetTriple(M.getTargetTriple());
- if (TargetTriple.getArch() == Triple::mips64 || TargetTriple.getArch() == Triple::mips64el)
+ if (TargetTriple.isMIPS64())
ShadowParams = ShadowParams40;
else
ShadowParams = ShadowParams47;
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 67ca8172b0d5..acd27c2e226f 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -17,11 +17,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
@@ -35,8 +37,8 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/GCOVProfiler.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <memory>
@@ -84,7 +86,7 @@ public:
ReversedVersion[3] = Options.Version[0];
ReversedVersion[4] = '\0';
}
- bool runOnModule(Module &M);
+ bool runOnModule(Module &M, const TargetLibraryInfo &TLI);
private:
// Create the .gcno files for the Module based on DebugInfo.
@@ -130,6 +132,7 @@ private:
SmallVector<uint32_t, 4> FileChecksums;
Module *M;
+ const TargetLibraryInfo *TLI;
LLVMContext *Ctx;
SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
};
@@ -145,7 +148,14 @@ public:
}
StringRef getPassName() const override { return "GCOV Profiler"; }
- bool runOnModule(Module &M) override { return Profiler.runOnModule(M); }
+ bool runOnModule(Module &M) override {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return Profiler.runOnModule(M, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
private:
GCOVProfiler Profiler;
@@ -153,8 +163,13 @@ private:
}
char GCOVProfilerLegacyPass::ID = 0;
-INITIALIZE_PASS(GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
+INITIALIZE_PASS_BEGIN(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
return new GCOVProfilerLegacyPass(Options);
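
(Editor's note.) The BEGIN/DEPENDENCY/END macro form exists to declare the
new analysis edge to the legacy pass manager:

  // INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) guarantees the
  // wrapper pass is registered before this pass, so the
  // getAnalysis<TargetLibraryInfoWrapperPass>() call in runOnModule can be
  // satisfied; the single INITIALIZE_PASS macro has no way to express this.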
@@ -272,7 +287,7 @@ namespace {
write(Len);
write(Number);
- std::sort(
+ llvm::sort(
SortedLinesByFile.begin(), SortedLinesByFile.end(),
[](StringMapEntry<GCOVLines> *LHS, StringMapEntry<GCOVLines> *RHS) {
return LHS->getKey() < RHS->getKey();
@@ -315,7 +330,7 @@ namespace {
ReturnBlock(1, os) {
this->os = os;
- DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+ LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
uint32_t i = 0;
for (auto &BB : *F) {
@@ -383,7 +398,7 @@ namespace {
for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
write(0); // No flags on our blocks.
}
- DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+ LLVM_DEBUG(dbgs() << Blocks.size() << " blocks.\n");
// Emit edges between blocks.
if (Blocks.empty()) return;
@@ -396,8 +411,8 @@ namespace {
write(Block.OutEdges.size() * 2 + 1);
write(Block.Number);
for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
- DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number
- << "\n");
+ LLVM_DEBUG(dbgs() << Block.Number << " -> "
+ << Block.OutEdges[i]->Number << "\n");
write(Block.OutEdges[i]->Number);
write(0); // no flags
}
@@ -461,8 +476,9 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
return CurPath.str();
}
-bool GCOVProfiler::runOnModule(Module &M) {
+bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) {
this->M = &M;
+ this->TLI = &TLI;
Ctx = &M.getContext();
if (Options.EmitNotes) emitProfileNotes();
@@ -475,7 +491,8 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M,
GCOVProfiler Profiler(GCOVOpts);
- if (!Profiler.runOnModule(M))
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!Profiler.runOnModule(M, TLI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -503,11 +520,11 @@ static bool functionHasLines(Function &F) {
return false;
}
-static bool isUsingFuncletBasedEH(Function &F) {
+static bool isUsingScopeBasedEH(Function &F) {
if (!F.hasPersonalityFn()) return false;
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
- return isFuncletEHPersonality(Personality);
+ return isScopedEHPersonality(Personality);
}
static bool shouldKeepInEntry(BasicBlock::iterator It) {
@@ -550,8 +567,8 @@ void GCOVProfiler::emitProfileNotes() {
DISubprogram *SP = F.getSubprogram();
if (!SP) continue;
if (!functionHasLines(F)) continue;
- // TODO: Functions using funclet-based EH are currently not supported.
- if (isUsingFuncletBasedEH(F)) continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
// gcov expects every function to start with an entry block that has a
// single successor, so split the entry block to make sure of that.
@@ -629,8 +646,8 @@ bool GCOVProfiler::emitProfileArcs() {
DISubprogram *SP = F.getSubprogram();
if (!SP) continue;
if (!functionHasLines(F)) continue;
- // TODO: Functions using funclet-based EH are currently not supported.
- if (isUsingFuncletBasedEH(F)) continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
if (!Result) Result = true;
unsigned Edges = 0;
@@ -807,7 +824,12 @@ Constant *GCOVProfiler::getStartFileFunc() {
Type::getInt32Ty(*Ctx), // uint32_t checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ FunRes->addParamAttr(2, AK);
+ return Res;
}
Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
@@ -830,7 +852,15 @@ Constant *GCOVProfiler::getEmitFunctionFunc() {
Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ FunRes->addParamAttr(0, AK);
+ FunRes->addParamAttr(2, AK);
+ FunRes->addParamAttr(3, AK);
+ FunRes->addParamAttr(4, AK);
+ }
+ return Res;
}
Constant *GCOVProfiler::getEmitArcsFunc() {
@@ -839,7 +869,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
Type::getInt64PtrTy(*Ctx), // uint64_t *counters
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+ auto *Res = M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+ if (Function *FunRes = dyn_cast<Function>(Res))
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ FunRes->addParamAttr(0, AK);
+ return Res;
}
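
(Editor's sketch of the TLI hook used by these three getters; behavior as of
this LLVM version, summarized from TargetLibraryInfo.)

  // getExtAttrForI32Param(Signed) reports the attribute the target ABI
  // requires on i32 arguments, or None:
  //   MIPS:                      SExt (even when Signed == false)
  //   PPC64 / SPARCv9 / SystemZ: Signed ? SExt : ZExt
  //   most others (e.g. x86-64): None, so the addParamAttr is skipped
  // The call sites built in insertCounterWriteout apply the same attribute,
  // keeping the declaration and its calls ABI-consistent.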
Constant *GCOVProfiler::getSummaryInfoFunc() {
@@ -886,46 +920,205 @@ Function *GCOVProfiler::insertCounterWriteout(
Constant *SummaryInfo = getSummaryInfoFunc();
Constant *EndFile = getEndFileFunc();
- NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
- if (CU_Nodes) {
- for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+ NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CUNodes) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
+ // Collect the relevant data into a large constant data structure that we can
+ // walk to write out everything.
+ StructType *StartFileCallArgsTy = StructType::create(
+ {Builder.getInt8PtrTy(), Builder.getInt8PtrTy(), Builder.getInt32Ty()});
+ StructType *EmitFunctionCallArgsTy = StructType::create(
+ {Builder.getInt32Ty(), Builder.getInt8PtrTy(), Builder.getInt32Ty(),
+ Builder.getInt8Ty(), Builder.getInt32Ty()});
+ StructType *EmitArcsCallArgsTy = StructType::create(
+ {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()});
+ StructType *FileInfoTy =
+ StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
+ EmitFunctionCallArgsTy->getPointerTo(),
+ EmitArcsCallArgsTy->getPointerTo()});
+
+ Constant *Zero32 = Builder.getInt32(0);
+ // Build an explicit array of two zeros for use in ConstantExpr GEP building.
+ Constant *TwoZero32s[] = {Zero32, Zero32};
+
+ SmallVector<Constant *, 8> FileInfos;
+ for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
+ auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
- std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
- uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
- Builder.CreateCall(StartFile,
- {Builder.CreateGlobalStringPtr(FilenameGcda),
- Builder.CreateGlobalStringPtr(ReversedVersion),
- Builder.getInt32(CfgChecksum)});
- for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
- auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
- uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
- Builder.CreateCall(
- EmitFunction,
- {Builder.getInt32(j),
- Options.FunctionNamesInData
- ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
- : Constant::getNullValue(Builder.getInt8PtrTy()),
- Builder.getInt32(FuncChecksum),
- Builder.getInt8(Options.UseCfgChecksum),
- Builder.getInt32(CfgChecksum)});
-
- GlobalVariable *GV = CountersBySP[j].first;
- unsigned Arcs =
- cast<ArrayType>(GV->getValueType())->getNumElements();
- Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs),
- Builder.CreateConstGEP2_64(GV, 0, 0)});
- }
- Builder.CreateCall(SummaryInfo, {});
- Builder.CreateCall(EndFile, {});
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
+ uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
+ auto *StartFileCallArgs = ConstantStruct::get(
+ StartFileCallArgsTy, {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.CreateGlobalStringPtr(ReversedVersion),
+ Builder.getInt32(CfgChecksum)});
+
+ SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
+ SmallVector<Constant *, 8> EmitArcsCallArgsArray;
+ for (int j : llvm::seq<int>(0, CountersBySP.size())) {
+ auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
+ uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
+ EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
+ EmitFunctionCallArgsTy,
+ {Builder.getInt32(j),
+ Options.FunctionNamesInData
+ ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
+ : Constant::getNullValue(Builder.getInt8PtrTy()),
+ Builder.getInt32(FuncChecksum),
+ Builder.getInt8(Options.UseCfgChecksum),
+ Builder.getInt32(CfgChecksum)}));
+
+ GlobalVariable *GV = CountersBySP[j].first;
+ unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
+ EmitArcsCallArgsArray.push_back(ConstantStruct::get(
+ EmitArcsCallArgsTy,
+ {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
+ GV->getValueType(), GV, TwoZero32s)}));
}
+ // Create global arrays for the two emit calls.
+ int CountersSize = CountersBySP.size();
+ assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
+ "Mismatched array size!");
+ assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
+ "Mismatched array size!");
+ auto *EmitFunctionCallArgsArrayTy =
+ ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
+ auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
+ *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
+ auto *EmitArcsCallArgsArrayTy =
+ ArrayType::get(EmitArcsCallArgsTy, CountersSize);
+ EmitFunctionCallArgsArrayGV->setUnnamedAddr(
+ GlobalValue::UnnamedAddr::Global);
+ auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
+ *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
+ EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ FileInfos.push_back(ConstantStruct::get(
+ FileInfoTy,
+ {StartFileCallArgs, Builder.getInt32(CountersSize),
+ ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArrayGV,
+ TwoZero32s),
+ ConstantExpr::getInBoundsGetElementPtr(
+ EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
}
+ // If we didn't find anything to actually emit, bail on out.
+ if (FileInfos.empty()) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // To simplify code, we cap the number of file infos we write out to fit
+ // easily in a 32-bit signed integer. This gives consistent behavior between
+ // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
+ // operations on 32-bit systems. It also seems unreasonable to try to handle
+ // more than 2 billion files.
+ if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
+ FileInfos.resize(INT_MAX);
+
+ // Create a global for the entire data structure so we can walk it more
+ // easily.
+ auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
+ auto *FileInfoArrayGV = new GlobalVariable(
+ *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
+ ConstantArray::get(FileInfoArrayTy, FileInfos),
+ "__llvm_internal_gcov_emit_file_info");
+ FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Create the CFG for walking this data structure.
+ auto *FileLoopHeader =
+ BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
+ auto *CounterLoopHeader =
+ BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
+ auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
+ auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
+
+ // We always have at least one file, so just branch to the header.
+ Builder.CreateBr(FileLoopHeader);
+
+ // The index into the files structure is our loop induction variable.
+ Builder.SetInsertPoint(FileLoopHeader);
+ PHINode *IV =
+ Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2);
+ IV->addIncoming(Builder.getInt32(0), BB);
+ auto *FileInfoPtr =
+ Builder.CreateInBoundsGEP(FileInfoArrayGV, {Builder.getInt32(0), IV});
+ auto *StartFileCallArgsPtr = Builder.CreateStructGEP(FileInfoPtr, 0);
+ auto *StartFileCall = Builder.CreateCall(
+ StartFile,
+ {Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 1)),
+ Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 2))});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ StartFileCall->addParamAttr(2, AK);
+ auto *NumCounters =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 1));
+ auto *EmitFunctionCallArgsArray =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 2));
+ auto *EmitArcsCallArgsArray =
+ Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 3));
+ auto *EnterCounterLoopCond =
+ Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
+ Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
+
+ Builder.SetInsertPoint(CounterLoopHeader);
+ auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2);
+ JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
+ auto *EmitFunctionCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitFunctionCallArgsArray, {JV});
+ auto *EmitFunctionCall = Builder.CreateCall(
+ EmitFunction,
+ {Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 1)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 2)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 3)),
+ Builder.CreateLoad(
+ Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 4))});
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ EmitFunctionCall->addParamAttr(0, AK);
+ EmitFunctionCall->addParamAttr(2, AK);
+ EmitFunctionCall->addParamAttr(3, AK);
+ EmitFunctionCall->addParamAttr(4, AK);
+ }
+ auto *EmitArcsCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitArcsCallArgsArray, {JV});
+ auto *EmitArcsCall = Builder.CreateCall(
+ EmitArcs,
+ {Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 0)),
+ Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 1))});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ EmitArcsCall->addParamAttr(0, AK);
+ auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
+ auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
+ Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
+ JV->addIncoming(NextJV, CounterLoopHeader);
+
+ Builder.SetInsertPoint(FileLoopLatch);
+ Builder.CreateCall(SummaryInfo, {});
+ Builder.CreateCall(EndFile, {});
+ auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1));
+ auto *FileLoopCond =
+ Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
+ Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
+ IV->addIncoming(NextIV, FileLoopLatch);
+
+ Builder.SetInsertPoint(ExitBB);
Builder.CreateRetVoid();
+
return WriteoutF;
}
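
(Editor's sketch.) The net effect of the rewrite, as pseudo-C++ with
editorial field names standing in for the FileInfoTy/EmitFunctionCallArgsTy
structs built above:

  for (int32_t i = 0; i < NumFileInfos; ++i) {        // file.loop.header
    llvm_gcda_start_file(FI[i].Filename, FI[i].Version, FI[i].CfgChecksum);
    for (int32_t j = 0; j < FI[i].NumCounters; ++j) { // counter.loop.header
      llvm_gcda_emit_function(/* FI[i].EmitFunctionArgs[j]... */);
      llvm_gcda_emit_arcs(/* FI[i].EmitArcsArgs[j]... */);
    }
    llvm_gcda_summary_info();                         // file.loop.latch
    llvm_gcda_end_file();
  }
  // Call arguments now live in constant global arrays, so the emitted
  // writeout code is O(1) in size instead of O(#functions) straight-line
  // calls per compile unit.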
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8e2833d22032..d62598bb5d4f 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -22,10 +22,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/IR/Function.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
@@ -34,6 +31,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -41,8 +39,11 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;
@@ -51,10 +52,15 @@ using namespace llvm;
static const char *const kHwasanModuleCtorName = "hwasan.module_ctor";
static const char *const kHwasanInitName = "__hwasan_init";
+static const char *const kHwasanShadowMemoryDynamicAddress =
+ "__hwasan_shadow_memory_dynamic_address";
+
// Access sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
-static const size_t kShadowScale = 4;
+static const size_t kDefaultShadowScale = 4;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
static const unsigned kPointerTagShift = 56;
static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
@@ -85,17 +91,57 @@ static cl::opt<bool> ClRecover(
cl::desc("Enable recovery mode (continue-after-error)."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+ cl::desc("instrument stack (allocas)"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClUARRetagToZero(
+ "hwasan-uar-retag-to-zero",
+ cl::desc("Clear alloca tags before returning from the function to allow "
+ "non-instrumented and instrumented function calls mix. When set "
+             "mixing non-instrumented and instrumented function calls. When set "
+ "function to detect use after return."),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClGenerateTagsWithCalls(
+ "hwasan-generate-tags-with-calls",
+ cl::desc("generate new tags with runtime library calls"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<int> ClMatchAllTag(
+ "hwasan-match-all-tag",
+ cl::desc("don't report bad accesses via pointers with this tag"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<bool> ClEnableKhwasan(
+ "hwasan-kernel",
+ cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// These flags allow changing the shadow mapping and control how shadow memory
+// is accessed. The shadow mapping looks like:
+// Shadow = (Mem >> scale) + offset
+
+static cl::opt<unsigned long long> ClMappingOffset(
+ "hwasan-mapping-offset",
+ cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"), cl::Hidden,
+ cl::init(0));
+
namespace {
-/// \brief An instrumentation pass implementing detection of addressability bugs
+/// An instrumentation pass implementing detection of addressability bugs
/// using tagged pointers.
class HWAddressSanitizer : public FunctionPass {
public:
// Pass identification, replacement for typeid.
static char ID;
- HWAddressSanitizer(bool Recover = false)
- : FunctionPass(ID), Recover(Recover || ClRecover) {}
+ explicit HWAddressSanitizer(bool CompileKernel = false, bool Recover = false)
+ : FunctionPass(ID) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
+ ClEnableKhwasan : CompileKernel;
+ }
StringRef getPassName() const override { return "HWAddressSanitizer"; }
@@ -103,6 +149,11 @@ public:
bool doInitialization(Module &M) override;
void initializeCallbacks(Module &M);
+
+ void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+
+ void untagPointerOperand(Instruction *I, Value *Addr);
+ Value *memToShadow(Value *Shadow, Type *Ty, IRBuilder<> &IRB);
void instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore);
@@ -111,16 +162,54 @@ public:
uint64_t *TypeSize, unsigned *Alignment,
Value **MaybeMask);
+ bool isInterestingAlloca(const AllocaInst &AI);
+ bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag);
+ Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
+ Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
+ bool instrumentStack(SmallVectorImpl<AllocaInst *> &Allocas,
+ SmallVectorImpl<Instruction *> &RetVec);
+ Value *getNextTagWithCall(IRBuilder<> &IRB);
+ Value *getStackBaseTag(IRBuilder<> &IRB);
+ Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
+ unsigned AllocaNo);
+ Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
+
private:
LLVMContext *C;
+ Triple TargetTriple;
+
+ /// This struct defines the shadow mapping using the rule:
+ /// shadow = (mem >> Scale) + Offset.
+ /// If InGlobal is true, then
+ /// extern char __hwasan_shadow[];
+ /// shadow = (mem >> Scale) + &__hwasan_shadow
+ struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool InGlobal;
+
+ void init(Triple &TargetTriple);
+ unsigned getAllocaAlignment() const { return 1U << Scale; }
+ };
+ ShadowMapping Mapping;
+
Type *IntptrTy;
+ Type *Int8Ty;
+ bool CompileKernel;
bool Recover;
Function *HwasanCtorFunction;
Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
Function *HwasanMemoryAccessCallbackSized[2];
+
+ Function *HwasanTagMemoryFunc;
+ Function *HwasanGenerateTagFunc;
+
+ Constant *ShadowGlobal;
+
+ Value *LocalDynamicShadow = nullptr;
};
} // end anonymous namespace
@@ -129,34 +218,44 @@ char HWAddressSanitizer::ID = 0;
INITIALIZE_PASS_BEGIN(
HWAddressSanitizer, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false)
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
INITIALIZE_PASS_END(
HWAddressSanitizer, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false)
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
-FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) {
- return new HWAddressSanitizer(Recover);
+FunctionPass *llvm::createHWAddressSanitizerPass(bool CompileKernel,
+ bool Recover) {
+ assert(!CompileKernel || Recover);
+ return new HWAddressSanitizer(CompileKernel, Recover);
}
-/// \brief Module-level initialization.
+/// Module-level initialization.
///
/// Inserts a call to __hwasan_init into the module's constructor list.
bool HWAddressSanitizer::doInitialization(Module &M) {
- DEBUG(dbgs() << "Init " << M.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
auto &DL = M.getDataLayout();
- Triple TargetTriple(M.getTargetTriple());
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping.init(TargetTriple);
C = &(M.getContext());
IRBuilder<> IRB(*C);
IntptrTy = IRB.getIntPtrTy(DL);
-
- std::tie(HwasanCtorFunction, std::ignore) =
- createSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
- kHwasanInitName,
- /*InitArgTypes=*/{},
- /*InitArgs=*/{});
- appendToGlobalCtors(M, HwasanCtorFunction, 0);
+ Int8Ty = IRB.getInt8Ty();
+
+ HwasanCtorFunction = nullptr;
+ if (!CompileKernel) {
+ std::tie(HwasanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
+ kHwasanInitName,
+ /*InitArgTypes=*/{},
+ /*InitArgs=*/{});
+ appendToGlobalCtors(M, HwasanCtorFunction, 0);
+ }
return true;
}
@@ -168,7 +267,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
HwasanMemoryAccessCallbackSized[AccessIsWrite] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr + EndingStr,
+ ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)));
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -180,16 +279,50 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));
}
}
+
+ HwasanTagMemoryFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__hwasan_tag_memory", IRB.getVoidTy(), IntptrTy, Int8Ty, IntptrTy));
+ HwasanGenerateTagFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty));
+
+ if (Mapping.InGlobal)
+ ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+}
+
+void HWAddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return;
+
+ IRBuilder<> IRB(&F.front().front());
+ if (Mapping.InGlobal) {
+ // An empty inline asm with input reg == output reg.
+ // An opaque pointer-to-int cast, basically.
+ InlineAsm *Asm = InlineAsm::get(
+ FunctionType::get(IntptrTy, {ShadowGlobal->getType()}, false),
+ StringRef(""), StringRef("=r,0"),
+ /*hasSideEffects=*/false);
+ LocalDynamicShadow = IRB.CreateCall(Asm, {ShadowGlobal}, ".hwasan.shadow");
+ } else {
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kHwasanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(GlobalDynamicAddress);
+ }
}
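
(Editor's note on the "=r,0" constraint.) The empty asm declares one register
output tied to input 0, making it an identity the optimizer cannot look
through:

  // Conceptually:  %shadow = asm ""(%__hwasan_shadow), "=r,0"
  // The address of the ifunc-backed __hwasan_shadow reaches user code as an
  // opaque value, so passes cannot fold or rematerialize it in ways that
  // would sidestep the ifunc resolver.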
Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
- bool *IsWrite,
- uint64_t *TypeSize,
- unsigned *Alignment,
- Value **MaybeMask) {
+ bool *IsWrite,
+ uint64_t *TypeSize,
+ unsigned *Alignment,
+ Value **MaybeMask) {
// Skip memory accesses inserted by another instrumentation.
if (I->getMetadata("nosanitize")) return nullptr;
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return nullptr;
+
Value *PtrOperand = nullptr;
const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
@@ -219,7 +352,7 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
}
if (PtrOperand) {
- // Do not instrument acesses from different address spaces; we cannot deal
+ // Do not instrument accesses from different address spaces; we cannot deal
// with them.
Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
if (PtrTy->getPointerAddressSpace() != 0)
@@ -236,41 +369,103 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
return PtrOperand;
}
+static unsigned getPointerOperandIndex(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperandIndex();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperandIndex();
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
+ return RMW->getPointerOperandIndex();
+ if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
+ return XCHG->getPointerOperandIndex();
+ report_fatal_error("Unexpected instruction");
+ return -1;
+}
+
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
size_t Res = countTrailingZeros(TypeSize / 8);
assert(Res < kNumberOfAccessSizes);
return Res;
}
+void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
+ if (TargetTriple.isAArch64())
+ return;
+
+ IRBuilder<> IRB(I);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *UntaggedPtr =
+ IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
+ I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
+}
+
+Value *HWAddressSanitizer::memToShadow(Value *Mem, Type *Ty, IRBuilder<> &IRB) {
+ // Mem >> Scale
+ Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
+ if (Mapping.Offset == 0)
+ return Shadow;
+ // (Mem >> Scale) + Offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(Ty, Mapping.Offset);
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
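
(Editor's worked example, with Scale == 4 and a static offset.)

  //   mem    = 0x40001230
  //   shadow = (0x40001230 >> 4) + Offset == 0x04000123 + Offset
  // One shadow byte holds the tag for a 16-byte granule, which is also why
  // ShadowMapping::getAllocaAlignment() returns 1 << Scale.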
+
void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore) {
IRBuilder<> IRB(InsertBefore);
- Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift), IRB.getInt8Ty());
- Value *AddrLong =
- IRB.CreateAnd(PtrLong, ConstantInt::get(PtrLong->getType(),
- ~(0xFFULL << kPointerTagShift)));
- Value *ShadowLong = IRB.CreateLShr(AddrLong, kShadowScale);
- Value *MemTag = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowLong, IRB.getInt8PtrTy()));
+ Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
+ IRB.getInt8Ty());
+ Value *AddrLong = untagPointer(IRB, PtrLong);
+ Value *ShadowLong = memToShadow(AddrLong, PtrLong->getType(), IRB);
+ Value *MemTag =
+ IRB.CreateLoad(IRB.CreateIntToPtr(ShadowLong, IRB.getInt8PtrTy()));
Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
+ int matchAllTag = ClMatchAllTag.getNumOccurrences() > 0 ?
+ ClMatchAllTag : (CompileKernel ? 0xFF : -1);
+ if (matchAllTag != -1) {
+ Value *TagNotIgnored = IRB.CreateICmpNE(PtrTag,
+ ConstantInt::get(PtrTag->getType(), matchAllTag));
+ TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
+ }
+
TerminatorInst *CheckTerm =
SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
MDBuilder(*C).createBranchWeights(1, 100000));
IRB.SetInsertPoint(CheckTerm);
- // The signal handler will find the data address in x0.
- InlineAsm *Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
- "hlt #" +
- itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex),
- "{x0}",
- /*hasSideEffects=*/true);
+ const int64_t AccessInfo = Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex;
+ InlineAsm *Asm;
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ // The signal handler will find the data address in rdi.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "int3\nnopl " + itostr(0x40 + AccessInfo) + "(%rax)",
+ "{rdi}",
+ /*hasSideEffects=*/true);
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ // The signal handler will find the data address in x0.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "brk #" + itostr(0x900 + AccessInfo),
+ "{x0}",
+ /*hasSideEffects=*/true);
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
IRB.CreateCall(Asm, PtrLong);
}
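
(Editor's decode of the trap immediates.)

  //   AccessInfo = Recover*0x20 + IsWrite*0x10 + AccessSizeIndex
  // e.g. a recoverable 8-byte store: 0x20 + 0x10 + 3 == 0x33, emitted as
  //   AArch64: brk #0x933              (0x900 + AccessInfo)
  //   x86-64:  int3; nopl 0x73(%rax)   (0x40 + AccessInfo as displacement)
  // The runtime's trap handler recovers the access kind and size from the
  // immediate, and the faulting address from x0 / rdi.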
bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
- DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
bool IsWrite = false;
unsigned Alignment = 0;
uint64_t TypeSize = 0;
@@ -288,7 +483,7 @@ bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
if (isPowerOf2_64(TypeSize) &&
(TypeSize / 8 <= (1UL << (kNumberOfAccessSizes - 1))) &&
- (Alignment >= (1UL << kShadowScale) || Alignment == 0 ||
+ (Alignment >= (1UL << Mapping.Scale) || Alignment == 0 ||
Alignment >= TypeSize / 8)) {
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
if (ClInstrumentWithCalls) {
@@ -301,10 +496,197 @@ bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
IRB.CreateCall(HwasanMemoryAccessCallbackSized[IsWrite],
{AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8)});
}
+ untagPointerOperand(I, Addr);
return true;
}
+static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+}
+
+bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
+ Value *Tag) {
+ size_t Size = (getAllocaSizeInBytes(*AI) + Mapping.getAllocaAlignment() - 1) &
+ ~(Mapping.getAllocaAlignment() - 1);
+
+ Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanTagMemoryFunc,
+ {IRB.CreatePointerCast(AI, IntptrTy), JustTag,
+ ConstantInt::get(IntptrTy, Size)});
+ } else {
+ size_t ShadowSize = Size >> Mapping.Scale;
+ Value *ShadowPtr = IRB.CreateIntToPtr(
+ memToShadow(IRB.CreatePointerCast(AI, IntptrTy), AI->getType(), IRB),
+ IRB.getInt8PtrTy());
+ // If this memset is not inlined, it will be intercepted in the hwasan
+ // runtime library. That's OK, because the interceptor skips the checks if
+ // the address is in the shadow region.
+ // FIXME: the interceptor is not as fast as real memset. Consider lowering
+ // llvm.memset right here into either a sequence of stores, or a call to
+ // hwasan_tag_memory.
+ IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+ }
+ return true;
+}
+
+static unsigned RetagMask(unsigned AllocaNo) {
+ // A list of 8-bit numbers that have at most one run of non-zero bits.
+ // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
+ // masks.
+ // The list does not include the value 255, which is used for UAR.
+ static unsigned FastMasks[] = {
+ 0, 1, 2, 3, 4, 6, 7, 8, 12, 14, 15, 16, 24,
+ 28, 30, 31, 32, 48, 56, 60, 62, 63, 64, 96, 112, 120,
+ 124, 126, 127, 128, 192, 224, 240, 248, 252, 254};
+ return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
+}
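
(Editor's example.) RetagMask(5) == 6 == 0b00000110 has a single run of set
bits, so:

  //   x = x ^ (6ULL << kPointerTagShift)
  // folds into one AArch64 EOR-with-immediate instruction; 255 is excluded
  // from the table because it is reserved for the use-after-return tag.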
+
+Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
+ return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
+}
+
+Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
+ if (ClGenerateTagsWithCalls)
+ return nullptr;
+ // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
+ // first).
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ auto GetStackPointerFn =
+ Intrinsic::getDeclaration(M, Intrinsic::frameaddress);
+ Value *StackPointer = IRB.CreateCall(
+ GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
+
+ // Extract some entropy from the stack pointer for the tags.
+ // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
+ // between functions).
+ Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
+ Value *StackTag =
+ IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
+ "hwasan.stack.base.tag");
+ return StackTag;
+}
+
+Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
+ AllocaInst *AI, unsigned AllocaNo) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag,
+ ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
+}
+
+Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
+ if (ClUARRetagToZero)
+ return ConstantInt::get(IntptrTy, 0);
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
+}
+
+// Add a tag to an address.
+Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
+ Value *PtrLong, Value *Tag) {
+ Value *TaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ Value *ShiftedTag = IRB.CreateOr(
+ IRB.CreateShl(Tag, kPointerTagShift),
+ ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
+ TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
+ } else {
+    // Userspace can simply OR in (tag << 56).
+ Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
+ TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
+ }
+ return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
+}
+
+// Remove tag from an address.
+Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
+ Value *UntaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ UntaggedPtrLong = IRB.CreateOr(PtrLong,
+ ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
+ } else {
+ // Userspace addresses have 0x00.
+ UntaggedPtrLong = IRB.CreateAnd(PtrLong,
+ ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
+ }
+ return UntaggedPtrLong;
+}
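
(Editor's worked example of the tag/untag duals.)

  // Userspace (top byte of a pointer is 0x00):
  //   tagged   = ptr | (tag << 56)
  //   untagged = tagged & ~(0xFFULL << 56)   // == ptr again
  // Kernel (top byte is 0xFF): tagging ANDs the tag into the top byte while
  // the low-56-bit mask keeps the address; untagging ORs 0xFF back in.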
+
+bool HWAddressSanitizer::instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ SmallVectorImpl<Instruction *> &RetVec) {
+ Function *F = Allocas[0]->getParent()->getParent();
+ Instruction *InsertPt = &*F->getEntryBlock().begin();
+ IRBuilder<> IRB(InsertPt);
+
+ Value *StackTag = getStackBaseTag(IRB);
+
+ // Ideally, we want to calculate tagged stack base pointer, and rewrite all
+ // alloca addresses using that. Unfortunately, offsets are not known yet
+ // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
+ // temp, shift-OR it into each alloca address and xor with the retag mask.
+ // This generates one extra instruction per alloca use.
+ for (unsigned N = 0; N < Allocas.size(); ++N) {
+ auto *AI = Allocas[N];
+ IRB.SetInsertPoint(AI->getNextNode());
+
+ // Replace uses of the alloca with tagged address.
+ Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
+ Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
+ Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
+ std::string Name =
+ AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
+ Replacement->setName(Name + ".hwasan");
+
+ for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (U.getUser() != AILong)
+ U.set(Replacement);
+ }
+
+ tagAlloca(IRB, AI, Tag);
+
+ for (auto RI : RetVec) {
+ IRB.SetInsertPoint(RI);
+
+ // Re-tag alloca memory with the special UAR tag.
+ Value *Tag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, Tag);
+ }
+ }
+
+ return true;
+}
+
+bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ return (AI.getAllocatedType()->isSized() &&
+ // FIXME: instrument dynamic allocas, too
+ AI.isStaticAlloca() &&
+ // alloca() may be called with 0 size, ignore it.
+ getAllocaSizeInBytes(AI) > 0 &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ !isAllocaPromotable(&AI) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+}
+
bool HWAddressSanitizer::runOnFunction(Function &F) {
if (&F == HwasanCtorFunction)
return false;
@@ -312,14 +694,35 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
return false;
- DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
initializeCallbacks(*F.getParent());
+ assert(!LocalDynamicShadow);
+ maybeInsertDynamicShadowAtFunctionEntry(F);
+
bool Changed = false;
SmallVector<Instruction*, 16> ToInstrument;
+ SmallVector<AllocaInst*, 8> AllocasToInstrument;
+ SmallVector<Instruction*, 8> RetVec;
for (auto &BB : F) {
for (auto &Inst : BB) {
+ if (ClInstrumentStack)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ // Realign all allocas. We don't want small uninteresting allocas to
+ // hide in instrumented alloca's padding.
+ if (AI->getAlignment() < Mapping.getAllocaAlignment())
+ AI->setAlignment(Mapping.getAllocaAlignment());
+ // Instrument some of them.
+ if (isInterestingAlloca(*AI))
+ AllocasToInstrument.push_back(AI);
+ continue;
+ }
+
+ if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
+ isa<CleanupReturnInst>(Inst))
+ RetVec.push_back(&Inst);
+
Value *MaybeMask = nullptr;
bool IsWrite;
unsigned Alignment;
@@ -331,8 +734,30 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
}
}
+ if (!AllocasToInstrument.empty())
+ Changed |= instrumentStack(AllocasToInstrument, RetVec);
+
for (auto Inst : ToInstrument)
Changed |= instrumentMemAccess(Inst);
+ LocalDynamicShadow = nullptr;
+
return Changed;
}
+
+void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
+ const bool IsAndroid = TargetTriple.isAndroid();
+ const bool IsAndroidWithIfuncSupport =
+ IsAndroid && !TargetTriple.isAndroidVersionLT(21);
+
+ Scale = kDefaultShadowScale;
+
+ if (ClEnableKhwasan || ClInstrumentWithCalls || !IsAndroidWithIfuncSupport)
+ Offset = 0;
+ else
+ Offset = kDynamicShadowSentinel;
+ if (ClMappingOffset.getNumOccurrences() > 0)
+ Offset = ClMappingOffset;
+
+ InGlobal = IsAndroidWithIfuncSupport;
+}
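
(Editor's summary of the resulting configurations, Scale == 4 throughout.)

  // kernel, callback instrumentation, or no ifunc:  shadow = (mem >> 4) + 0
  // Android API level >= 21 (ifunc available):
  //   shadow = (mem >> 4) + &__hwasan_shadow       (resolved per-function)
  // explicit -hwasan-mapping-offset=N:              shadow = (mem >> 4) + N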
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 49b8a67a6c14..27fb0e4393af 100644
--- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -45,7 +45,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include <cassert>
@@ -223,12 +223,12 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
uint64_t TotalCount, uint32_t NumCandidates) {
std::vector<PromotionCandidate> Ret;
- DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
- << " Num_targets: " << ValueDataRef.size()
- << " Num_candidates: " << NumCandidates << "\n");
+ LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
NumOfPGOICallsites++;
if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
- DEBUG(dbgs() << " Skip: User options.\n");
+ LLVM_DEBUG(dbgs() << " Skip: User options.\n");
return Ret;
}
@@ -236,11 +236,11 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
uint64_t Count = ValueDataRef[I].Count;
assert(Count <= TotalCount);
uint64_t Target = ValueDataRef[I].Value;
- DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
- << " Target_func: " << Target << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
if (ICPInvokeOnly && dyn_cast<CallInst>(Inst)) {
- DEBUG(dbgs() << " Not promote: User options.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
<< " Not promote: User options";
@@ -248,7 +248,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
if (ICPCallOnly && dyn_cast<InvokeInst>(Inst)) {
- DEBUG(dbgs() << " Not promote: User option.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
<< " Not promote: User options";
@@ -256,7 +256,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", Inst)
<< " Not promote: Cutoff reached";
@@ -266,7 +266,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
Function *TargetFunction = Symtab->getFunction(Target);
if (TargetFunction == nullptr) {
- DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", Inst)
<< "Cannot promote indirect call: target not found";
@@ -387,7 +387,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
InstrProfSymtab Symtab;
if (Error E = Symtab.create(M, InLTO)) {
std::string SymtabFailure = toString(std::move(E));
- DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
(void)SymtabFailure;
return false;
}
@@ -412,12 +412,12 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
bool FuncChanged = ICallPromotion.processFunction(PSI);
if (ICPDUMPAFTER && FuncChanged) {
- DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
}
Changed |= FuncChanged;
if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
break;
}
}
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 9b70f95480e4..22076f04d6ad 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -271,8 +271,8 @@ public:
break;
}
- DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
- << L.getLoopDepth() << ")\n");
+ LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
return Promoted != 0;
}
@@ -430,9 +430,24 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) {
}
}
-bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
- bool MadeChange = false;
+/// Check if the module contains uses of any profiling intrinsics.
+static bool containsProfilingIntrinsics(Module &M) {
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
+ if (!F->use_empty())
+ return true;
+ return false;
+}
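
(Editor's note.) Intrinsic declarations are interned per module, so the three
getFunction() lookups above are cheap symbol-table probes:

  // run() can therefore return right after emitRuntimeHook() for modules
  // with no profiling instrumentation, instead of scanning every function
  // for instrprof intrinsics to lower.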
+bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
this->M = &M;
this->TLI = &TLI;
NamesVar = nullptr;
@@ -443,6 +458,15 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
MemOPSizeRangeLast);
TT = Triple(M.getTargetTriple());
+ // Emit the runtime hook even if no counters are present.
+ bool MadeChange = emitRuntimeHook();
+
+ // Improve compile time by avoiding linear scans when there is no work.
+ GlobalVariable *CoverageNamesVar =
+ M.getNamedGlobal(getCoverageUnusedNamesVarName());
+ if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
+ return MadeChange;
+
// We did not know how many value sites there would be inside
  // the instrumented function. This counts the number of instrumented
  // target value sites and records it as a field in the profile data variable.
@@ -464,8 +488,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
for (Function &F : M)
MadeChange |= lowerIntrinsics(&F);
- if (GlobalVariable *CoverageNamesVar =
- M.getNamedGlobal(getCoverageUnusedNamesVarName())) {
+ if (CoverageNamesVar) {
lowerCoverageData(CoverageNamesVar);
MadeChange = true;
}
@@ -476,7 +499,6 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
emitVNodes();
emitNameData();
emitRegistration();
- emitRuntimeHook();
emitUses();
emitInitialization();
return true;
@@ -669,6 +691,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Module &M) {
// Use linker script magic to get data/cnts/name start/end.
if (Triple(M.getTargetTriple()).isOSLinux() ||
Triple(M.getTargetTriple()).isOSFreeBSD() ||
+ Triple(M.getTargetTriple()).isOSFuchsia() ||
Triple(M.getTargetTriple()).isPS4CPU())
return false;
@@ -892,15 +915,15 @@ void InstrProfiling::emitRegistration() {
IRB.CreateRetVoid();
}
-void InstrProfiling::emitRuntimeHook() {
+bool InstrProfiling::emitRuntimeHook() {
// We expect the linker to be invoked with -u<hook_var> flag for linux,
  // in which case there is no need to emit the user function.
if (Triple(M->getTargetTriple()).isOSLinux())
- return;
+ return false;
// If the module's provided its own runtime, we don't need to do anything.
if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
- return;
+ return false;
// Declare an external variable that will pull in the runtime initialization.
auto *Int32Ty = Type::getInt32Ty(M->getContext());
@@ -925,6 +948,7 @@ void InstrProfiling::emitRuntimeHook() {
// Mark the user variable as used so that it isn't stripped out.
UsedVars.push_back(User);
+ return true;
}
void InstrProfiling::emitUses() {
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b3c39b5b1665..4bcef6972786 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -101,6 +101,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -138,7 +139,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
@@ -163,7 +163,7 @@ static const unsigned kRetvalTLSSize = 800;
// Access sizes are powers of two: 1, 2, 4, 8.
static const size_t kNumberOfAccessSizes = 4;
-/// \brief Track origins of uninitialized values.
+/// Track origins of uninitialized values.
///
/// Adds a section to MemorySanitizer report that points to the allocation
/// (stack or heap) the uninitialized bits came from originally.
@@ -199,6 +199,18 @@ static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
cl::desc("exact handling of relational integer ICmp"),
cl::Hidden, cl::init(false));
+// When compiling the Linux kernel, we sometimes see false positives related to
+// MSan being unable to understand that inline assembly calls may initialize
+// local variables.
+// This flag makes the compiler conservatively unpoison every memory location
+// passed into an assembly call. Note that this may cause false negatives.
+// Because it's impossible to figure out the array sizes, we can only unpoison
+// the first sizeof(type) bytes for each type* pointer.
+static cl::opt<bool> ClHandleAsmConservative(
+ "msan-handle-asm-conservative",
+ cl::desc("conservative handling of inline assembly"), cl::Hidden,
+ cl::init(false));
+
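// For illustration, a minimal sketch of the kernel-style pattern this flag
// targets (the function and values are hypothetical): MSan cannot see that
// the asm statement writes `val` through its memory operand, so without
// conservative unpoisoning the final read is reported as uninitialized.
static int asmInitExample() {
  int val;                                  // no initializer visible to MSan
  asm volatile("movl $42, %0" : "=m"(val)); // asm initializes val in memory
  return val; // false positive here unless val's shadow is cleared
}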
// This flag controls whether we check the shadow of the address
// operand of a load or store. Such bugs are very rare, since a load from
// a garbage address typically results in SEGV, but they still happen
@@ -234,6 +246,24 @@ static cl::opt<bool> ClWithComdat("msan-with-comdat",
cl::desc("Place MSan constructors in comdat sections"),
cl::Hidden, cl::init(false));
+// These options allow specifying custom memory map parameters.
+// See MemoryMapParams for details.
+static cl::opt<unsigned long long> ClAndMask("msan-and-mask",
+ cl::desc("Define custom MSan AndMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClXorMask("msan-xor-mask",
+ cl::desc("Define custom MSan XorMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClShadowBase("msan-shadow-base",
+ cl::desc("Define custom MSan ShadowBase"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned long long> ClOriginBase("msan-origin-base",
+ cl::desc("Define custom MSan OriginBase"),
+ cl::Hidden, cl::init(0));
+
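// Note on how these defaults interact with doInitialization below: all four
// options default to 0, so the code keys off whether a flag was explicitly
// passed rather than off its value. A minimal sketch of that cl::opt idiom
// (the flag name my-shadow-base is hypothetical):
#include "llvm/Support/CommandLine.h"
static llvm::cl::opt<unsigned long long>
    MyBase("my-shadow-base", llvm::cl::desc("example override"),
           llvm::cl::init(0));
static bool userOverrodeBase() {
  // init(0) makes the default indistinguishable from an explicit 0, so test
  // the occurrence count instead of the value.
  return MyBase.getNumOccurrences() > 0;
}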
static const char *const kMsanModuleCtorName = "msan.module_ctor";
static const char *const kMsanInitName = "__msan_init";
@@ -360,7 +390,7 @@ static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
namespace {
-/// \brief An instrumentation pass implementing detection of uninitialized
+/// An instrumentation pass implementing detection of uninitialized
/// reads.
///
/// MemorySanitizer: instrument the code in module to find
@@ -368,7 +398,7 @@ namespace {
class MemorySanitizer : public FunctionPass {
public:
// Pass identification, replacement for typeid.
- static char ID;
+ static char ID;
MemorySanitizer(int TrackOrigins = 0, bool Recover = false)
: FunctionPass(ID),
@@ -392,8 +422,9 @@ private:
friend struct VarArgPowerPC64Helper;
void initializeCallbacks(Module &M);
+ void createUserspaceApi(Module &M);
- /// \brief Track origins (allocation points) of uninitialized values.
+ /// Track origins (allocation points) of uninitialized values.
int TrackOrigins;
bool Recover;
@@ -401,60 +432,67 @@ private:
Type *IntptrTy;
Type *OriginTy;
- /// \brief Thread-local shadow storage for function parameters.
+ /// Thread-local shadow storage for function parameters.
GlobalVariable *ParamTLS;
- /// \brief Thread-local origin storage for function parameters.
+ /// Thread-local origin storage for function parameters.
GlobalVariable *ParamOriginTLS;
- /// \brief Thread-local shadow storage for function return value.
+ /// Thread-local shadow storage for function return value.
GlobalVariable *RetvalTLS;
- /// \brief Thread-local origin storage for function return value.
+ /// Thread-local origin storage for function return value.
GlobalVariable *RetvalOriginTLS;
- /// \brief Thread-local shadow storage for in-register va_arg function
+ /// Thread-local shadow storage for in-register va_arg function
/// parameters (x86_64-specific).
GlobalVariable *VAArgTLS;
- /// \brief Thread-local shadow storage for va_arg overflow area
+ /// Thread-local shadow storage for va_arg overflow area
/// (x86_64-specific).
GlobalVariable *VAArgOverflowSizeTLS;
- /// \brief Thread-local space used to pass origin value to the UMR reporting
+ /// Thread-local space used to pass origin value to the UMR reporting
/// function.
GlobalVariable *OriginTLS;
- /// \brief The run-time callback to print a warning.
- Value *WarningFn = nullptr;
+ /// Are the instrumentation callbacks set up?
+ bool CallbacksInitialized = false;
+
+ /// The run-time callback to print a warning.
+ Value *WarningFn;
// These arrays are indexed by log2(AccessSize).
Value *MaybeWarningFn[kNumberOfAccessSizes];
Value *MaybeStoreOriginFn[kNumberOfAccessSizes];
- /// \brief Run-time helper that generates a new origin value for a stack
+ /// Run-time helper that generates a new origin value for a stack
/// allocation.
Value *MsanSetAllocaOrigin4Fn;
- /// \brief Run-time helper that poisons stack on function entry.
+ /// Run-time helper that poisons stack on function entry.
Value *MsanPoisonStackFn;
- /// \brief Run-time helper that records a store (or any event) of an
+ /// Run-time helper that records a store (or any event) of an
/// uninitialized value and returns an updated origin id encoding this info.
Value *MsanChainOriginFn;
- /// \brief MSan runtime replacements for memmove, memcpy and memset.
+ /// MSan runtime replacements for memmove, memcpy and memset.
Value *MemmoveFn, *MemcpyFn, *MemsetFn;
- /// \brief Memory map parameters used in application-to-shadow calculation.
+ /// Memory map parameters used in application-to-shadow calculation.
const MemoryMapParams *MapParams;
+ /// Custom memory map parameters used when -msan-shadow-base or
+ /// -msan-origin-base is provided.
+ MemoryMapParams CustomMapParams;
+
MDNode *ColdCallWeights;
- /// \brief Branch weights for origin store.
+ /// Branch weights for origin store.
MDNode *OriginStoreWeights;
- /// \brief An empty volatile inline asm that prevents callback merge.
+ /// An empty volatile inline asm that prevents callback merge.
InlineAsm *EmptyAsm;
Function *MsanCtorFunction;
@@ -476,7 +514,7 @@ FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins, bool Recover) {
return new MemorySanitizer(TrackOrigins, Recover);
}
-/// \brief Create a non-const global initialized with the given string.
+/// Create a non-const global initialized with the given string.
///
/// Creates a writable global for Str so that we can pass it to the
/// run-time lib. The runtime uses the first 4 bytes of the string to store the
@@ -488,12 +526,8 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
GlobalValue::PrivateLinkage, StrConst, "");
}
-/// \brief Insert extern declaration of runtime-provided functions and globals.
-void MemorySanitizer::initializeCallbacks(Module &M) {
- // Only do this once.
- if (WarningFn)
- return;
-
+/// Insert declarations for userspace-specific functions and globals.
+void MemorySanitizer::createUserspaceApi(Module &M) {
IRBuilder<> IRB(*C);
// Create the callback.
// FIXME: this function should have "Cold" calling conv,
@@ -502,6 +536,38 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
: "__msan_warning_noreturn";
WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
+ // Create the global TLS variables.
+ RetvalTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+
+ RetvalOriginTLS = new GlobalVariable(
+ M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
+ ParamTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+
+ ParamOriginTLS = new GlobalVariable(
+ M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
+ nullptr, GlobalVariable::InitialExecTLSModel);
+
+ VAArgTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ VAArgOverflowSizeTLS = new GlobalVariable(
+ M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_va_arg_overflow_size_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ OriginTLS = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
unsigned AccessSize = 1 << AccessSizeIndex;
@@ -522,6 +588,17 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MsanPoisonStackFn =
M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
IRB.getInt8PtrTy(), IntptrTy);
+}
+
+/// Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (CallbacksInitialized)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Initialize callbacks that are common for kernel and userspace
+ // instrumentation.
MsanChainOriginFn = M.getOrInsertFunction(
"__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MemmoveFn = M.getOrInsertFunction(
@@ -533,98 +610,81 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MemsetFn = M.getOrInsertFunction(
"__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
IntptrTy);
-
- // Create globals.
- RetvalTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- RetvalOriginTLS = new GlobalVariable(
- M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
-
- ParamTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- ParamOriginTLS = new GlobalVariable(
- M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
- nullptr, GlobalVariable::InitialExecTLSModel);
-
- VAArgTLS = new GlobalVariable(
- M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
- GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- VAArgOverflowSizeTLS = new GlobalVariable(
- M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_va_arg_overflow_size_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- OriginTLS = new GlobalVariable(
- M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
- "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
-
// We insert an empty inline asm after __msan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
+
+ createUserspaceApi(M);
+ CallbacksInitialized = true;
}
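// The refactoring above is a once-only initialization split: callbacks shared
// by all modes stay in initializeCallbacks, userspace-only declarations move
// to createUserspaceApi, and a bool guard replaces the old WarningFn null
// check. A minimal sketch of the shape (names are illustrative):
struct CallbackState {
  bool Initialized = false;
  void initCommon() { /* declarations shared by all instrumentation modes */ }
  void initUserspace() { /* userspace-only globals and functions */ }
  void initialize() {
    if (Initialized)
      return; // only do this once
    initCommon();
    initUserspace();
    Initialized = true;
  }
};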
-/// \brief Module-level initialization.
+/// Module-level initialization.
///
/// inserts a call to __msan_init to the module's constructor list.
bool MemorySanitizer::doInitialization(Module &M) {
auto &DL = M.getDataLayout();
- Triple TargetTriple(M.getTargetTriple());
- switch (TargetTriple.getOS()) {
- case Triple::FreeBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = FreeBSD_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = FreeBSD_X86_MemoryMapParams.bits32;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::NetBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = NetBSD_X86_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::Linux:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = Linux_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = Linux_X86_MemoryMapParams.bits32;
- break;
- case Triple::mips64:
- case Triple::mips64el:
- MapParams = Linux_MIPS_MemoryMapParams.bits64;
- break;
- case Triple::ppc64:
- case Triple::ppc64le:
- MapParams = Linux_PowerPC_MemoryMapParams.bits64;
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- MapParams = Linux_ARM_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- default:
- report_fatal_error("unsupported operating system");
+ bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
+ bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
+ // Check the overrides first
+ if (ShadowPassed || OriginPassed) {
+ CustomMapParams.AndMask = ClAndMask;
+ CustomMapParams.XorMask = ClXorMask;
+ CustomMapParams.ShadowBase = ClShadowBase;
+ CustomMapParams.OriginBase = ClOriginBase;
+ MapParams = &CustomMapParams;
+ } else {
+ Triple TargetTriple(M.getTargetTriple());
+ switch (TargetTriple.getOS()) {
+ case Triple::FreeBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::NetBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = NetBSD_X86_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::Linux:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = Linux_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = Linux_X86_MemoryMapParams.bits32;
+ break;
+ case Triple::mips64:
+ case Triple::mips64el:
+ MapParams = Linux_MIPS_MemoryMapParams.bits64;
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ MapParams = Linux_PowerPC_MemoryMapParams.bits64;
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ MapParams = Linux_ARM_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ default:
+ report_fatal_error("unsupported operating system");
+ }
}
C = &(M.getContext());
@@ -661,7 +721,7 @@ bool MemorySanitizer::doInitialization(Module &M) {
namespace {
-/// \brief A helper class that handles instrumentation of VarArg
+/// A helper class that handles instrumentation of VarArg
/// functions on a particular platform.
///
/// Implementations are expected to insert the instrumentation
@@ -672,16 +732,16 @@ namespace {
struct VarArgHelper {
virtual ~VarArgHelper() = default;
- /// \brief Visit a CallSite.
+ /// Visit a CallSite.
virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
- /// \brief Visit a va_start call.
+ /// Visit a va_start call.
virtual void visitVAStartInst(VAStartInst &I) = 0;
- /// \brief Visit a va_copy call.
+ /// Visit a va_copy call.
virtual void visitVACopyInst(VACopyInst &I) = 0;
- /// \brief Finalize function instrumentation.
+ /// Finalize function instrumentation.
///
/// This method is called after visiting all interesting (see above)
/// instructions in a function.
@@ -715,6 +775,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ValueMap<Value*, Value*> ShadowMap, OriginMap;
std::unique_ptr<VarArgHelper> VAHelper;
const TargetLibraryInfo *TLI;
+ BasicBlock *ActualFnStart;
// The following flags disable parts of MSan instrumentation based on
// blacklist contents and command-line options.
@@ -747,9 +808,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
CheckReturnValue = SanitizeFunction && (F.getName() == "main");
TLI = &MS.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- DEBUG(if (!InsertChecks)
- dbgs() << "MemorySanitizer is not inserting checks into '"
- << F.getName() << "'\n");
+ MS.initializeCallbacks(*F.getParent());
+ ActualFnStart = &F.getEntryBlock();
+
+ LLVM_DEBUG(if (!InsertChecks) dbgs()
+ << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
}
Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
@@ -766,7 +830,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
}
- /// \brief Fill memory range with the given origin value.
+ /// Fill memory range with the given origin value.
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
unsigned Size, unsigned Alignment) {
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -849,13 +913,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = SI->getAlignment();
unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
- DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, NewSI);
+ LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
if (SI->isAtomic())
SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
@@ -866,25 +928,31 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
+ /// Helper function to insert a warning at IRB's current insert point.
+ void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
+ if (!Origin)
+ Origin = (Value *)IRB.getInt32(0);
+ if (MS.TrackOrigins) {
+ IRB.CreateStore(Origin, MS.OriginTLS);
+ }
+ IRB.CreateCall(MS.WarningFn, {});
+ IRB.CreateCall(MS.EmptyAsm, {});
+ // FIXME: Insert UnreachableInst if !MS.Recover?
+ // This may invalidate some of the following checks and needs to be done
+ // at the very end.
+ }
+
void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
bool AsCall) {
IRBuilder<> IRB(OrigIns);
- DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
- DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+ LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
if (ConstantShadow) {
if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
- if (MS.TrackOrigins) {
- IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
- MS.OriginTLS);
- }
- IRB.CreateCall(MS.WarningFn, {});
- IRB.CreateCall(MS.EmptyAsm, {});
- // FIXME: Insert UnreachableInst if !MS.Recover?
- // This may invalidate some of the following checks and needs to be done
- // at the very end.
+ insertWarningFn(IRB, Origin);
}
return;
}
@@ -908,13 +976,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/* Unreachable */ !MS.Recover, MS.ColdCallWeights);
IRB.SetInsertPoint(CheckTerm);
- if (MS.TrackOrigins) {
- IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
- MS.OriginTLS);
- }
- IRB.CreateCall(MS.WarningFn, {});
- IRB.CreateCall(MS.EmptyAsm, {});
- DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ insertWarningFn(IRB, Origin);
+ LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
}
}
@@ -925,13 +988,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Origin = ShadowData.Origin;
materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
}
- DEBUG(dbgs() << "DONE:\n" << F);
+ LLVM_DEBUG(dbgs() << "DONE:\n" << F);
}
- /// \brief Add MemorySanitizer instrumentation to a function.
+ /// Add MemorySanitizer instrumentation to a function.
bool runOnFunction() {
- MS.initializeCallbacks(*F.getParent());
-
// In the presence of unreachable blocks, we may see Phi nodes with
// incoming nodes from such blocks. Since InstVisitor skips unreachable
// blocks, such nodes will not have any shadow value associated with them.
@@ -941,7 +1002,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Iterate all BBs in depth-first order and create shadow instructions
// for all instructions (where applicable).
// For PHI nodes we create dummy shadow PHIs which will be finalized later.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ for (BasicBlock *BB : depth_first(ActualFnStart))
visit(*BB);
// Finalize PHI nodes.
@@ -961,22 +1022,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
InstrumentationList.size() + StoreList.size() >
(unsigned)ClInstrumentationWithCallThreshold;
- // Delayed instrumentation of StoreInst.
- // This may add new checks to be inserted later.
- materializeStores(InstrumentWithCalls);
-
// Insert shadow value checks.
materializeChecks(InstrumentWithCalls);
+ // Delayed instrumentation of StoreInst.
+ // This no longer adds new address checks; they are inserted in visitStoreInst.
+ materializeStores(InstrumentWithCalls);
+
return true;
}
- /// \brief Compute the shadow type that corresponds to a given Value.
+ /// Compute the shadow type that corresponds to a given Value.
Type *getShadowTy(Value *V) {
return getShadowTy(V->getType());
}
- /// \brief Compute the shadow type that corresponds to a given Type.
+ /// Compute the shadow type that corresponds to a given Type.
Type *getShadowTy(Type *OrigTy) {
if (!OrigTy->isSized()) {
return nullptr;
@@ -1000,21 +1061,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
Elements.push_back(getShadowTy(ST->getElementType(i)));
StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
- DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
return Res;
}
uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
return IntegerType::get(*MS.C, TypeSize);
}
- /// \brief Flatten a vector type.
+ /// Flatten a vector type.
Type *getShadowTyNoVec(Type *ty) {
if (VectorType *vt = dyn_cast<VectorType>(ty))
return IntegerType::get(*MS.C, vt->getBitWidth());
return ty;
}
- /// \brief Convert a shadow value to it's flattened variant.
+ /// Convert a shadow value to its flattened variant.
Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
Type *Ty = V->getType();
Type *NoVecTy = getShadowTyNoVec(Ty);
@@ -1022,7 +1083,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateBitCast(V, NoVecTy);
}
- /// \brief Compute the integer shadow offset that corresponds to a given
+ /// Compute the integer shadow offset that corresponds to a given
/// application address.
///
/// Offset = (Addr & ~AndMask) ^ XorMask
@@ -1041,18 +1102,18 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return OffsetLong;
}
- /// \brief Compute the shadow and origin addresses corresponding to a given
+ /// Compute the shadow and origin addresses corresponding to a given
/// application address.
///
/// Shadow = ShadowBase + Offset
/// Origin = (OriginBase + Offset) & ~3ULL
- std::pair<Value *, Value *> getShadowOriginPtrUserspace(
- Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, unsigned Alignment,
- Instruction **FirstInsn) {
+ std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr,
+ IRBuilder<> &IRB,
+ Type *ShadowTy,
+ unsigned Alignment) {
Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
Value *ShadowLong = ShadowOffset;
uint64_t ShadowBase = MS.MapParams->ShadowBase;
- *FirstInsn = dyn_cast<Instruction>(ShadowLong);
if (ShadowBase != 0) {
ShadowLong =
IRB.CreateAdd(ShadowLong,
@@ -1080,58 +1141,60 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
Type *ShadowTy,
- unsigned Alignment) {
- Instruction *FirstInsn = nullptr;
+ unsigned Alignment,
+ bool isStore) {
std::pair<Value *, Value *> ret =
- getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment, &FirstInsn);
+ getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
return ret;
}
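// As a worked illustration of the mapping formulas above, assuming
// Linux/x86_64-style parameters (all constants here are illustrative, not
// authoritative):
#include <cstdint>
static void shadowMappingExample() {
  const uint64_t AndMask = 0;                 // no address bits masked off
  const uint64_t XorMask = 0x500000000000ULL; // flips into the shadow region
  const uint64_t ShadowBase = 0;
  const uint64_t OriginBase = 0x100000000000ULL;
  uint64_t Addr = 0x7fff80001234ULL; // some application address
  uint64_t Offset = (Addr & ~AndMask) ^ XorMask;   // Offset = (Addr & ~AndMask) ^ XorMask
  uint64_t Shadow = ShadowBase + Offset;           // Shadow = ShadowBase + Offset
  uint64_t Origin = (OriginBase + Offset) & ~3ULL; // Origin = (OriginBase + Offset) & ~3ULL
  (void)Shadow;
  (void)Origin;
}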
- /// \brief Compute the shadow address for a given function argument.
+ /// Compute the shadow address for a given function argument.
///
/// Shadow = ParamTLS+ArgOffset.
Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
"_msarg");
}
- /// \brief Compute the origin address for a given function argument.
+ /// Compute the origin address for a given function argument.
Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
int ArgOffset) {
if (!MS.TrackOrigins) return nullptr;
Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
"_msarg_o");
}
- /// \brief Compute the shadow address for a retval.
+ /// Compute the shadow address for a retval.
Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
return IRB.CreatePointerCast(MS.RetvalTLS,
PointerType::get(getShadowTy(A), 0),
"_msret");
}
- /// \brief Compute the origin address for a retval.
+ /// Compute the origin address for a retval.
Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
// We keep a single origin for the entire retval. Might be too optimistic.
return MS.RetvalOriginTLS;
}
- /// \brief Set SV to be the shadow value for V.
+ /// Set SV to be the shadow value for V.
void setShadow(Value *V, Value *SV) {
assert(!ShadowMap.count(V) && "Values may only have one shadow");
ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
}
- /// \brief Set Origin to be the origin value for V.
+ /// Set Origin to be the origin value for V.
void setOrigin(Value *V, Value *Origin) {
if (!MS.TrackOrigins) return;
assert(!OriginMap.count(V) && "Values may only have one origin");
- DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
OriginMap[V] = Origin;
}
@@ -1142,7 +1205,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Constant::getNullValue(ShadowTy);
}
- /// \brief Create a clean shadow value for a given value.
+ /// Create a clean shadow value for a given value.
///
/// Clean shadow (all zeroes) means all bits of the value are defined
/// (initialized).
@@ -1150,7 +1213,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getCleanShadow(V->getType());
}
- /// \brief Create a dirty shadow of a given shadow type.
+ /// Create a dirty shadow of a given shadow type.
Constant *getPoisonedShadow(Type *ShadowTy) {
assert(ShadowTy);
if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
@@ -1169,7 +1232,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
llvm_unreachable("Unexpected shadow type");
}
- /// \brief Create a dirty shadow for a given value.
+ /// Create a dirty shadow for a given value.
Constant *getPoisonedShadow(Value *V) {
Type *ShadowTy = getShadowTy(V);
if (!ShadowTy)
@@ -1177,12 +1240,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getPoisonedShadow(ShadowTy);
}
- /// \brief Create a clean (zero) origin.
+ /// Create a clean (zero) origin.
Value *getCleanOrigin() {
return Constant::getNullValue(MS.OriginTy);
}
- /// \brief Get the shadow value for a given Value.
+ /// Get the shadow value for a given Value.
///
/// This function either returns the value set earlier with setShadow,
/// or extracts it from ParamTLS (for function arguments).
@@ -1194,7 +1257,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// For instructions the shadow is already stored in the map.
Value *Shadow = ShadowMap[V];
if (!Shadow) {
- DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
(void)I;
assert(Shadow && "No shadow for a value");
}
@@ -1202,7 +1265,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
if (UndefValue *U = dyn_cast<UndefValue>(V)) {
Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
- DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
(void)U;
return AllOnes;
}
@@ -1212,12 +1275,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (*ShadowPtr)
return *ShadowPtr;
Function *F = A->getParent();
- IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+ IRBuilder<> EntryIRB(ActualFnStart->getFirstNonPHI());
unsigned ArgOffset = 0;
const DataLayout &DL = F->getParent()->getDataLayout();
for (auto &FArg : F->args()) {
if (!FArg.getType()->isSized()) {
- DEBUG(dbgs() << "Arg is not sized\n");
+ LLVM_DEBUG(dbgs() << "Arg is not sized\n");
continue;
}
unsigned Size =
@@ -1237,7 +1300,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ArgAlign = DL.getABITypeAlignment(EltType);
}
Value *CpShadowPtr =
- getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign)
+ getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
+ /*isStore*/ true)
.first;
if (Overflow) {
// ParamTLS overflow.
@@ -1246,9 +1310,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Size, ArgAlign);
} else {
unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
- Value *Cpy =
- EntryIRB.CreateMemCpy(CpShadowPtr, Base, Size, CopyAlign);
- DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
+ CopyAlign, Size);
+ LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
(void)Cpy;
}
*ShadowPtr = getCleanShadow(V);
@@ -1261,8 +1325,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
}
}
- DEBUG(dbgs() << " ARG: " << FArg << " ==> " <<
- **ShadowPtr << "\n");
+ LLVM_DEBUG(dbgs()
+ << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
if (MS.TrackOrigins && !Overflow) {
Value *OriginPtr =
getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
@@ -1280,12 +1344,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return getCleanShadow(V);
}
- /// \brief Get the shadow for i-th argument of the instruction I.
+ /// Get the shadow for i-th argument of the instruction I.
Value *getShadow(Instruction *I, int i) {
return getShadow(I->getOperand(i));
}
- /// \brief Get the origin for a value.
+ /// Get the origin for a value.
Value *getOrigin(Value *V) {
if (!MS.TrackOrigins) return nullptr;
if (!PropagateShadow) return getCleanOrigin();
@@ -1301,12 +1365,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Origin;
}
- /// \brief Get the origin for i-th argument of the instruction I.
+ /// Get the origin for i-th argument of the instruction I.
Value *getOrigin(Instruction *I, int i) {
return getOrigin(I->getOperand(i));
}
- /// \brief Remember the place where a shadow check should be inserted.
+ /// Remember the place where a shadow check should be inserted.
///
/// This location will later be instrumented with a check that will print a
/// UMR warning at runtime if the shadow value is not 0.
@@ -1322,7 +1386,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
}
- /// \brief Remember the place where a shadow check should be inserted.
+ /// Remember the place where a shadow check should be inserted.
///
/// This location will later be instrumented with a check that will print a
/// UMR warning at runtime if the value is not fully defined.
@@ -1382,7 +1446,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
InstVisitor<MemorySanitizerVisitor>::visit(I);
}
- /// \brief Instrument LoadInst
+ /// Instrument LoadInst
///
/// Loads the corresponding shadow and (optionally) origin.
/// Optionally, checks that the load address is fully defined.
@@ -1396,7 +1460,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = I.getAlignment();
if (PropagateShadow) {
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
@@ -1418,12 +1482,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- /// \brief Instrument StoreInst
+ /// Instrument StoreInst
///
/// Stores the corresponding shadow and (optionally) origin.
/// Optionally, checks that the store address is fully defined.
void visitStoreInst(StoreInst &I) {
StoreList.push_back(&I);
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
}
void handleCASOrRMW(Instruction &I) {
@@ -1431,8 +1497,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Addr = I.getOperand(0);
- Value *ShadowPtr =
- getShadowOriginPtr(Addr, IRB, I.getType(), /*Alignment*/ 1).first;
+ Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(),
+ /*Alignment*/ 1, /*isStore*/ true)
+ .first;
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
@@ -1536,7 +1603,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
- /// \brief Propagate shadow for bitwise AND.
+ /// Propagate shadow for bitwise AND.
///
/// This code is exact, i.e. if, for example, a bit in the left argument
/// is defined and 0, then neither the value not definedness of the
@@ -1585,7 +1652,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Default propagation of shadow and/or origin.
+ /// Default propagation of shadow and/or origin.
///
/// This class implements the general case of shadow propagation, used in all
/// cases where we don't know and/or don't care about what the operation
@@ -1611,7 +1678,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
: IRB(IRB), MSV(MSV) {}
- /// \brief Add a pair of shadow and origin values to the mix.
+ /// Add a pair of shadow and origin values to the mix.
Combiner &Add(Value *OpShadow, Value *OpOrigin) {
if (CombineShadow) {
assert(OpShadow);
@@ -1641,14 +1708,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return *this;
}
- /// \brief Add an application value to the mix.
+ /// Add an application value to the mix.
Combiner &Add(Value *V) {
Value *OpShadow = MSV->getShadow(V);
Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
return Add(OpShadow, OpOrigin);
}
- /// \brief Set the current combined values as the given instruction's shadow
+ /// Set the current combined values as the given instruction's shadow
/// and origin.
void Done(Instruction *I) {
if (CombineShadow) {
@@ -1666,7 +1733,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
using ShadowAndOriginCombiner = Combiner<true>;
using OriginCombiner = Combiner<false>;
- /// \brief Propagate origin for arbitrary operation.
+ /// Propagate origin for arbitrary operation.
void setOriginForNaryOp(Instruction &I) {
if (!MS.TrackOrigins) return;
IRBuilder<> IRB(&I);
@@ -1684,7 +1751,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Ty->getPrimitiveSizeInBits();
}
- /// \brief Cast between two shadow types, extending or truncating as
+ /// Cast between two shadow types, extending or truncating as
/// necessary.
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
@@ -1706,7 +1773,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// TODO: handle struct types.
}
- /// \brief Cast an application value to the type of its own shadow.
+ /// Cast an application value to the type of its own shadow.
Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
Type *ShadowTy = getShadowTy(V);
if (V->getType() == ShadowTy)
@@ -1717,7 +1784,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateBitCast(V, ShadowTy);
}
- /// \brief Propagate shadow for arbitrary operation.
+ /// Propagate shadow for arbitrary operation.
void handleShadowOr(Instruction &I) {
IRBuilder<> IRB(&I);
ShadowAndOriginCombiner SC(this, IRB);
@@ -1726,7 +1793,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SC.Done(&I);
}
- // \brief Handle multiplication by constant.
+ // Handle multiplication by constant.
//
// Handle a special case of multiplication by constant that may have one or
// more zeros in the lower bits. This makes corresponding number of lower bits
@@ -1788,7 +1855,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitSub(BinaryOperator &I) { handleShadowOr(I); }
void visitXor(BinaryOperator &I) { handleShadowOr(I); }
- void handleDiv(Instruction &I) {
+ void handleIntegerDiv(Instruction &I) {
IRBuilder<> IRB(&I);
// Strict on the second argument.
insertShadowCheck(I.getOperand(1), &I);
@@ -1796,14 +1863,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(&I, 0));
}
- void visitUDiv(BinaryOperator &I) { handleDiv(I); }
- void visitSDiv(BinaryOperator &I) { handleDiv(I); }
- void visitFDiv(BinaryOperator &I) { handleDiv(I); }
- void visitURem(BinaryOperator &I) { handleDiv(I); }
- void visitSRem(BinaryOperator &I) { handleDiv(I); }
- void visitFRem(BinaryOperator &I) { handleDiv(I); }
+ void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
+
+ // Floating point division is side-effect free. We cannot require that the
+ // divisor is fully initialized and must propagate shadow. See PR37523.
+ void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
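// For illustration, the PR37523-style situation motivating this change (the
// function is hypothetical): the division itself must not trigger a report,
// because its result may never reach a checked operation.
static float fdivExample(float a, bool use_it) {
  float b;                  // uninitialized divisor
  float r = a / b;          // r's shadow becomes poisoned; no report here
  return use_it ? r : 0.0f; // a warning fires only where r is actually checked
}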
- /// \brief Instrument == and != comparisons.
+ /// Instrument == and != comparisons.
///
/// Sometimes the comparison result is known even if some of the bits of the
/// arguments are not.
@@ -1841,7 +1911,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Build the lowest possible value of V, taking into account V's
+ /// Build the lowest possible value of V, taking into account V's
/// uninitialized bits.
Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
bool isSigned) {
@@ -1858,7 +1928,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- /// \brief Build the highest possible value of V, taking into account V's
+ /// Build the highest possible value of V, taking into account V's
/// uninitialized bits.
Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
bool isSigned) {
@@ -1875,7 +1945,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
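// As a worked illustration of the bounds idea (unsigned case; the bit
// patterns are hypothetical): uninitialized bits, marked in the shadow Sa,
// can take either value, so the lowest value clears them and the highest
// value sets them.
#include <cstdint>
static void boundsExample() {
  uint8_t A = 0xA6;     // application value; its low nibble is garbage
  uint8_t Sa = 0x0F;    // shadow: the low nibble is uninitialized
  uint8_t Lo = A & ~Sa; // 0xA0: every uninitialized bit forced to 0
  uint8_t Hi = A | Sa;  // 0xAF: every uninitialized bit forced to 1
  // A relational compare such as (x < 0xB0) has a known result exactly when
  // Lo and Hi agree on its outcome, despite the uninitialized bits.
  (void)Lo;
  (void)Hi;
}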
- /// \brief Instrument relational comparisons.
+ /// Instrument relational comparisons.
///
/// This function does exact shadow propagation for all relational
/// comparisons of integers, pointers and vectors of those.
@@ -1908,7 +1978,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// \brief Instrument signed relational comparisons.
+ /// Instrument signed relational comparisons.
///
/// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
/// bit of the shadow. Everything else is delegated to handleShadowOr().
@@ -1992,7 +2062,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitAShr(BinaryOperator &I) { handleShift(I); }
void visitLShr(BinaryOperator &I) { handleShift(I); }
- /// \brief Instrument llvm.memmove
+ /// Instrument llvm.memmove
///
/// At this point we don't know if llvm.memmove will be inlined or not.
/// If we don't instrument it and it gets inlined,
@@ -2045,7 +2115,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
VAHelper->visitVACopyInst(I);
}
- /// \brief Handle vector store-like intrinsics.
+ /// Handle vector store-like intrinsics.
///
/// Instrument intrinsics that look like a simple SIMD store: writes memory,
/// has 1 pointer argument and 1 vector argument, returns void.
@@ -2057,8 +2127,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// We don't know the pointer alignment (could be unaligned SSE store!).
// Have to assume the worst case.
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Shadow->getType(), /*Alignment*/ 1);
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true);
IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
if (ClCheckAccessAddress)
@@ -2069,7 +2139,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Handle vector load-like intrinsics.
+ /// Handle vector load-like intrinsics.
///
/// Instrument intrinsics that look like a simple SIMD load: reads memory,
/// has 1 pointer argument, returns a vector.
@@ -2084,7 +2154,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Have to assume the worst case.
unsigned Alignment = 1;
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment);
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
@@ -2102,7 +2172,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Handle (SIMD arithmetic)-like intrinsics.
+ /// Handle (SIMD arithmetic)-like intrinsics.
///
/// Instrument intrinsics with any number of arguments of the same type,
/// equal to the return type. The type should be simple (no aggregates or
@@ -2132,7 +2202,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// \brief Heuristically instrument unknown intrinsics.
+ /// Heuristically instrument unknown intrinsics.
///
/// The main purpose of this code is to do something reasonable with all
/// random intrinsics we might encounter, most importantly - SIMD intrinsics.
@@ -2182,7 +2252,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(Op));
}
- // \brief Instrument vector convert instrinsic.
+ // Instrument vector convert intrinsic.
//
// This function instruments intrinsics like cvtsi2ss:
// %Out = int_xxx_cvtyyy(%ConvertOp)
@@ -2285,7 +2355,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateSExt(S2, T);
}
- // \brief Instrument vector shift instrinsic.
+ // Instrument vector shift intrinsic.
//
// This function instruments intrinsics like int_x86_avx2_psll_w.
// Intrinsic shifts %In by %ShiftSize bits.
@@ -2310,14 +2380,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Get an X86_MMX-sized vector type.
+ // Get an X86_MMX-sized vector type.
Type *getMMXVectorTy(unsigned EltSizeInBits) {
const unsigned X86_MMXSizeInBits = 64;
return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
X86_MMXSizeInBits / EltSizeInBits);
}
- // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // Returns a signed counterpart for an (un)signed-saturate-and-pack
// intrinsic.
Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
switch (id) {
@@ -2348,7 +2418,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- // \brief Instrument vector pack instrinsic.
+ // Instrument vector pack intrinsic.
//
// This function instruments intrinsics like x86_mmx_packsswb, that
// packs elements of 2 input vectors into half as many bits with saturation.
@@ -2391,7 +2461,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument sum-of-absolute-differencies intrinsic.
+ // Instrument sum-of-absolute-differences intrinsic.
void handleVectorSadIntrinsic(IntrinsicInst &I) {
const unsigned SignificantBitsPerResultElement = 16;
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
@@ -2410,7 +2480,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument multiply-add intrinsic.
+ // Instrument multiply-add intrinsic.
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
unsigned EltSizeInBits = 0) {
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
@@ -2425,7 +2495,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument compare-packed intrinsic.
+ // Instrument compare-packed intrinsic.
// Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
// all-ones shadow.
void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
@@ -2438,7 +2508,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // \brief Instrument compare-scalar intrinsic.
+ // Instrument compare-scalar intrinsic.
// This handles both cmp* intrinsics which return the result in the first
// element of a vector, and comi* which return the result as i32.
void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
@@ -2453,7 +2523,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value* Addr = I.getArgOperand(0);
Type *Ty = IRB.getInt32Ty();
- Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1).first;
+ Value *ShadowPtr =
+ getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true)
+ .first;
IRB.CreateStore(getCleanShadow(Ty),
IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
@@ -2471,7 +2543,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned Alignment = 1;
Value *ShadowPtr, *OriginPtr;
std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Ty, Alignment);
+ getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
@@ -2482,11 +2554,98 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(Shadow, Origin, &I);
}
+ void handleMaskedStore(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *V = I.getArgOperand(0);
+ Value *Addr = I.getArgOperand(1);
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Value *Mask = I.getArgOperand(3);
+ Value *Shadow = getShadow(V);
+
+ Value *ShadowPtr;
+ Value *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Align, /*isStore*/ true);
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ // Uninitialized mask is kind of like uninitialized address, but not as
+ // scary.
+ insertShadowCheck(Mask, &I);
+ }
+
+ IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask);
+
+ if (MS.TrackOrigins) {
+ auto &DL = F.getParent()->getDataLayout();
+ paintOrigin(IRB, getOrigin(V), OriginPtr,
+ DL.getTypeStoreSize(Shadow->getType()),
+ std::max(Align, kMinOriginAlignment));
+ }
+ }
+
+ bool handleMaskedLoad(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Value *Mask = I.getArgOperand(2);
+ Value *PassThru = I.getArgOperand(3);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr, *OriginPtr;
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false);
+ setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask,
+ getShadow(PassThru), "_msmaskedld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ insertShadowCheck(Mask, &I);
+ }
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ // Choose between PassThru's and the loaded value's origins.
+ Value *MaskedPassThruShadow = IRB.CreateAnd(
+ getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
+
+ Value *Acc = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1, N = PassThru->getType()->getVectorNumElements(); i < N;
+ ++i) {
+ Value *More = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ Acc = IRB.CreateOr(Acc, More);
+ }
+
+ Value *Origin = IRB.CreateSelect(
+ IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
+ getOrigin(PassThru), IRB.CreateLoad(OriginPtr));
+
+ setOrigin(&I, Origin);
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ return true;
+ }
+
+
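// A scalar model of the origin selection above, assuming a 4-lane vector
// (the helper name and values are hypothetical): if any PassThru lane that
// survives into the result is poisoned, keep PassThru's origin; otherwise
// use the origin loaded from shadow memory.
#include <cstdint>
static uint32_t chooseMaskedLoadOrigin(const uint32_t PassThruShadow[4],
                                       const bool Mask[4],
                                       uint32_t PassThruOrigin,
                                       uint32_t LoadedOrigin) {
  uint32_t Acc = 0;
  for (int i = 0; i < 4; ++i)
    if (!Mask[i]) // lanes the masked load leaves as PassThru
      Acc |= PassThruShadow[i];
  return Acc != 0 ? PassThruOrigin : LoadedOrigin;
}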
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::bswap:
handleBswap(I);
break;
+ case Intrinsic::masked_store:
+ handleMaskedStore(I);
+ break;
+ case Intrinsic::masked_load:
+ handleMaskedLoad(I);
+ break;
case Intrinsic::x86_sse_stmxcsr:
handleStmxcsr(I);
break;
@@ -2501,20 +2660,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512_cvttss2usi:
case Intrinsic::x86_avx512_cvttsd2usi64:
case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvtusi2sd:
case Intrinsic::x86_avx512_cvtusi2ss:
case Intrinsic::x86_avx512_cvtusi642sd:
case Intrinsic::x86_avx512_cvtusi642ss:
case Intrinsic::x86_sse2_cvtsd2si64:
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2ss:
- case Intrinsic::x86_sse2_cvtsi2sd:
- case Intrinsic::x86_sse2_cvtsi642sd:
- case Intrinsic::x86_sse2_cvtss2sd:
case Intrinsic::x86_sse2_cvttsd2si64:
case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse_cvtsi2ss:
- case Intrinsic::x86_sse_cvtsi642ss:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvttss2si64:
@@ -2715,7 +2868,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// outputs as clean. Note that any side effects of the inline asm that are
// not immediately visible in its constraints are not handled.
if (Call->isInlineAsm()) {
- visitInstruction(I);
+ if (ClHandleAsmConservative)
+ visitAsmInstruction(I);
+ else
+ visitInstruction(I);
return;
}
@@ -2738,13 +2894,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
unsigned ArgOffset = 0;
- DEBUG(dbgs() << " CallSite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " CallSite: " << I << "\n");
for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
unsigned i = ArgIt - CS.arg_begin();
if (!A->getType()->isSized()) {
- DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
continue;
}
unsigned Size = 0;
@@ -2754,8 +2910,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// __msan_param_tls.
Value *ArgShadow = getShadow(A);
Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- DEBUG(dbgs() << " Arg#" << i << ": " << *A <<
- " Shadow: " << *ArgShadow << "\n");
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
if (CS.paramHasAttr(i, Attribute::ByVal)) {
@@ -2765,10 +2921,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ArgOffset + Size > kParamTLSSize) break;
unsigned ParamAlignment = CS.getParamAlignment(i);
unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
- Value *AShadowPtr =
- getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment).first;
+ Value *AShadowPtr = getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ false)
+ .first;
- Store = IRB.CreateMemCpy(ArgShadowBase, AShadowPtr, Size, Alignment);
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
} else {
Size = DL.getTypeAllocSize(A->getType());
if (ArgOffset + Size > kParamTLSSize) break;
@@ -2782,10 +2940,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getOriginPtrForArgument(A, IRB, ArgOffset));
(void)Store;
assert(Size != 0 && Store != nullptr);
- DEBUG(dbgs() << " Param:" << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
ArgOffset += alignTo(Size, 8);
}
- DEBUG(dbgs() << " done with call args\n");
+ LLVM_DEBUG(dbgs() << " done with call args\n");
FunctionType *FT =
cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
@@ -2888,8 +3046,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.CreateCall(MS.MsanPoisonStackFn,
{IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
} else {
- Value *ShadowBase =
- getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), I.getAlignment()).first;
+ Value *ShadowBase = getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(),
+ I.getAlignment(), /*isStore*/ true)
+ .first;
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
@@ -2991,24 +3150,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitExtractValueInst(ExtractValueInst &I) {
IRBuilder<> IRB(&I);
Value *Agg = I.getAggregateOperand();
- DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
Value *AggShadow = getShadow(Agg);
- DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
- DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
setShadow(&I, ResShadow);
setOriginForNaryOp(I);
}
void visitInsertValueInst(InsertValueInst &I) {
IRBuilder<> IRB(&I);
- DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
Value *AggShadow = getShadow(I.getAggregateOperand());
Value *InsShadow = getShadow(I.getInsertedValueOperand());
- DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
- DEBUG(dbgs() << " Res: " << *Res << "\n");
+ LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
setShadow(&I, Res);
setOriginForNaryOp(I);
}
@@ -3023,25 +3182,58 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitResumeInst(ResumeInst &I) {
- DEBUG(dbgs() << "Resume: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
// Nothing to do here.
}
void visitCleanupReturnInst(CleanupReturnInst &CRI) {
- DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+ LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
// Nothing to do here.
}
void visitCatchReturnInst(CatchReturnInst &CRI) {
- DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+ LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
// Nothing to do here.
}
+ void visitAsmInstruction(Instruction &I) {
+ // Conservative inline assembly handling: check for poisoned shadow of
+ // asm() arguments, then unpoison the result and all the memory locations
+ // pointed to by those arguments.
+ // visitAsmInstruction is only reached for inline-asm call sites, so cast<>
+ // (which asserts) is safe here.
+ CallInst *CI = cast<CallInst>(&I);
+
+ for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+ Value *Operand = CI->getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ IRBuilder<> IRB(&I);
+ IRB.SetInsertPoint(I.getNextNode());
+ for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+ Value *Operand = CI->getOperand(i);
+ Type *OpType = Operand->getType();
+ if (!OpType->isPointerTy())
+ continue;
+ Type *ElType = OpType->getPointerElementType();
+ if (!ElType->isSized())
+ continue;
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Operand, IRB, ElType, /*Alignment*/ 1, /*isStore*/ true);
+ Value *CShadow = getCleanShadow(ElType);
+ IRB.CreateStore(
+ CShadow,
+ IRB.CreatePointerCast(ShadowPtr, CShadow->getType()->getPointerTo()));
+ }
+ }
+
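The new visitAsmInstruction implements a deliberately conservative policy: inputs to the asm are checked for poison before it runs, and afterwards the result plus everything reachable through pointer operands is treated as fully initialized, since MSan cannot see what the asm actually wrote. A hypothetical C-level consequence of that policy:

    int x;                                   // shadow starts fully poisoned
    asm volatile("" :: "r"(&x) : "memory");  // &x is a pointer operand of the asm
    return x;                                // no MSan report: x was conservatively unpoisoned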
void visitInstruction(Instruction &I) {
// Everything else: stop propagating and check for poisoned shadow.
if (ClDumpStrictInstructions)
dumpInst(I);
- DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
Value *Operand = I.getOperand(i);
if (Operand->getType()->isSized())
@@ -3052,7 +3244,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
};
-/// \brief AMD64-specific implementation of VarArgHelper.
+/// AMD64-specific implementation of VarArgHelper.
struct VarArgAMD64Helper : public VarArgHelper {
// An unfortunate workaround for asymmetric lowering of va_arg stuff.
// See a comment in visitCallSite for more details.
@@ -3116,10 +3308,12 @@ struct VarArgAMD64Helper : public VarArgHelper {
getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
OverflowOffset += alignTo(ArgSize, 8);
Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
+ /*isStore*/ false);
- IRB.CreateMemCpy(ShadowBase, ShadowPtr, ArgSize, kShadowTLSAlignment);
+ IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
+ kShadowTLSAlignment, ArgSize);
} else {
ArgKind AK = classifyArgument(A);
if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
@@ -3157,7 +3351,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3172,7 +3366,8 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
// Unpoison the whole __va_list_tag.
// FIXME: magic ABI constants.
@@ -3200,13 +3395,13 @@ struct VarArgAMD64Helper : public VarArgHelper {
if (!VAStartInstrumentationList.empty()) {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize =
IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
VAArgOverflowSize);
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3219,33 +3414,33 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
ConstantInt::get(MS.IntptrTy, 16)),
- Type::getInt64PtrTy(*MS.C));
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 16;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, AMD64FpEndOffset,
- Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ AMD64FpEndOffset);
Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
ConstantInt::get(MS.IntptrTy, 8)),
- Type::getInt64PtrTy(*MS.C));
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
+ Alignment, /*isStore*/ true);
Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize,
- Alignment);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
}
}
};
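The va_start instrumentation above navigates the System V AMD64 __va_list_tag via hard-coded byte offsets (hence the FIXME about magic ABI constants). For reference, the layout those offsets 8 and 16 come from, per the SysV psABI:

    typedef struct {
      unsigned gp_offset;       // +0:  next general-purpose register slot
      unsigned fp_offset;       // +4:  next FP/SSE register slot
      void *overflow_arg_area;  // +8:  stack-passed args (loaded via VAListTag + 8)
      void *reg_save_area;      // +16: spilled register args (loaded via VAListTag + 16)
    } __va_list_tag;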
-/// \brief MIPS64-specific implementation of VarArgHelper.
+/// MIPS64-specific implementation of VarArgHelper.
struct VarArgMIPS64Helper : public VarArgHelper {
Function &F;
MemorySanitizer &MS;
@@ -3286,7 +3481,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3301,8 +3496,8 @@ struct VarArgMIPS64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3313,8 +3508,8 @@ struct VarArgMIPS64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3322,7 +3517,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
void finalizeInstrumentation() override {
assert(!VAArgSize && !VAArgTLSCopy &&
"finalizeInstrumentation called twice");
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
VAArgSize);
@@ -3331,7 +3526,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3341,20 +3536,21 @@ struct VarArgMIPS64Helper : public VarArgHelper {
IRBuilder<> IRB(OrigInst->getNextNode());
Value *VAListTag = OrigInst->getArgOperand(0);
Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- Type::getInt64PtrTy(*MS.C));
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 8;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
}
}
};
-/// \brief AArch64-specific implementation of VarArgHelper.
+/// AArch64-specific implementation of VarArgHelper.
struct VarArgAArch64Helper : public VarArgHelper {
static const unsigned kAArch64GrArgSize = 64;
static const unsigned kAArch64VrArgSize = 128;
@@ -3461,8 +3657,8 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 32, Alignment, false);
}
@@ -3473,8 +3669,8 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 32, Alignment, false);
}
@@ -3506,13 +3702,13 @@ struct VarArgAArch64Helper : public VarArgHelper {
if (!VAStartInstrumentationList.empty()) {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize =
IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
VAArgOverflowSize);
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
@@ -3563,14 +3759,14 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *GrRegSaveAreaShadowPtr =
MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 8)
+ /*Alignment*/ 8, /*isStore*/ true)
.first;
Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
GrRegSaveAreaShadowPtrOff);
Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
- IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8);
+ IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize);
// Again, but for FP/SIMD values.
Value *VrRegSaveAreaShadowPtrOff =
@@ -3578,7 +3774,7 @@ struct VarArgAArch64Helper : public VarArgHelper {
Value *VrRegSaveAreaShadowPtr =
MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 8)
+ /*Alignment*/ 8, /*isStore*/ true)
.first;
Value *VrSrcPtr = IRB.CreateInBoundsGEP(
@@ -3588,25 +3784,25 @@ struct VarArgAArch64Helper : public VarArgHelper {
VrRegSaveAreaShadowPtrOff);
Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
- IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8);
+ IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize);
// And finally for remaining arguments.
Value *StackSaveAreaShadowPtr =
MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
- /*Alignment*/ 16)
+ /*Alignment*/ 16, /*isStore*/ true)
.first;
Value *StackSrcPtr =
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
IRB.getInt32(AArch64VAEndOffset));
- IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr,
- VAArgOverflowSize, 16);
+ IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16,
+ VAArgOverflowSize);
}
}
};
-/// \brief PowerPC64-specific implementation of VarArgHelper.
+/// PowerPC64-specific implementation of VarArgHelper.
struct VarArgPowerPC64Helper : public VarArgHelper {
Function &F;
MemorySanitizer &MS;
@@ -3657,9 +3853,10 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
VAArgOffset - VAArgBase);
Value *AShadowPtr, *AOriginPtr;
std::tie(AShadowPtr, AOriginPtr) = MSV.getShadowOriginPtr(
- A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment);
+ A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment, /*isStore*/ false);
- IRB.CreateMemCpy(Base, AShadowPtr, ArgSize, kShadowTLSAlignment);
+ IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
+ kShadowTLSAlignment, ArgSize);
}
VAArgOffset += alignTo(ArgSize, 8);
} else {
@@ -3704,7 +3901,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
}
- /// \brief Compute the shadow address for a given va_arg.
+ /// Compute the shadow address for a given va_arg.
Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
int ArgOffset) {
Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
@@ -3719,8 +3916,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
/* size */ 8, Alignment, false);
}
@@ -3730,8 +3927,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *VAListTag = I.getArgOperand(0);
Value *ShadowPtr, *OriginPtr;
unsigned Alignment = 8;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
// Unpoison the whole __va_list_tag.
// FIXME: magic ABI constants.
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
@@ -3741,7 +3938,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
void finalizeInstrumentation() override {
assert(!VAArgSize && !VAArgTLSCopy &&
"finalizeInstrumentation called twice");
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
VAArgSize);
@@ -3750,7 +3947,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
// If there is a va_start in this function, make a backup copy of
// va_arg_tls somewhere in the function entry block.
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize);
}
// Instrument va_start.
@@ -3760,20 +3957,21 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
IRBuilder<> IRB(OrigInst->getNextNode());
Value *VAListTag = OrigInst->getArgOperand(0);
Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- Type::getInt64PtrTy(*MS.C));
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(Type::getInt64PtrTy(*MS.C), 0));
Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
unsigned Alignment = 8;
std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, Alignment);
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
}
}
};
-/// \brief A no-op implementation of VarArgHelper.
+/// A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper {
VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
MemorySanitizerVisitor &MSV) {}
@@ -3796,8 +3994,7 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
Triple TargetTriple(Func.getParent()->getTargetTriple());
if (TargetTriple.getArch() == Triple::x86_64)
return new VarArgAMD64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::mips64 ||
- TargetTriple.getArch() == Triple::mips64el)
+ else if (TargetTriple.isMIPS64())
return new VarArgMIPS64Helper(Func, Msan, Visitor);
else if (TargetTriple.getArch() == Triple::aarch64)
return new VarArgAArch64Helper(Func, Msan, Visitor);
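Triple::isMIPS64() is a convenience predicate that folds both byte orders into one check; it is equivalent to the removed two-way comparison:

    bool IsMIPS64 = TargetTriple.getArch() == Triple::mips64 ||
                    TargetTriple.getArch() == Triple::mips64el;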
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index cb4b3a9c2545..307b7eaa2196 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -48,7 +48,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "CFGMST.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -119,6 +119,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "pgo-instrumentation"
@@ -223,8 +224,8 @@ static cl::opt<bool>
EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
cl::desc("When this option is on, the annotated "
"branch probability will be emitted as "
- " optimization remarks: -Rpass-analysis="
- "pgo-instr-use"));
+ "optimization remarks: -{Rpass|"
+ "pass-remarks}=pgo-instrumentation"));
// Command line option to turn on CFG dot dump after profile annotation.
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
@@ -448,7 +449,7 @@ ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename) {
namespace {
-/// \brief An MST based instrumentation for PGO
+/// An MST based instrumentation for PGO
///
/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
/// in the function level.
@@ -545,7 +546,7 @@ public:
computeCFGHash();
if (!ComdatMembers.empty())
renameComdatFunction();
- DEBUG(dumpInfo("after CFGMST"));
+ LLVM_DEBUG(dumpInfo("after CFGMST"));
NumOfPGOBB += MST.BBInfos.size();
for (auto &E : MST.AllEdges) {
@@ -595,12 +596,12 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
(uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
(uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
- DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
- << " CRC = " << JC.getCRC()
- << ", Selects = " << SIVisitor.getNumOfSelectInsts()
- << ", Edges = " << MST.AllEdges.size()
- << ", ICSites = " << ValueSites[IPVK_IndirectCallTarget].size()
- << ", Hash = " << FunctionHash << "\n";);
+ LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
+ << " CRC = " << JC.getCRC()
+ << ", Selects = " << SIVisitor.getNumOfSelectInsts()
+ << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
+ << ValueSites[IPVK_IndirectCallTarget].size()
+ << ", Hash = " << FunctionHash << "\n";);
}
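The hash computed above packs four CFG fingerprints into a single 64-bit value, so a stale profile is rejected if the function's shape changed since instrumentation. Schematically (the operand names are placeholders for the accessors in the code):

    // FunctionHash layout, high bits to low:
    //   [63:56] select-instruction count
    //   [55:48] indirect-call value-site count
    //   [47:32] MST edge count
    //   [31:0]  JamCRC over the control flow
    uint64_t FunctionHash = (uint64_t)NumSelects << 56 |
                            (uint64_t)NumICSites << 48 |
                            (uint64_t)NumEdges   << 32 | CRC;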
// Check if we can safely rename this Comdat function.
@@ -701,8 +702,8 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
// For a critical edge, we have to split. Instrument the newly
// created BB.
NumOfPGOSplit++;
- DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> "
- << getBBInfo(DestBB).Index << "\n");
+ LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
+ << " --> " << getBBInfo(DestBB).Index << "\n");
unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum);
assert(InstrBB && "Critical edge is not split");
@@ -752,8 +753,8 @@ static void instrumentOneFunc(
for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
CallSite CS(I);
Value *Callee = CS.getCalledValue();
- DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
- << NumIndirectCallSites << "\n");
+ LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
+ << NumIndirectCallSites << "\n");
IRBuilder<> Builder(I);
assert(Builder.GetInsertPoint() != I->getParent()->end() &&
"Cannot get the Instrumentation point");
@@ -861,7 +862,7 @@ public:
// Set the branch weights based on the count values.
void setBranchWeights();
- // Annotate the value profile call sites all all value kind.
+ // Annotate the value profile call sites for all value kinds.
void annotateValueSites();
// Annotate the value profile call sites for one value kind.
@@ -1041,14 +1042,14 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
NumOfPGOFunc++;
- DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+ LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
uint64_t ValueSum = 0;
for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
- DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+ LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
ValueSum += CountFromProfile[I];
}
- DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+ LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
getBBInfo(nullptr).UnknownCountOutEdge = 2;
getBBInfo(nullptr).UnknownCountInEdge = 2;
@@ -1128,7 +1129,7 @@ void PGOUseFunc::populateCounters() {
}
}
- DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+ LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
#ifndef NDEBUG
// Assert every BB has a valid counter.
for (auto &BB : F) {
@@ -1139,7 +1140,7 @@ void PGOUseFunc::populateCounters() {
}
#endif
uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
- F.setEntryCount(FuncEntryCount);
+ F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
uint64_t FuncMaxCount = FuncEntryCount;
for (auto &BB : F) {
auto BI = findBBInfo(&BB);
@@ -1153,13 +1154,13 @@ void PGOUseFunc::populateCounters() {
FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
assert(CountPosition == ProfileCountSize);
- DEBUG(FuncInfo.dumpInfo("after reading profile."));
+ LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
// Assign the scaled count values to the BB with multiple out edges.
void PGOUseFunc::setBranchWeights() {
// Generate MD_prof metadata for every branch instruction.
- DEBUG(dbgs() << "\nSetting branch weights.\n");
+ LLVM_DEBUG(dbgs() << "\nSetting branch weights.\n");
for (auto &BB : F) {
TerminatorInst *TI = BB.getTerminator();
if (TI->getNumSuccessors() < 2)
@@ -1200,7 +1201,7 @@ static bool isIndirectBrTarget(BasicBlock *BB) {
}
void PGOUseFunc::annotateIrrLoopHeaderWeights() {
- DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
// Find irr loop headers
for (auto &BB : F) {
// As a heuristic, also annotate indirectbr targets, as they have a high chance
@@ -1333,9 +1334,9 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) {
}
for (auto &I : ValueSites) {
- DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
- << "): Index = " << ValueSiteIndex << " out of "
- << NumValueSites << "\n");
+ LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
annotateValueSite(*M, *I, ProfileRecord,
static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
@@ -1431,7 +1432,7 @@ static bool annotateAllFunctions(
Module &M, StringRef ProfileFileName,
function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
- DEBUG(dbgs() << "Read in profile counters: ");
+ LLVM_DEBUG(dbgs() << "Read in profile counters: ");
auto &Ctx = M.getContext();
// Read the counter array from file.
auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
@@ -1517,12 +1518,13 @@ static bool annotateAllFunctions(
// inconsistent MST between prof-gen and prof-use.
for (auto &F : HotFunctions) {
F->addFnAttr(Attribute::InlineHint);
- DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
}
for (auto &F : ColdFunctions) {
F->addFnAttr(Attribute::Cold);
- DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+ << "\n");
}
return true;
}
@@ -1585,22 +1587,25 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
for (const auto &ECI : EdgeCounts)
Weights.push_back(scaleBranchCount(ECI, Scale));
- DEBUG(dbgs() << "Weight is: ";
- for (const auto &W : Weights) { dbgs() << W << " "; }
- dbgs() << "\n";);
+ LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
+ : Weights) {
+ dbgs() << W << " ";
+ } dbgs() << "\n";);
TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
if (EmitBranchProbability) {
std::string BrCondStr = getBranchCondString(TI);
if (BrCondStr.empty())
return;
- unsigned WSum =
- std::accumulate(Weights.begin(), Weights.end(), 0,
- [](unsigned w1, unsigned w2) { return w1 + w2; });
+ uint64_t WSum =
+ std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
+ [](uint64_t w1, uint64_t w2) { return w1 + w2; });
uint64_t TotalCount =
- std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), 0,
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
[](uint64_t c1, uint64_t c2) { return c1 + c2; });
- BranchProbability BP(Weights[0], WSum);
+ Scale = calculateCountScale(WSum);
+ BranchProbability BP(scaleBranchCount(Weights[0], Scale),
+ scaleBranchCount(WSum, Scale));
std::string BranchProbStr;
raw_string_ostream OS(BranchProbStr);
OS << BP;
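The std::accumulate changes above are an overflow fix: branch weights are 32-bit, so summing many of them into an unsigned can wrap, and BranchProbability requires its numerator and denominator to fit 32-bit arithmetic. The pattern, using the same scaling helpers the function already calls:

    uint64_t WSum = std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0);
    // Scale numerator and denominator down together, preserving the ratio,
    // until both fit BranchProbability's 32-bit math.
    uint64_t Scale = calculateCountScale(WSum);
    BranchProbability BP(scaleBranchCount(Weights[0], Scale),
                         scaleBranchCount(WSum, Scale));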
diff --git a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 95eb3680403a..2c71e75dadcc 100644
--- a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -25,6 +25,8 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DomTreeUpdater.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -44,7 +46,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <cstdint>
@@ -112,6 +114,7 @@ private:
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
}
};
} // end anonymous namespace
@@ -133,8 +136,8 @@ namespace {
class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
public:
MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE)
- : Func(Func), BFI(BFI), ORE(ORE), Changed(false) {
+ OptimizationRemarkEmitter &ORE, DominatorTree *DT)
+ : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
ValueDataArray =
llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
// Get the MemOPSize range information from option MemOPSizeRange,
@@ -151,8 +154,9 @@ public:
if (perform(MI)) {
Changed = true;
++NumOfPGOMemOPOpt;
- DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
- << "is Transformed.\n");
+ LLVM_DEBUG(dbgs() << "MemOP call: "
+ << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
}
}
}
@@ -169,6 +173,7 @@ private:
Function &Func;
BlockFrequencyInfo &BFI;
OptimizationRemarkEmitter &ORE;
+ DominatorTree *DT;
bool Changed;
std::vector<MemIntrinsic *> WorkList;
// Start of the precise range.
@@ -245,9 +250,9 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
}
ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
- DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
- << "\n");
- DEBUG(
+ LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
+ << ActualCount << "\n");
+ LLVM_DEBUG(
for (auto &VD
: VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
@@ -260,8 +265,8 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
TotalCount = ActualCount;
if (MemOPScaleCount)
- DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
- << " denominator = " << SavedTotalCount << "\n");
+ LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
// Keeping track of the count of the default case:
uint64_t RemainCount = TotalCount;
@@ -310,9 +315,9 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
uint64_t SumForOpt = TotalCount - RemainCount;
- DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
- << " Versions (covering " << SumForOpt << " out of "
- << TotalCount << ")\n");
+ LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
// mem_op(..., size)
// ==>
@@ -331,19 +336,20 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
// merge_bb:
BasicBlock *BB = MI->getParent();
- DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
- DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
auto OrigBBFreq = BFI.getBlockFreq(BB);
- BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
BasicBlock::iterator It(*MI);
++It;
assert(It != DefaultBB->end());
- BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
MergeBB->setName("MemOP.Merge");
BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
DefaultBB->setName("MemOP.Default");
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
auto &Ctx = Func.getContext();
IRBuilder<> IRB(BB);
BB->getTerminator()->eraseFromParent();
@@ -358,7 +364,11 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
SavedRemainCount, IPVK_MemOPSize, NumVals);
- DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DT)
+ Updates.reserve(2 * SizeIds.size());
for (uint64_t SizeId : SizeIds) {
BasicBlock *CaseBB = BasicBlock::Create(
@@ -374,13 +384,20 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
IRBuilder<> IRBCase(CaseBB);
IRBCase.CreateBr(MergeBB);
SI->addCase(CaseSizeId, CaseBB);
- DEBUG(dbgs() << *CaseBB << "\n");
+ if (DT) {
+ Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
+ Updates.push_back({DominatorTree::Insert, BB, CaseBB});
+ }
+ LLVM_DEBUG(dbgs() << *CaseBB << "\n");
}
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
- DEBUG(dbgs() << *BB << "\n");
- DEBUG(dbgs() << *DefaultBB << "\n");
- DEBUG(dbgs() << *MergeBB << "\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
+ LLVM_DEBUG(dbgs() << *MergeBB << "\n");
ORE.emit([&]() {
using namespace ore;
@@ -396,13 +413,14 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
} // namespace
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE) {
+ OptimizationRemarkEmitter &ORE,
+ DominatorTree *DT) {
if (DisableMemOPOPT)
return false;
if (F.hasFnAttribute(Attribute::OptimizeForSize))
return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE);
+ MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
MemOPSizeOpt.perform();
return MemOPSizeOpt.isChanged();
}
@@ -411,7 +429,9 @@ bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
BlockFrequencyInfo &BFI =
getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return PGOMemOPSizeOptImpl(F, BFI, ORE);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
}
namespace llvm {
@@ -421,11 +441,13 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
FunctionAnalysisManager &FAM) {
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
if (!Changed)
return PreservedAnalyses::all();
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
} // namespace llvm
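The dominator-tree plumbing follows the usual opportunistic pattern: the tree is used only if an earlier pass already computed it, is kept consistent eagerly while blocks are split, and is then declared preserved so downstream passes need not recompute it. In outline:

    auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);       // may be null
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);  // no-op when DT is null
    // ... split blocks, queue DominatorTree::Insert updates, DTU.applyUpdates(...) ...
    PA.preserve<DominatorTreeAnalysis>();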
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index d950e2e730f2..a4dd48c8dd6a 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -35,7 +35,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -243,6 +242,7 @@ private:
GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
GlobalVariable *FunctionPCsArray; // for pc-table.
SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
SanitizerCoverageOptions Options;
};
@@ -405,6 +405,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
// so we need to prevent them from being dead stripped.
if (TargetTriple.isOSBinFormatMachO())
appendToUsed(M, GlobalsToAppendToUsed);
+ appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
return true;
}
@@ -480,6 +481,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
if (F.getName() == "__local_stdio_printf_options" ||
F.getName() == "__local_stdio_scanf_options")
return false;
+ if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
+ return false;
// Don't instrument functions using SEH for now. Splitting basic blocks like
// we do for coverage breaks WinEHPrepare.
// FIXME: Remove this when SEH no longer uses landingpad pattern matching.
@@ -592,11 +595,15 @@ void SanitizerCoverageModule::CreateFunctionLocalArrays(
if (Options.Inline8bitCounters) {
Function8bitCounterArray = CreateFunctionLocalArrayInSection(
AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
- GlobalsToAppendToUsed.push_back(Function8bitCounterArray);
+ GlobalsToAppendToCompilerUsed.push_back(Function8bitCounterArray);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ Function8bitCounterArray->addMetadata(LLVMContext::MD_associated, *MD);
}
if (Options.PCTable) {
FunctionPCsArray = CreatePCArray(F, AllBlocks);
- GlobalsToAppendToUsed.push_back(FunctionPCsArray);
+ GlobalsToAppendToCompilerUsed.push_back(FunctionPCsArray);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ FunctionPCsArray->addMetadata(LLVMContext::MD_associated, *MD);
}
}
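Two related changes here: the counter and PC arrays move from llvm.used to llvm.compiler.used, which protects them from IR-level dead-code elimination while still letting the linker drop them, and each array gains !associated metadata pointing at its function. On ELF the metadata becomes an SHF_LINK_ORDER dependency, so --gc-sections discards a function's coverage arrays exactly when it discards the function:

    // Tie the array's lifetime to F: dropped if and only if F is dropped.
    MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
    Function8bitCounterArray->addMetadata(LLVMContext::MD_associated, *MD);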
@@ -659,11 +666,11 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
Initializers.push_back(C);
}
- std::sort(Initializers.begin() + 2, Initializers.end(),
- [](const Constant *A, const Constant *B) {
- return cast<ConstantInt>(A)->getLimitedValue() <
- cast<ConstantInt>(B)->getLimitedValue();
- });
+ llvm::sort(Initializers.begin() + 2, Initializers.end(),
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
GlobalVariable *GV = new GlobalVariable(
*CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
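llvm::sort is a drop-in wrapper around std::sort; when LLVM is built with expensive checks it first shuffles the range, which flushes out comparators that are not strict weak orderings and code that silently depends on the order of equal elements. The call signature is unchanged:

    SmallVector<int, 8> V = {3, 1, 2};
    llvm::sort(V.begin(), V.end());  // same contract as std::sort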
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index ec6904486e10..fa1e5a157a0f 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -19,13 +19,14 @@
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -44,7 +45,6 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -339,7 +339,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
void ThreadSanitizer::chooseInstructionsToInstrument(
SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
const DataLayout &DL) {
- SmallSet<Value*, 8> WriteTargets;
+ SmallPtrSet<Value*, 8> WriteTargets;
// Iterate from the end.
for (Instruction *I : reverse(Local)) {
if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
@@ -502,7 +502,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
if (Idx < 0)
return false;
if (IsWrite && isVtableAccess(I)) {
- DEBUG(dbgs() << " VPTR : " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " VPTR : " << *I << "\n");
Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
// StoredValue may be a vector type if we are storing several vptrs at once.
// In this case, just take the first element of the vector since this is
diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt
index 95482ad20225..f061c6d9285e 100644
--- a/lib/Transforms/LLVMBuild.txt
+++ b/lib/Transforms/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC
+subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC
[component_0]
type = Group
diff --git a/lib/Transforms/ObjCARC/BlotMapVector.h b/lib/Transforms/ObjCARC/BlotMapVector.h
index 5518b49c4095..9ade14c1177a 100644
--- a/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -18,7 +18,7 @@
namespace llvm {
-/// \brief An associative container with fast insertion-order (deterministic)
+/// An associative container with fast insertion-order (deterministic)
/// iteration over its elements. Plus the special blot operation.
template <class KeyT, class ValueT> class BlotMapVector {
/// Map keys to indices in Vector.
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 8cc1232b18ca..0f13b02c806f 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -38,7 +38,7 @@ namespace objcarc {
class ProvenanceAnalysis;
/// \enum DependenceKind
-/// \brief Defines different dependence kinds among various ARC constructs.
+/// Defines different dependence kinds among various ARC constructs.
///
/// There are several kinds of dependence-like concepts in use here.
///
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index cd9b3d96a14f..1dbe72c7569f 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -28,13 +28,13 @@
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/Passes.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/ObjCARC.h"
-#include "llvm/Transforms/Utils/Local.h"
namespace llvm {
class raw_ostream;
@@ -43,7 +43,7 @@ class raw_ostream;
namespace llvm {
namespace objcarc {
-/// \brief Erase the given instruction.
+/// Erase the given instruction.
///
/// Many ObjC calls return their argument verbatim,
/// so if it's such a call and the return value has users, replace them with the
@@ -82,6 +82,26 @@ static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
return dyn_cast<InvokeInst>(Opnd);
}
+/// Collect, in PHIList, the PHI nodes that are equivalent to PN.
+template<class PHINodeTy, class VectorTy>
+void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
+ auto *BB = PN.getParent();
+ for (auto &P : BB->phis()) {
+ if (&P == &PN) // Do not add PN to the list.
+ continue;
+ unsigned I = 0, E = PN.getNumIncomingValues();
+ for (; I < E; ++I) {
+ auto *BB = PN.getIncomingBlock(I);
+ auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
+ auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
+ if (PNOpnd != POpnd)
+ break;
+ }
+ if (I == E)
+ PHIList.push_back(&P);
+ }
+}
+
} // end namespace objcarc
} // end namespace llvm
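getEquivalentPHIs treats two PHIs in the same block as equivalent when every per-edge incoming value matches after stripPointerCasts. The intended use, mirroring the caller added to ObjCARCContract later in this diff, is to apply whatever rewrite is performed for one PHI to all of its duplicates:

    SmallVector<Value *, 1> PHIList;
    getEquivalentPHIs(*PN, PHIList);      // PN itself is excluded
    for (Value *PHI : PHIList)
      ReplaceArgUses(PHI);                // same replacement for each equivalent PHI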
diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index b2c62a0e8eeb..8d3ef8fde534 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -36,7 +36,7 @@ using namespace llvm::objcarc;
#define DEBUG_TYPE "objc-arc-ap-elim"
namespace {
- /// \brief Autorelease pool elimination.
+ /// Autorelease pool elimination.
class ObjCARCAPElim : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
@@ -103,10 +103,12 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
// zap the pair.
if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
Changed = true;
- DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: " << *Inst << "\n"
- << " Push: " << *Push << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: "
+ << *Inst << "\n"
+ << " Push: " << *Push
+ << "\n");
Inst->eraseFromParent();
Push->eraseFromParent();
}
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index c4e61218f3f3..1f1ea9f58739 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -31,6 +31,7 @@
#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Operator.h"
@@ -50,7 +51,7 @@ STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
//===----------------------------------------------------------------------===//
namespace {
- /// \brief Late ARC optimizations
+ /// Late ARC optimizations
///
/// These change the IR in a way that makes it difficult to be analyzed by
/// ObjCARCOpt, so it's run late.
@@ -74,11 +75,12 @@ namespace {
SmallPtrSet<CallInst *, 8> StoreStrongCalls;
/// Returns true if we eliminated Inst.
- bool tryToPeepholeInstruction(Function &F, Instruction *Inst,
- inst_iterator &Iter,
- SmallPtrSetImpl<Instruction *> &DepInsts,
- SmallPtrSetImpl<const BasicBlock *> &Visited,
- bool &TailOkForStoreStrong);
+ bool tryToPeepholeInstruction(
+ Function &F, Instruction *Inst, inst_iterator &Iter,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ bool &TailOkForStoreStrong,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors);
bool optimizeRetainCall(Function &F, Instruction *Retain);
@@ -88,8 +90,9 @@ namespace {
SmallPtrSetImpl<Instruction *> &DependingInstructions,
SmallPtrSetImpl<const BasicBlock *> &Visited);
- void tryToContractReleaseIntoStoreStrong(Instruction *Release,
- inst_iterator &Iter);
+ void tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors);
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool doInitialization(Module &M) override;
@@ -129,16 +132,18 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_retain => "
- "objc_retainAutoreleasedReturnValue since the operand is a "
- "return value.\nOld: "<< *Retain << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_retain => "
+ "objc_retainAutoreleasedReturnValue since the operand is a "
+ "return value.\nOld: "
+ << *Retain << "\n");
// We do not have to worry about tail calls or the nounwind attribute,
// since retain/retainRV have the same properties.
Constant *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
cast<CallInst>(Retain)->setCalledFunction(Decl);
- DEBUG(dbgs() << "New: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
return true;
}
@@ -177,16 +182,19 @@ bool ObjCARCContract::contractAutorelease(
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << " Fusing retain/autorelease!\n"
- " Autorelease:" << *Autorelease << "\n"
- " Retain: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
+ " Autorelease:"
+ << *Autorelease
+ << "\n"
+ " Retain: "
+ << *Retain << "\n");
Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
: ARCRuntimeEntryPointKind::RetainAutorelease);
Retain->setCalledFunction(Decl);
- DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
EraseInstruction(Autorelease);
return true;
@@ -303,6 +311,24 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
return Retain;
}
+/// Create a call instruction with the correct funclet token. Should be used
+/// instead of calling CallInst::Create directly.
+static CallInst *
+createCallInst(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr,
+ Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(Func, Args, OpBundles, NameStr, InsertBefore);
+}
+
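createCallInst exists because any call emitted inside a WinEH funclet must carry a "funclet" operand bundle naming the enclosing EH pad; without it, later EH preparation can attribute the call to the wrong funclet. The helper localizes that rule, which is easy to forget when calling CallInst::Create directly:

    SmallVector<OperandBundleDef, 1> OpBundles;
    if (EHPad->isEHPad())                       // first non-PHI of the block's color
      OpBundles.emplace_back("funclet", EHPad);
    CallInst::Create(Func, Args, OpBundles, NameStr, InsertBefore);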
/// Attempt to merge an objc_release with a store, load, and objc_retain to form
/// an objc_storeStrong. An objc_storeStrong:
///
@@ -330,8 +356,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
/// (4).
/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
/// safe.
-void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
- inst_iterator &Iter) {
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
// See if we are releasing something that we just loaded.
auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
if (!Load || !Load->isSimple())
@@ -365,7 +392,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
Changed = true;
++NumStoreStrongs;
- DEBUG(
+ LLVM_DEBUG(
llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
<< " Old:\n"
<< " Store: " << *Store << "\n"
@@ -383,7 +410,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
if (Args[1]->getType() != I8X)
Args[1] = new BitCastInst(Args[1], I8X, "", Store);
Constant *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
- CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store);
+ CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
StoreStrong->setDoesNotThrow();
StoreStrong->setDebugLoc(Store->getDebugLoc());
@@ -392,7 +419,8 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
// we can set the tail flag once we know it's safe.
StoreStrongCalls.insert(StoreStrong);
- DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n");
+ LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
+ << "\n");
if (&*Iter == Retain) ++Iter;
if (&*Iter == Store) ++Iter;
@@ -407,7 +435,8 @@ bool ObjCARCContract::tryToPeepholeInstruction(
Function &F, Instruction *Inst, inst_iterator &Iter,
SmallPtrSetImpl<Instruction *> &DependingInsts,
SmallPtrSetImpl<const BasicBlock *> &Visited,
- bool &TailOkForStoreStrongs) {
+ bool &TailOkForStoreStrongs,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
// Only these library routines return their argument. In particular,
// objc_retainBlock does not necessarily return its argument.
ARCInstKind Class = GetBasicARCInstKind(Inst);
@@ -449,15 +478,16 @@ bool ObjCARCContract::tryToPeepholeInstruction(
} while (IsNoopInstruction(&*BBI));
if (&*BBI == GetArgRCIdentityRoot(Inst)) {
- DEBUG(dbgs() << "Adding inline asm marker for the return value "
- "optimization.\n");
+ LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
Changed = true;
InlineAsm *IA = InlineAsm::get(
FunctionType::get(Type::getVoidTy(Inst->getContext()),
/*isVarArg=*/false),
RVInstMarker->getString(),
/*Constraints=*/"", /*hasSideEffects=*/true);
- CallInst::Create(IA, "", Inst);
+
+ createCallInst(IA, None, "", Inst, BlockColors);
}
decline_rv_optimization:
return false;
@@ -471,8 +501,8 @@ bool ObjCARCContract::tryToPeepholeInstruction(
Changed = true;
new StoreInst(Null, CI->getArgOperand(0), CI);
- DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
- << " New = " << *Null << "\n");
+ LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
CI->replaceAllUsesWith(Null);
CI->eraseFromParent();
@@ -482,7 +512,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
case ARCInstKind::Release:
// Try to form an objc store strong from our release. If we fail, there is
// nothing further to do below, so continue.
- tryToContractReleaseIntoStoreStrong(Inst, Iter);
+ tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
return true;
case ARCInstKind::User:
// Be conservative if the function has any alloca instructions.
@@ -518,7 +548,12 @@ bool ObjCARCContract::runOnFunction(Function &F) {
PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
- DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
// Track whether it's ok to mark objc_storeStrong calls with the "tail"
// keyword. Be conservative if the function has variadic arguments.
@@ -536,12 +571,12 @@ bool ObjCARCContract::runOnFunction(Function &F) {
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
Instruction *Inst = &*I++;
- DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
// First try to peephole Inst. If there is nothing further we can do in
// terms of undoing objc-arc-expand, process the next inst.
if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited,
- TailOkForStoreStrongs))
+ TailOkForStoreStrongs, BlockColors))
continue;
// Otherwise, try to undo objc-arc-expand.
@@ -568,35 +603,48 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// trivially dominate itself, which would lead us to rewriting its
// argument in terms of its return value, which would lead to
// infinite loops in GetArgRCIdentityRoot.
- if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
- Changed = true;
- Instruction *Replacement = Inst;
- Type *UseTy = U.get()->getType();
- if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
- // For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *BB = PHI->getIncomingBlock(ValNo);
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- &BB->back());
- // While we're here, rewrite all edges for this PHI, rather
- // than just one use at a time, to minimize the number of
- // bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingBlock(i) == BB) {
- // Keep the UI iterator valid.
- if (UI != UE &&
- &PHI->getOperandUse(
- PHINode::getOperandNumForIncomingValue(i)) == &*UI)
- ++UI;
- PHI->setIncomingValue(i, Replacement);
- }
- } else {
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- cast<Instruction>(U.getUser()));
- U.set(Replacement);
+ if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
+ continue;
+
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy) {
+ // A catchswitch is both a pad and a terminator, meaning a basic
+ // block with a catchswitch has no insertion point. Keep going up
+ // the dominator tree until we find a non-catchswitch.
+ BasicBlock *InsertBB = IncomingBB;
+ while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
+ InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
+ }
+
+ assert(DT->dominates(Inst, &InsertBB->back()) &&
+ "Invalid insertion point for bitcast");
+ Replacement =
+ new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
}
+
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ // Keep the UI iterator valid.
+ if (UI != UE &&
+ &PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) == &*UI)
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
}
}
};
@@ -618,8 +666,17 @@ bool ObjCARCContract::runOnFunction(Function &F) {
else if (isa<GlobalAlias>(Arg) &&
!cast<GlobalAlias>(Arg)->isInterposable())
Arg = cast<GlobalAlias>(Arg)->getAliasee();
- else
+ else {
+ // If Arg is a PHI node, get PHIs that are equivalent to it and replace
+ // their uses.
+ if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
+ SmallVector<Value *, 1> PHIList;
+ getEquivalentPHIs(*PN, PHIList);
+ for (Value *PHI : PHIList)
+ ReplaceArgUses(PHI);
+ }
break;
+ }
}
// Replace bitcast users of Arg that are dominated by Inst.
diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index bb6a0a0e73db..6a345ef56e1b 100644
--- a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -47,7 +47,7 @@ using namespace llvm;
using namespace llvm::objcarc;
namespace {
- /// \brief Early ARC transformations.
+ /// Early ARC transformations.
class ObjCARCExpand : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool doInitialization(Module &M) override;
@@ -91,12 +91,13 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
bool Changed = false;
- DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
+ << "\n");
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
Instruction *Inst = &*I;
- DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
switch (GetBasicARCInstKind(Inst)) {
case ARCInstKind::Retain:
@@ -111,8 +112,10 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
// emitted here. We'll redo them in the contract pass.
Changed = true;
Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
- DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n"
- " New = " << *Value << "\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
+ << "\n"
+ " New = "
+ << *Value << "\n");
Inst->replaceAllUsesWith(Value);
break;
}
@@ -121,7 +124,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
}
}
- DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
return Changed;
}
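The DEBUG -> LLVM_DEBUG renames that dominate this diff track the macro's
rename in llvm/Support/Debug.h; the behavior is unchanged. For reference, a
minimal usage sketch (output appears only in asserts builds run with -debug
or -debug-only=<DEBUG_TYPE>; the pass name below is illustrative):

    #include "llvm/IR/Function.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    #define DEBUG_TYPE "objc-arc-expand"

    static void traceVisit(const llvm::Function &F) {
      // Compiled out entirely in release builds without LLVM_ENABLE_DUMP.
      LLVM_DEBUG(llvm::dbgs() << "Visiting: " << F.getName() << "\n");
    }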
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 99ed6863c22e..21e2848030fc 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -38,6 +38,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
@@ -76,7 +77,7 @@ using namespace llvm::objcarc;
/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
/// @{
-/// \brief This is similar to GetRCIdentityRoot but it stops as soon
+/// This is similar to GetRCIdentityRoot but it stops as soon
/// as it finds a value with multiple uses.
static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
// ConstantData (like ConstantPointerNull and UndefValue) is used across
@@ -174,7 +175,7 @@ STATISTIC(NumReleasesAfterOpt,
namespace {
- /// \brief Per-BasicBlock state.
+ /// Per-BasicBlock state.
class BBState {
/// The number of unique control paths from the entry which can reach this
/// block.
@@ -422,7 +423,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
// Dump the pointers we are tracking.
OS << " TopDown State:\n";
if (!BBInfo.hasTopDownPtrs()) {
- DEBUG(dbgs() << " NONE!\n");
+ LLVM_DEBUG(dbgs() << " NONE!\n");
} else {
for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
I != E; ++I) {
@@ -442,7 +443,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
OS << " BottomUp State:\n";
if (!BBInfo.hasBottomUpPtrs()) {
- DEBUG(dbgs() << " NONE!\n");
+ LLVM_DEBUG(dbgs() << " NONE!\n");
} else {
for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
I != E; ++I) {
@@ -465,7 +466,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
namespace {
- /// \brief The main ARC optimization pass.
+ /// The main ARC optimization pass.
class ObjCARCOpt : public FunctionPass {
bool Changed;
ProvenanceAnalysis PA;
@@ -612,8 +613,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
- << "Erasing " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
+ << "Erasing " << *RetainRV << "\n");
EraseInstruction(&*I);
EraseInstruction(RetainRV);
@@ -625,14 +626,15 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
- "objc_retain since the operand is not a return value.\n"
- "Old = " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ "Old = "
+ << *RetainRV << "\n");
Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
- DEBUG(dbgs() << "New = " << *RetainRV << "\n");
+ LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
return false;
}
@@ -652,6 +654,11 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
SmallVector<const Value *, 2> Users;
Users.push_back(Ptr);
+
+ // Add PHIs that are equivalent to Ptr to Users.
+ if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
+ getEquivalentPHIs(*PN, Users);
+
do {
Ptr = Users.pop_back_val();
for (const User *U : Ptr->users()) {
@@ -665,10 +672,12 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
Changed = true;
++NumPeeps;
- DEBUG(dbgs() << "Transforming objc_autoreleaseReturnValue => "
- "objc_autorelease since its operand is not used as a return "
- "value.\n"
- "Old = " << *AutoreleaseRV << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ "Old = "
+ << *AutoreleaseRV << "\n");
CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
@@ -676,23 +685,53 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
Class = ARCInstKind::Autorelease;
- DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+ LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+}
+
+namespace {
+Instruction *
+CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
+ auto Bundle = CI.getOperandBundleAt(I);
+ // Funclets will be reassociated in the future.
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&BB)->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(&CI, OpBundles);
+}
}
/// Visit each call, one at a time, and make simplifications without doing any
/// additional analysis.
void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
// Reset all the flags in preparation for recomputing them.
UsedInThisFunction = 0;
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
// Visit all objc_* calls in F.
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
Instruction *Inst = &*I++;
ARCInstKind Class = GetBasicARCInstKind(Inst);
- DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
switch (Class) {
default: break;
@@ -708,7 +747,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
case ARCInstKind::NoopCast:
Changed = true;
++NumNoops;
- DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
EraseInstruction(Inst);
continue;
@@ -726,8 +765,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
Constant::getNullValue(Ty),
CI);
Value *NewValue = UndefValue::get(CI->getType());
- DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
CI->replaceAllUsesWith(NewValue);
CI->eraseFromParent();
continue;
@@ -746,8 +787,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
CI);
Value *NewValue = UndefValue::get(CI->getType());
- DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
CI->replaceAllUsesWith(NewValue);
CI->eraseFromParent();
@@ -782,9 +825,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
MDNode::get(C, None));
- DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
- "since x is otherwise unused.\nOld: " << *Call << "\nNew: "
- << *NewCall << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+ "since x is otherwise unused.\nOld: "
+ << *Call << "\nNew: " << *NewCall << "\n");
EraseInstruction(Call);
Inst = NewCall;
@@ -796,8 +840,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// a tail keyword.
if (IsAlwaysTail(Class)) {
Changed = true;
- DEBUG(dbgs() << "Adding tail keyword to function since it can never be "
- "passed stack args: " << *Inst << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Adding tail keyword to function since it can never be "
+ "passed stack args: "
+ << *Inst << "\n");
cast<CallInst>(Inst)->setTailCall();
}
@@ -805,16 +851,16 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// semantics of ARC truly do not do so.
if (IsNeverTail(Class)) {
Changed = true;
- DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst <<
- "\n");
+ LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
+ << "\n");
cast<CallInst>(Inst)->setTailCall(false);
}
// Set nounwind as needed.
if (IsNoThrow(Class)) {
Changed = true;
- DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: "
+ << *Inst << "\n");
cast<CallInst>(Inst)->setDoesNotThrow();
}
@@ -829,8 +875,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
if (IsNullOrUndef(Arg)) {
Changed = true;
++NumNoops;
- DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
- << "\n");
+ LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
+ << "\n");
EraseInstruction(Inst);
continue;
}
@@ -922,22 +968,24 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
Value *Incoming =
GetRCIdentityRoot(PN->getIncomingValue(i));
if (!IsNullOrUndef(Incoming)) {
- CallInst *Clone = cast<CallInst>(CInst->clone());
Value *Op = PN->getIncomingValue(i);
Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ CallInst *Clone = cast<CallInst>(CloneCallInstForBB(
+ *CInst, *InsertPos->getParent(), BlockColors));
if (Op->getType() != ParamTy)
Op = new BitCastInst(Op, ParamTy, "", InsertPos);
Clone->setArgOperand(0, Op);
Clone->insertBefore(InsertPos);
- DEBUG(dbgs() << "Cloning "
- << *CInst << "\n"
- "And inserting clone at " << *InsertPos << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning " << *CInst
+ << "\n"
+ "And inserting clone at "
+ << *InsertPos << "\n");
Worklist.push_back(std::make_pair(Clone, Incoming));
}
}
// Erase the original call.
- DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
EraseInstruction(CInst);
continue;
}
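This hunk is the consumer of the CloneCallInstForBB helper introduced
earlier: under a scoped-EH (funclet-based) personality, a call cloned into a
predecessor block must carry the "funclet" operand bundle of that block's EH
color, or the verifier rejects the IR. A hedged sketch of how the pieces fit
together, with OrigCall and PredBB standing in for the patch's CInst and
incoming block:

    // Illustrative sketch, assuming llvm/Analysis/EHPersonalities.h.
    static CallInst *cloneIntoPred(Function &F, CallInst *OrigCall,
                                   BasicBlock *PredBB) {
      DenseMap<BasicBlock *, ColorVector> BlockColors;
      if (F.hasPersonalityFn() &&
          isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
        BlockColors = colorEHFunclets(F);
      // CloneCallInstForBB re-bundles "funclet" for PredBB's EH color.
      CallInst *Clone =
          cast<CallInst>(CloneCallInstForBB(*OrigCall, *PredBB, BlockColors));
      Clone->insertBefore(PredBB->getTerminator());
      return Clone;
    }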
@@ -1114,7 +1162,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
- DEBUG(dbgs() << " Class: " << Class << "\n");
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
switch (Class) {
case ARCInstKind::Release: {
@@ -1137,7 +1185,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
// Don't do retain+release tracking for ARCInstKind::RetainRV, because
// it's better to let it remain as the first instruction after a call.
if (Class != ARCInstKind::RetainRV) {
- DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
Retains[Inst] = S.GetRRInfo();
}
S.ClearSequenceProgress();
@@ -1179,7 +1227,7 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
bool NestingDetected = false;
BBState &MyStates = BBStates[BB];
@@ -1202,8 +1250,9 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
}
}
- DEBUG(dbgs() << "Before:\n" << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, bottom-up.
for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
@@ -1213,7 +1262,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
if (isa<InvokeInst>(Inst))
continue;
- DEBUG(dbgs() << " Visiting " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
}
@@ -1228,7 +1277,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
}
- DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
+ LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -1241,7 +1290,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
- DEBUG(dbgs() << " Class: " << Class << "\n");
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
switch (Class) {
case ARCInstKind::RetainBlock:
@@ -1267,7 +1316,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
if (S.MatchWithRelease(MDKindCache, Inst)) {
// If we succeed, copy S's RRInfo into the Release -> {Retain Set
// Map}. Then we clear S.
- DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
Releases[Inst] = S.GetRRInfo();
S.ClearSequenceProgress();
}
@@ -1307,7 +1356,7 @@ bool
ObjCARCOpt::VisitTopDown(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
DenseMap<Value *, RRInfo> &Releases) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
bool NestingDetected = false;
BBState &MyStates = BBStates[BB];
@@ -1329,20 +1378,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
}
}
- DEBUG(dbgs() << "Before:\n" << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, top-down.
for (Instruction &Inst : *BB) {
- DEBUG(dbgs() << " Visiting " << Inst << "\n");
+ LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
}
- DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
- << BBStates[BB] << "\n\n");
+ LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
CheckForCFGHazards(BB, BBStates, MyStates);
- DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
+ LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -1465,7 +1515,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
Type *ArgTy = Arg->getType();
Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
- DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+ LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
// Insert the new retain and release calls.
for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
@@ -1476,8 +1526,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
Call->setDoesNotThrow();
Call->setTailCall();
- DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n"
- "At insertion point: " << *InsertPt << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
}
for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
Value *MyArg = ArgTy == ParamTy ? Arg :
@@ -1491,20 +1543,22 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
if (ReleasesToMove.IsTailCallRelease)
Call->setTailCall();
- DEBUG(dbgs() << "Inserting new Release: " << *Call << "\n"
- "At insertion point: " << *InsertPt << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
}
// Delete the original retain and release calls.
for (Instruction *OrigRetain : RetainsToMove.Calls) {
Retains.blot(OrigRetain);
DeadInsts.push_back(OrigRetain);
- DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
}
for (Instruction *OrigRelease : ReleasesToMove.Calls) {
Releases.erase(OrigRelease);
DeadInsts.push_back(OrigRelease);
- DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
}
}
@@ -1538,6 +1592,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
KnownSafeTD &= NewRetainRRI.KnownSafe;
+ CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
auto Jt = Releases.find(NewRetainRelease);
if (Jt == Releases.end())
@@ -1710,7 +1765,7 @@ bool ObjCARCOpt::PerformCodePlacement(
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
bool AnyPairsCompletelyEliminated = false;
SmallVector<Instruction *, 8> DeadInsts;
@@ -1724,7 +1779,7 @@ bool ObjCARCOpt::PerformCodePlacement(
Instruction *Retain = cast<Instruction>(V);
- DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
Value *Arg = GetArgRCIdentityRoot(Retain);
@@ -1769,7 +1824,7 @@ bool ObjCARCOpt::PerformCodePlacement(
/// Weak pointer optimizations.
void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
// First, do memdep-style RLE and S2L optimizations. We can't use memdep
// itself because it uses AliasAnalysis and we need to do provenance
@@ -1777,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
Instruction *Inst = &*I++;
- DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
ARCInstKind Class = GetBasicARCInstKind(Inst);
if (Class != ARCInstKind::LoadWeak &&
@@ -2036,7 +2091,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!F.getReturnType()->isPointerTy())
return;
- DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
SmallPtrSet<Instruction *, 4> DependingInstructions;
SmallPtrSet<const BasicBlock *, 4> Visited;
@@ -2045,7 +2100,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!Ret)
continue;
- DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
@@ -2083,8 +2138,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
// If so, we can zap the retain and autorelease.
Changed = true;
++NumRets;
- DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: "
- << *Autorelease << "\n");
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
+ << "\n");
EraseInstruction(Retain);
EraseInstruction(Autorelease);
}
@@ -2144,8 +2199,9 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
Changed = false;
- DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>"
- "\n");
+ LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
+ << " >>>"
+ "\n");
PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
@@ -2193,7 +2249,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
}
#endif
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
return Changed;
}
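Several hunks above call getEquivalentPHIs(...), whose definition is outside
this diff. The property it exploits: PHIs in the same block whose incoming
values agree on every edge produce the same pointer, so uses of one can be
rewritten through another. An illustrative sketch of that idea (not the
in-tree helper's actual body):

    // Collect PHIs in PN's block that agree with PN on every incoming edge.
    static void collectEquivalentPHIs(PHINode &PN,
                                      SmallVectorImpl<Value *> &PHIList) {
      for (PHINode &P : PN.getParent()->phis()) {
        if (&P == &PN)
          continue;
        bool SameValues = true;
        for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I)
          if (P.getIncomingValueForBlock(PN.getIncomingBlock(I)) !=
              PN.getIncomingValue(I)) {
            SameValues = false;
            break;
          }
        if (SameValues)
          PHIList.push_back(&P);
      }
    }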
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index f89fc8eb62aa..3004fffb9745 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -115,14 +115,6 @@ static bool IsStoredObjCPointer(const Value *P) {
bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
const DataLayout &DL) {
- // Skip past provenance pass-throughs.
- A = GetUnderlyingObjCPtr(A, DL);
- B = GetUnderlyingObjCPtr(B, DL);
-
- // Quick check.
- if (A == B)
- return true;
-
// Ask regular AliasAnalysis, for a first approximation.
switch (AA->alias(A, B)) {
case NoAlias:
@@ -171,6 +163,13 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
bool ProvenanceAnalysis::related(const Value *A, const Value *B,
const DataLayout &DL) {
+ A = GetUnderlyingObjCPtrCached(A, DL, UnderlyingObjCPtrCache);
+ B = GetUnderlyingObjCPtrCached(B, DL, UnderlyingObjCPtrCache);
+
+ // Quick check.
+ if (A == B)
+ return true;
+
// Begin by inserting a conservative value into the map. If the insertion
// fails, we have the answer already. If it succeeds, leave it there until we
// compute the real answer to guard against recursive queries.
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 5e676167a6a1..1276f564a022 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -28,6 +28,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/ValueHandle.h"
#include <utility>
namespace llvm {
@@ -39,7 +40,7 @@ class Value;
namespace objcarc {
-/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same
+/// This is similar to BasicAliasAnalysis, and it uses many of the same
/// techniques, except it uses special ObjC-specific reasoning about pointer
/// relationships.
///
@@ -56,6 +57,8 @@ class ProvenanceAnalysis {
CachedResultsTy CachedResults;
+ DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
+
bool relatedCheck(const Value *A, const Value *B, const DataLayout &DL);
bool relatedSelect(const SelectInst *A, const Value *B);
bool relatedPHI(const PHINode *A, const Value *B);
@@ -73,6 +76,7 @@ public:
void clear() {
CachedResults.clear();
+ UnderlyingObjCPtrCache.clear();
}
};
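Taken together, the two ProvenanceAnalysis hunks move the expensive
GetUnderlyingObjCPtr stripping out of relatedCheck into related and memoize
it. Cached values are held as WeakTrackingVH so that an entry nulls out,
rather than dangles, when the instruction it names is deleted; clear() drops
the cache alongside CachedResults. A hedged sketch of the memoization shape
(the patch's actual entry point is GetUnderlyingObjCPtrCached):

    // Sketch: memoize a Value -> underlying-Value computation.
    static const Value *
    getUnderlyingCached(const Value *V, const DataLayout &DL,
                        DenseMap<const Value *, WeakTrackingVH> &Cache) {
      auto It = Cache.find(V);
      if (It != Cache.end() && It->second) // a null VH means the IR died
        return It->second;
      const Value *Root = GetUnderlyingObjCPtr(V, DL);
      Cache[V] = const_cast<Value *>(Root);
      return Root;
    }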
diff --git a/lib/Transforms/ObjCARC/PtrState.cpp b/lib/Transforms/ObjCARC/PtrState.cpp
index e1774b88fd35..8a7b6a74fae2 100644
--- a/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/lib/Transforms/ObjCARC/PtrState.cpp
@@ -126,22 +126,23 @@ bool RRInfo::Merge(const RRInfo &Other) {
//===----------------------------------------------------------------------===//
void PtrState::SetKnownPositiveRefCount() {
- DEBUG(dbgs() << " Setting Known Positive.\n");
+ LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
KnownPositiveRefCount = true;
}
void PtrState::ClearKnownPositiveRefCount() {
- DEBUG(dbgs() << " Clearing Known Positive.\n");
+ LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
KnownPositiveRefCount = false;
}
void PtrState::SetSeq(Sequence NewSeq) {
- DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq << "\n");
+ LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
+ << "\n");
Seq = NewSeq;
}
void PtrState::ResetSequenceProgress(Sequence NewSeq) {
- DEBUG(dbgs() << " Resetting sequence progress.\n");
+ LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
SetSeq(NewSeq);
Partial = false;
RRI.clear();
@@ -184,7 +185,8 @@ bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
// simple and avoids adding overhead for the non-nested case.
bool NestingDetected = false;
if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
- DEBUG(dbgs() << " Found nested releases (i.e. a release pair)\n");
+ LLVM_DEBUG(
+ dbgs() << " Found nested releases (i.e. a release pair)\n");
NestingDetected = true;
}
@@ -234,8 +236,8 @@ bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
if (!CanAlterRefCount(Inst, Ptr, PA, Class))
return false;
- DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
+ << *Ptr << "\n");
switch (S) {
case S_Use:
SetSeq(S_CanRelease);
@@ -266,6 +268,11 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
if (isa<InvokeInst>(Inst)) {
const auto IP = BB->getFirstInsertionPt();
InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
+ if (isa<CatchSwitchInst>(InsertAfter))
+ // A catchswitch must be the only non-phi instruction in its basic
+ // block, so attempting to insert an instruction into such a block would
+ // produce invalid IR.
+ SetCFGHazardAfflicted(true);
} else {
InsertAfter = std::next(Inst->getIterator());
}
@@ -277,26 +284,26 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
case S_Release:
case S_MovableRelease:
if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeqAndInsertReverseInsertPt(S_Use);
} else if (Seq == S_Release && IsUser(Class)) {
- DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
// Non-movable releases depend on any possible objc pointer use.
SetSeqAndInsertReverseInsertPt(S_Stop);
} else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
- DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeqAndInsertReverseInsertPt(S_Stop);
}
}
break;
case S_Stop:
if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
SetSeq(S_Use);
}
break;
@@ -377,8 +384,8 @@ bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
Class != ARCInstKind::IntrinsicUser)
return false;
- DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
ClearKnownPositiveRefCount();
switch (GetSeq()) {
case S_Retain:
@@ -410,8 +417,8 @@ void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
case S_CanRelease:
if (!CanUse(Inst, Ptr, PA, Class))
return;
- DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
- << "\n");
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
SetSeq(S_Use);
return;
case S_Retain:
diff --git a/lib/Transforms/ObjCARC/PtrState.h b/lib/Transforms/ObjCARC/PtrState.h
index e1e95afcf76b..f5b9b853d8e3 100644
--- a/lib/Transforms/ObjCARC/PtrState.h
+++ b/lib/Transforms/ObjCARC/PtrState.h
@@ -36,7 +36,7 @@ class ProvenanceAnalysis;
/// \enum Sequence
///
-/// \brief A sequence of states that a pointer may go through in which an
+/// A sequence of states that a pointer may go through in which an
/// objc_retain and objc_release are actually needed.
enum Sequence {
S_None,
@@ -51,7 +51,7 @@ enum Sequence {
raw_ostream &operator<<(raw_ostream &OS,
const Sequence S) LLVM_ATTRIBUTE_UNUSED;
-/// \brief Unidirectional information about either a
+/// Unidirectional information about either a
/// retain-decrement-use-release sequence or release-use-decrement-retain
/// reverse sequence.
struct RRInfo {
@@ -97,7 +97,7 @@ struct RRInfo {
bool Merge(const RRInfo &Other);
};
-/// \brief This class summarizes several per-pointer runtime properties which
+/// This class summarizes several per-pointer runtime properties which
/// are propagated through the flow graph.
class PtrState {
protected:
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 1e683db50206..ce09a477b5f5 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -174,8 +174,8 @@ class AggressiveDeadCodeElimination {
/// marked live.
void markLiveBranchesFromControlDependences();
- /// Remove instructions not marked live, return if any any instruction
- /// was removed.
+ /// Remove instructions not marked live, return if any instruction was
+ /// removed.
bool removeDeadInstructions();
/// Identify connected sections of the control flow graph which have
@@ -298,8 +298,8 @@ void AggressiveDeadCodeElimination::initialize() {
auto &Info = BlockInfo[BB];
// Real function return
if (isa<ReturnInst>(Info.Terminator)) {
- DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
- << '\n';);
+ LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
continue;
}
@@ -356,7 +356,7 @@ void AggressiveDeadCodeElimination::markLiveInstructions() {
// where we need to mark the inputs as live.
while (!Worklist.empty()) {
Instruction *LiveInst = Worklist.pop_back_val();
- DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+ LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
for (Use &OI : LiveInst->operands())
if (Instruction *Inst = dyn_cast<Instruction>(OI))
@@ -378,7 +378,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
if (Info.Live)
return;
- DEBUG(dbgs() << "mark live: "; I->dump());
+ LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
Info.Live = true;
Worklist.push_back(I);
@@ -402,7 +402,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
if (BBInfo.Live)
return;
- DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
BBInfo.Live = true;
if (!BBInfo.CFLive) {
BBInfo.CFLive = true;
@@ -463,7 +463,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
if (BlocksWithDeadTerminators.empty())
return;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "new live blocks:\n";
for (auto *BB : NewLiveBlocks)
dbgs() << "\t" << BB->getName() << '\n';
@@ -487,7 +487,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
// Dead terminators which control live blocks are now marked live.
for (auto *BB : IDFBlocks) {
- DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
markLive(BB->getTerminator());
}
}
@@ -501,7 +501,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// Updates control and dataflow around dead blocks
updateDeadRegions();
- DEBUG({
+ LLVM_DEBUG({
for (Instruction &I : instructions(F)) {
// Check if the instruction is alive.
if (isLive(&I))
@@ -555,7 +555,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// A dead region is the set of dead blocks with a common live post-dominator.
void AggressiveDeadCodeElimination::updateDeadRegions() {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "final dead terminator blocks: " << '\n';
for (auto *BB : BlocksWithDeadTerminators)
dbgs() << '\t' << BB->getName()
@@ -607,8 +607,9 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// It might have happened that the same successor appeared multiple times
// and the CFG edge wasn't really removed.
if (Succ != PreferredSucc->BB) {
- DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
- << BB->getName() << " -> " << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
+ << BB->getName() << " -> " << Succ->getName()
+ << "\n");
DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
}
}
@@ -652,7 +653,7 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
InstInfo[PredTerm].Live = true;
return;
}
- DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
NumBranchesRemoved += 1;
IRBuilder<> Builder(PredTerm);
auto *NewTerm = Builder.CreateBr(Target);
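Beyond the logging rename, note the queued {DominatorTree::Delete, BB, Succ}
records above: ADCE collects CFG edge removals and hands them to the
incremental dominator-tree updater as one batch instead of mutating the trees
per edge. A hedged sketch of applying such a batch, assuming the applyUpdates
API available on both tree types in this tree:

    // Sketch: flush queued edge deletions to both trees in one shot.
    static void flushDeletedEdges(
        DominatorTree &DT, PostDominatorTree &PDT,
        ArrayRef<DominatorTree::UpdateType> DeletedEdges) {
      DT.applyUpdates(DeletedEdges);
      PDT.applyUpdates(DeletedEdges);
    }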
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 99480f12da9e..fa7bcec677f7 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -98,8 +98,8 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
- DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
- *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+ LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+ << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
if (const SCEVConstant *ConstDUSCEV =
dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
@@ -139,12 +139,12 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
// address. This address is displaced by the provided offset.
DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
- DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
- *AlignSCEV << " and offset " << *OffSCEV <<
- " using diff " << *DiffSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+ << *AlignSCEV << " and offset " << *OffSCEV
+ << " using diff " << *DiffSCEV << "\n");
unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
if (NewAlignment) {
return NewAlignment;
@@ -160,8 +160,8 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
- DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
- *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+ << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
// Now compute the new alignment using the displacement to the value in the
// first iteration, and also the alignment using the per-iteration delta.
@@ -170,26 +170,26 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
- DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
if (!NewAlignment || !NewIncAlignment) {
return 0;
} else if (NewAlignment > NewIncAlignment) {
if (NewAlignment % NewIncAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewIncAlignment
+ << "\n");
return NewIncAlignment;
}
} else if (NewIncAlignment > NewAlignment) {
if (NewIncAlignment % NewAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
} else if (NewIncAlignment == NewAlignment) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
}
@@ -339,55 +339,24 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MI->getDest(), SE);
- // For memory transfers, we need a common alignment for both the
- // source and destination. If we have a new alignment for this
- // instruction, but only for one operand, save it. If we reach the
- // other operand through another assumption later, then we may
- // change the alignment at that point.
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << NewDestAlignment << "\n";);
+ if (NewDestAlignment > MI->getDestAlignment()) {
+ MI->setDestAlignment(NewDestAlignment);
+ ++NumMemIntAlignChanged;
+ }
+
+ // For memory transfers, there is also a source alignment that
+ // can be set.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MTI->getSource(), SE);
- DenseMap<MemTransferInst *, unsigned>::iterator DI =
- NewDestAlignments.find(MTI);
- unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
- 0 : DI->second;
-
- DenseMap<MemTransferInst *, unsigned>::iterator SI =
- NewSrcAlignments.find(MTI);
- unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
- 0 : SI->second;
-
- DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
- AltDestAlignment << " " << NewSrcAlignment <<
- " " << AltSrcAlignment << "\n");
-
- // Of these four alignments, pick the largest possible...
- unsigned NewAlignment = 0;
- if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, NewDestAlignment);
- if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, AltDestAlignment);
- if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, NewSrcAlignment);
- if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, AltSrcAlignment);
-
- if (NewAlignment > MI->getAlignment()) {
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewAlignment));
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << NewSrcAlignment << "\n";);
+
+ if (NewSrcAlignment > MTI->getSourceAlignment()) {
+ MTI->setSourceAlignment(NewSrcAlignment);
++NumMemIntAlignChanged;
}
-
- NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
- NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
- } else if (NewDestAlignment > MI->getAlignment()) {
- assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
- "Unknown memory intrinsic");
-
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewDestAlignment));
- ++NumMemIntAlignChanged;
}
}
@@ -421,9 +390,6 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
SE = SE_;
DT = DT_;
- NewDestAlignments.clear();
- NewSrcAlignments.clear();
-
bool Changed = false;
for (auto &AssumeVH : AC.assumptions())
if (AssumeVH)
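The deleted bookkeeping above existed because memory intrinsics used to carry
a single shared alignment, forcing memcpy/memmove to negotiate one value for
both operands. With separate source and destination alignments, each side can
be raised in place as soon as an assumption proves it. A minimal sketch of
the per-operand API the new code relies on:

    // Sketch: raise dest and (for transfers) source alignment independently.
    static void raiseAlignments(MemIntrinsic *MI, unsigned NewDestAlign,
                                unsigned NewSrcAlign) {
      if (NewDestAlign > MI->getDestAlignment())
        MI->setDestAlignment(NewDestAlign);
      if (auto *MTI = dyn_cast<MemTransferInst>(MI))
        if (NewSrcAlign > MTI->getSourceAlignment())
          MTI->setSourceAlignment(NewSrcAlign);
    }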
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 851efa000f65..3a8ef073cb48 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
@@ -99,7 +100,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// For live instructions that have all dead bits, first make them dead by
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
- DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
clearAssumptionsOfUsers(&I, DB);
@@ -114,6 +115,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
if (!DB.isInstructionDead(&I))
continue;
+ salvageDebugInfo(I);
Worklist.push_back(&I);
I.dropAllReferences();
Changed = true;
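The one functional change in BDCE here is the salvageDebugInfo(I) call (hence
the new Transforms/Utils/Local.h include): before a dead instruction's
operands are severed, its llvm.dbg.value users are rewritten in terms of
those operands where a DIExpression can describe them, instead of degrading
to undef. A hedged sketch of the deletion pattern:

    // Sketch: salvage debug users before dropping operands; the actual
    // erase happens later when the worklist is processed.
    static void scheduleDeadInstruction(
        Instruction &I, SmallVectorImpl<Instruction *> &Worklist) {
      salvageDebugInfo(I);
      Worklist.push_back(&I);
      I.dropAllReferences();
    }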
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 0562d3882f8b..fce37d4bffb8 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_library(LLVMScalarOpts
InductiveRangeCheckElimination.cpp
IndVarSimplify.cpp
InferAddressSpaces.cpp
+ InstSimplifyPass.cpp
JumpThreading.cpp
LICM.cpp
LoopAccessAnalysisPrinter.cpp
@@ -38,6 +39,7 @@ add_llvm_library(LLVMScalarOpts
LoopSimplifyCFG.cpp
LoopStrengthReduce.cpp
LoopUnrollPass.cpp
+ LoopUnrollAndJamPass.cpp
LoopUnswitch.cpp
LoopVersioningLICM.cpp
LowerAtomic.cpp
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 207243231aad..5ebfbf8a879b 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -59,12 +59,14 @@
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
using namespace PatternMatch;
@@ -73,9 +75,16 @@ using namespace PatternMatch;
STATISTIC(NumCallSiteSplit, "Number of call-site split");
-static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
- Value *Op) {
- CallSite CS(NewCallI);
+/// Only allow instructions before a call if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+ DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+ cl::desc("Only allow instructions before a call, if "
+ "their cost is below DuplicationThreshold"),
+ cl::init(5));
+
+static void addNonNullAttribute(CallSite CS, Value *Op) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
if (&*I == Op)
@@ -84,13 +93,16 @@ static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
}
}
-static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
- Value *Op, Constant *ConstValue) {
- CallSite CS(NewCallI);
+static void setConstantInArgument(CallSite CS, Value *Op,
+ Constant *ConstValue) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
- if (&*I == Op)
+ if (&*I == Op) {
+ // It is possible we have already added the non-null attribute to the
+ // parameter by using an earlier constraining condition.
+ CS.removeParamAttr(ArgNo, Attribute::NonNull);
CS.setArgument(ArgNo, ConstValue);
+ }
++ArgNo;
}
}
@@ -111,11 +123,13 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
return false;
}
+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
/// If From has a conditional jump to To, add the condition to Conditions,
/// if it is relevant to any argument at CS.
-static void
-recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
+ ConditionsTy &Conditions) {
auto *BI = dyn_cast<BranchInst>(From->getTerminator());
if (!BI || !BI->isConditional())
return;
@@ -134,40 +148,33 @@ recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
}
/// Record ICmp conditions relevant to any argument in CS following Pred's
-/// single successors. If there are conflicting conditions along a path, like
+/// single predecessors. If there are conflicting conditions along a path, like
/// x == 1 and x == 0, the first condition will be used.
-static void
-recordConditions(const CallSite &CS, BasicBlock *Pred,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordConditions(CallSite CS, BasicBlock *Pred,
+ ConditionsTy &Conditions) {
recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
BasicBlock *From = Pred;
BasicBlock *To = Pred;
- SmallPtrSet<BasicBlock *, 4> Visited = {From};
+ SmallPtrSet<BasicBlock *, 4> Visited;
while (!Visited.count(From->getSinglePredecessor()) &&
(From = From->getSinglePredecessor())) {
recordCondition(CS, From, To, Conditions);
+ Visited.insert(From);
To = From;
}
}
-static Instruction *
-addConditions(CallSite &CS,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
- if (Conditions.empty())
- return nullptr;
-
- Instruction *NewCI = CS.getInstruction()->clone();
+static void addConditions(CallSite CS, const ConditionsTy &Conditions) {
for (auto &Cond : Conditions) {
Value *Arg = Cond.first->getOperand(0);
Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+ setConstantInArgument(CS, Arg, ConstVal);
else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+ addNonNullAttribute(CS, Arg);
}
}
- return NewCI;
}
static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
@@ -176,38 +183,90 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
return Preds;
}
-static bool canSplitCallSite(CallSite CS) {
+static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
// FIXME: As of now we handle only CallInst. InvokeInst could be handled
// without too much effort.
Instruction *Instr = CS.getInstruction();
if (!isa<CallInst>(Instr))
return false;
- // Allow splitting a call-site only when there is no instruction before the
- // call-site in the basic block. Based on this constraint, we only clone the
- // call instruction, and we do not move a call-site across any other
- // instruction.
BasicBlock *CallSiteBB = Instr->getParent();
- if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
- return false;
-
// Need 2 predecessors and cannot split an edge from an IndirectBrInst.
SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
isa<IndirectBrInst>(Preds[1]->getTerminator()))
return false;
- return CallSiteBB->canSplitPredecessors();
+ // BasicBlock::canSplitPredecessors is more aggressive, so we also check
+ // BasicBlock::isEHPad here.
+ if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+ return false;
+
+ // Allow splitting a call-site only when the CodeSize cost of the
+ // instructions before the call is less than DuplicationThreshold. The
+ // instructions before the call will be duplicated in the split blocks and
+ // corresponding uses will be updated.
+ unsigned Cost = 0;
+ for (auto &InstBeforeCall :
+ llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+ Cost += TTI.getInstructionCost(&InstBeforeCall,
+ TargetTransformInfo::TCK_CodeSize);
+ if (Cost >= DuplicationThreshold)
+ return false;
+ }
+
+ return true;
+}
+
+static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
+ Value *V) {
+ Instruction *Copy = I->clone();
+ Copy->setName(I->getName());
+ Copy->insertBefore(Before);
+ if (V)
+ Copy->setOperand(0, V);
+ return Copy;
}
-/// Return true if the CS is split into its new predecessors which are directly
-/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
-/// CallInst1 and CallInst2 will be the new call-sites placed in the new
-/// predecessors split for PredBB1 and PredBB2, respectively.
+/// Copy the mandatory `musttail` return sequence that follows the original
+/// `CI`, and link it up to the `NewCI` value instead:
+///
+/// * (optional) `bitcast NewCI to ...`
+/// * `ret bitcast or NewCI`
+///
+/// Insert this sequence right before `SplitBB`'s terminator, which will be
+/// cleaned up later in `splitCallSite` below.
+static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
+ Instruction *NewCI) {
+ bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
+ auto II = std::next(CI->getIterator());
+
+ BitCastInst *BCI = dyn_cast<BitCastInst>(&*II);
+ if (BCI)
+ ++II;
+
+ ReturnInst *RI = dyn_cast<ReturnInst>(&*II);
+ assert(RI && "`musttail` call must be followed by `ret` instruction");
+
+ TerminatorInst *TI = SplitBB->getTerminator();
+ Value *V = NewCI;
+ if (BCI)
+ V = cloneInstForMustTail(BCI, TI, V);
+ cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V);
+
+ // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug
+ // that prevents doing this now.
+}
+
+/// For each (predecessor, conditions from predecessor) pair, split the basic
+/// block containing the call site, hook it up to the predecessor, and replace
+/// the call instruction with new call instructions, which contain constraints
+/// based on the conditions from their predecessors.
/// For example, in the IR below with an OR condition, the call-site can
-/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
-/// call-site placed between Header and Tail, and CallInst2 will be the
-/// call-site between TBB and Tail.
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Header, and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
///
/// From :
///
@@ -240,60 +299,112 @@ static bool canSplitCallSite(CallSite CS) {
/// Note that in case any arguments at the call-site are constrained by its
/// predecessors, new call-sites with more constrained arguments will be
/// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
- Instruction *CallInst1, Instruction *CallInst2) {
+static void splitCallSite(
+ CallSite CS,
+ const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+ DominatorTree *DT) {
Instruction *Instr = CS.getInstruction();
BasicBlock *TailBB = Instr->getParent();
- assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
-
- BasicBlock *SplitBlock1 =
- SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
- BasicBlock *SplitBlock2 =
- SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
-
- assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
-
- if (!CallInst1)
- CallInst1 = Instr->clone();
- if (!CallInst2)
- CallInst2 = Instr->clone();
-
- CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
- CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
-
- CallSite CS1(CallInst1);
- CallSite CS2(CallInst2);
-
- // Handle PHIs used as arguments in the call-site.
- for (auto &PI : *TailBB) {
- PHINode *PN = dyn_cast<PHINode>(&PI);
- if (!PN)
- break;
- unsigned ArgNo = 0;
- for (auto &CI : CS.args()) {
- if (&*CI == PN) {
- CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
- CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+ bool IsMustTailCall = CS.isMustTailCall();
+
+ PHINode *CallPN = nullptr;
+
+ // `musttail` calls must be followed by an optional `bitcast` and a `ret`.
+ // The split blocks will be terminated right after that, so there are no
+ // users for this phi in `TailBB`.
+ if (!IsMustTailCall && !Instr->use_empty())
+ CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
+
+ LLVM_DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+
+ assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+ // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+ // here.
+ ValueToValueMapTy ValueToValueMaps[2];
+ for (unsigned i = 0; i < Preds.size(); i++) {
+ BasicBlock *PredBB = Preds[i].first;
+ BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+ TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
+ DT);
+ assert(SplitBlock && "Unexpected new basic block split.");
+
+ Instruction *NewCI =
+ &*std::prev(SplitBlock->getTerminator()->getIterator());
+ CallSite NewCS(NewCI);
+ addConditions(NewCS, Preds[i].second);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == &PN) {
+ NewCS.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ }
+ ++ArgNo;
}
- ++ArgNo;
}
+ LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
+ << "\n");
+ if (CallPN)
+ CallPN->addIncoming(NewCI, SplitBlock);
+
+ // Clone and place bitcast and return instructions before `TI`
+ if (IsMustTailCall)
+ copyMustTailReturn(SplitBlock, Instr, NewCI);
+ }
+
+ NumCallSiteSplit++;
+
+ // FIXME: remove TI in `copyMustTailReturn`
+ if (IsMustTailCall) {
+ // Remove superfluous `br` terminators from the end of the Split blocks
+ // NOTE: Removing terminator removes the SplitBlock from the TailBB's
+ // predecessors. Therefore we must get complete list of Splits before
+ // attempting removal.
+ SmallVector<BasicBlock *, 2> Splits(predecessors(TailBB));
+ assert(Splits.size() == 2 && "Expected exactly 2 splits!");
+ for (unsigned i = 0; i < Splits.size(); i++)
+ Splits[i]->getTerminator()->eraseFromParent();
+
+ // Erase the tail block once done with musttail patching
+ TailBB->eraseFromParent();
+ return;
}
+ auto *OriginalBegin = &*TailBB->begin();
// Replace users of the original call with a PHI merging the split call-sites.
- if (Instr->getNumUses()) {
- PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
- TailBB->getFirstNonPHI());
- PN->addIncoming(CallInst1, SplitBlock1);
- PN->addIncoming(CallInst2, SplitBlock2);
- Instr->replaceAllUsesWith(PN);
+ if (CallPN) {
+ CallPN->insertBefore(OriginalBegin);
+ Instr->replaceAllUsesWith(CallPN);
+ }
+
+ // Remove instructions moved to split blocks from TailBB, from the duplicated
+ // call instruction to the beginning of the basic block. If an instruction
+ // has any uses, add a new PHI node to combine the values coming from the
+ // split blocks. The new PHI nodes are placed before the first original
+ // instruction, so we do not end up deleting them. By using reverse-order, we
+ // do not introduce unnecessary PHI nodes for def-use chains from the call
+ // instruction to the beginning of the block.
+ auto I = Instr->getReverseIterator();
+ while (I != TailBB->rend()) {
+ Instruction *CurrentI = &*I++;
+ if (!CurrentI->use_empty()) {
+ // If an existing PHI has users after the call, there is no need to create
+ // a new one.
+ if (isa<PHINode>(CurrentI))
+ continue;
+ PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+ for (auto &Mapping : ValueToValueMaps)
+ NewPN->addIncoming(Mapping[CurrentI],
+ cast<Instruction>(Mapping[CurrentI])->getParent());
+ NewPN->insertBefore(&*TailBB->begin());
+ CurrentI->replaceAllUsesWith(NewPN);
+ }
+ CurrentI->eraseFromParent();
+ // We are done once we handled the first original instruction in TailBB.
+ if (CurrentI == OriginalBegin)
+ break;
}
- DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
- DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
- << "\n");
- DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
- << "\n");
- Instr->eraseFromParent();
- NumCallSiteSplit++;
}
// Return true if the call-site has an argument which is a PHI with only
@@ -324,45 +435,59 @@ static bool isPredicatedOnPHI(CallSite CS) {
return false;
}
-static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) {
if (!isPredicatedOnPHI(CS))
return false;
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
- splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS = {
+ {Preds[0], {}}, {Preds[1], {}}};
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitOnPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
if (Preds[0] == Preds[1])
return false;
- SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
- recordConditions(CS, Preds[0], C1);
- recordConditions(CS, Preds[1], C2);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+ for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ ConditionsTy Conditions;
+ recordConditions(CS, Pred, Conditions);
+ PredsCS.push_back({Pred, Conditions});
+ }
- Instruction *CallInst1 = addConditions(CS, C1);
- Instruction *CallInst2 = addConditions(CS, C2);
- if (!CallInst1 && !CallInst2)
+ if (std::all_of(PredsCS.begin(), PredsCS.end(),
+ [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+ return P.second.empty();
+ }))
return false;
- splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitCallSite(CallSite CS) {
- if (!CS.arg_size() || !canSplitCallSite(CS))
+static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
+ DominatorTree *DT) {
+ if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
return false;
- return tryToSplitOnPredicatedArgument(CS) ||
- tryToSplitOnPHIPredicatedArgument(CS);
+ return tryToSplitOnPredicatedArgument(CS, DT) ||
+ tryToSplitOnPHIPredicatedArgument(CS, DT);
}
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree *DT) {
bool Changed = false;
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
BasicBlock &BB = *BI++;
- for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+ auto IE = BB.getTerminator()->getIterator();
+ // Iterate until we reach the terminator instruction. tryToSplitCallSite
+ // can replace BB's terminator in case BB is a successor of itself. In that
+ // case, IE will be invalidated and we also have to check the current
+ // terminator.
+ while (II != IE && &*II != BB.getTerminator()) {
Instruction *I = &*II++;
CallSite CS(cast<Value>(I));
if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
@@ -371,7 +496,17 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
Function *Callee = CS.getCalledFunction();
if (!Callee || Callee->isDeclaration())
continue;
- Changed |= tryToSplitCallSite(CS);
+
+ // Successful musttail call-site splits result in erased CI and erased BB.
+ // Check if such path is possible before attempting the splitting.
+ bool IsMustTail = CS.isMustTailCall();
+
+ Changed |= tryToSplitCallSite(CS, TTI, DT);
+
+ // There are no interesting instructions after this point. The call site
+ // itself might have been erased during splitting.
+ if (IsMustTail)
+ break;
}
}
return Changed;
@@ -386,6 +521,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -394,7 +531,10 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
return false;
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return doCallSiteSplitting(F, TLI);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return doCallSiteSplitting(F, TLI, TTI,
+ DTWP ? &DTWP->getDomTree() : nullptr);
}
};
} // namespace
@@ -403,6 +543,7 @@ char CallSiteSplittingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -412,9 +553,12 @@ FunctionPass *llvm::createCallSiteSplittingPass() {
PreservedAnalyses CallSiteSplittingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- if (!doCallSiteSplitting(F, TLI))
+ if (!doCallSiteSplitting(F, TLI, TTI, DT))
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
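The use of getCachedResult rather than getResult here is deliberate: the pass updates a DominatorTree when one already exists but never forces one to be built. A minimal sketch of that pattern, assuming the new-PM FunctionAnalysisManager API of this era:

```cpp
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

static DominatorTree *getOptionalDT(Function &F, FunctionAnalysisManager &AM) {
  // Null unless an earlier pass already computed the tree; callers update it
  // opportunistically instead of recomputing it from scratch.
  return AM.getCachedResult<DominatorTreeAnalysis>(F);
}
```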
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index e4b08c5ed305..3a675b979017 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -43,8 +43,10 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -59,8 +61,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -84,7 +84,7 @@ static cl::opt<bool> ConstHoistWithBlockFrequency(
namespace {
-/// \brief The constant hoisting pass.
+/// The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -127,13 +127,13 @@ FunctionPass *llvm::createConstantHoistingPass() {
return new ConstantHoistingLegacyPass();
}
-/// \brief Perform the constant hoisting optimization for the given function.
+/// Perform the constant hoisting optimization for the given function.
bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
if (skipFunction(Fn))
return false;
- DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
- DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
bool MadeChange =
Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
@@ -144,16 +144,16 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
Fn.getEntryBlock());
if (MadeChange) {
- DEBUG(dbgs() << "********** Function after Constant Hoisting: "
- << Fn.getName() << '\n');
- DEBUG(dbgs() << Fn);
+ LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << Fn);
}
- DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
return MadeChange;
}
-/// \brief Find the constant materialization insertion point.
+/// Find the constant materialization insertion point.
Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
unsigned Idx) const {
// If the operand is a cast instruction, then we have to materialize the
@@ -187,7 +187,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
return IDom->getBlock()->getTerminator();
}
-/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// Given \p BBs as input, find another set of BBs which collectively
/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
/// set found in \p BBs.
static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
@@ -289,7 +289,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
}
}
-/// \brief Find an insertion point that dominates all uses.
+/// Find an insertion point that dominates all uses.
SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
@@ -335,7 +335,7 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
return InsertPts;
}
-/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// Record constant integer ConstInt for instruction Inst at operand
/// index Idx.
///
/// The operand at index Idx is not necessarily the constant integer itself. It
@@ -364,18 +364,17 @@ void ConstantHoistingPass::collectConstantCandidates(
Itr->second = ConstCandVec.size() - 1;
}
ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
- DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
- dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+ << "Collect constant " << *ConstInt << " from " << *Inst
<< " with cost " << Cost << '\n';
- else
- dbgs() << "Collect constant " << *ConstInt << " indirectly from "
- << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
- << Cost << '\n';
- );
+ else dbgs() << "Collect constant " << *ConstInt
+ << " indirectly from " << *Inst << " via "
+ << *Inst->getOperand(Idx) << " with cost " << Cost
+ << '\n';);
}
}
-/// \brief Check the operand for instruction Inst at index Idx.
+/// Check the operand for instruction Inst at index Idx.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
Value *Opnd = Inst->getOperand(Idx);
@@ -416,7 +415,7 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-/// \brief Scan the instruction for expensive integer constants and record them
+/// Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst) {
@@ -436,7 +435,7 @@ void ConstantHoistingPass::collectConstantCandidates(
} // end of for all operands
}
-/// \brief Collect all integer constants in the function that cannot be folded
+/// Collect all integer constants in the function that cannot be folded
/// into an instruction itself.
void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
ConstCandMapType ConstCandMap;
@@ -501,20 +500,21 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
return NumUses;
}
- DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
int MaxCost = -1;
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
auto Value = ConstCand->ConstInt->getValue();
Type *Ty = ConstCand->ConstInt->getType();
int Cost = 0;
NumUses += ConstCand->Uses.size();
- DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+ LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+ << "\n");
for (auto User : ConstCand->Uses) {
unsigned Opcode = User.Inst->getOpcode();
unsigned OpndIdx = User.OpndIdx;
Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
- DEBUG(dbgs() << "Cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
for (auto C2 = S; C2 != E; ++C2) {
Optional<APInt> Diff = calculateOffsetDiff(
@@ -524,24 +524,24 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
const int ImmCosts =
TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
Cost -= ImmCosts;
- DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
- << "has penalty: " << ImmCosts << "\n"
- << "Adjusted cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
}
}
}
- DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
if (Cost > MaxCost) {
MaxCost = Cost;
MaxCostItr = ConstCand;
- DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
- << "\n");
+ LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
}
}
return NumUses;
}
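The scoring loop above reduces to simple arithmetic: each use of a candidate contributes its immediate cost, and rebasing every other constant as candidate+offset deducts a code-size penalty. A standalone sketch with made-up numbers (the real loop queries TTI->getIntImmCost and TTI->getIntImmCodeSizeCost):

```cpp
#include <cstdio>
#include <vector>

int main() {
  struct Cand { int NumUses, CostPerUse, OffsetPenalty; };
  std::vector<Cand> Cands = {{4, 2, 1}, {2, 2, 0}};
  int MaxCost = -1, Best = -1;
  for (int i = 0, e = (int)Cands.size(); i != e; ++i) {
    // Uses of the candidate pay off; rebasing every other constant as
    // candidate+offset costs code size.
    int Cost = Cands[i].NumUses * Cands[i].CostPerUse -
               Cands[i].OffsetPenalty * (e - 1);
    if (Cost > MaxCost) { MaxCost = Cost; Best = i; }
  }
  std::printf("base candidate %d, cost %d\n", Best, MaxCost); // 0, 7
}
```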
-/// \brief Find the base constant within the given range and rebase all other
+/// Find the base constant within the given range and rebase all other
/// constants with respect to the base constant.
void ConstantHoistingPass::findAndMakeBaseConstant(
ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
@@ -567,12 +567,12 @@ void ConstantHoistingPass::findAndMakeBaseConstant(
ConstantVec.push_back(std::move(ConstInfo));
}
-/// \brief Finds and combines constant candidates that can be easily
+/// Finds and combines constant candidates that can be easily
/// rematerialized with an add from a common base constant.
void ConstantHoistingPass::findBaseConstants() {
// Sort the constants by value and type. This invalidates the mapping!
- std::sort(ConstCandVec.begin(), ConstCandVec.end(),
- [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ llvm::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
return LHS.ConstInt->getType()->getBitWidth() <
RHS.ConstInt->getType()->getBitWidth();
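The comparator orders candidates first by bit width, then by value, so equal-width constants keep a deterministic order (which is what the std::sort to llvm::sort change is policing). A standalone illustration with (bit width, value) pairs:

```cpp
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<unsigned, unsigned long>> Cands = {
      {64, 5}, {32, 9}, {32, 7}};
  std::sort(Cands.begin(), Cands.end(),
            [](const std::pair<unsigned, unsigned long> &L,
               const std::pair<unsigned, unsigned long> &R) {
              if (L.first != R.first)
                return L.first < R.first; // narrower types first
              return L.second < R.second; // then smaller values
            });
  for (auto &C : Cands)
    std::printf("i%u %lu\n", C.first, C.second); // i32 7, i32 9, i64 5
}
```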
@@ -601,7 +601,7 @@ void ConstantHoistingPass::findBaseConstants() {
findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
}
-/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// Updates the operand at Idx in instruction Inst with the result of
/// instruction Mat. If the instruction is a PHI node then special
/// handling for duplicate values from the same incoming basic block is
/// required.
@@ -629,7 +629,7 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
return true;
}
-/// \brief Emit materialization code for all rebased constants and update their
+/// Emit materialization code for all rebased constants and update their
/// users.
void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Constant *Offset,
@@ -641,19 +641,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
"const_mat", InsertionPt);
- DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
- << " + " << *Offset << ") in BB "
- << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n'
+ << *Mat << '\n');
Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
}
Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
// Visit constant integer.
if (isa<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
Mat->eraseFromParent();
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -669,13 +670,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
ClonedCastInst->insertAfter(CastInst);
// Use the same debug location as the original cast instruction.
ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
- DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
- << "To : " << *ClonedCastInst << '\n');
+ LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
}
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -689,20 +690,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Use the same debug location as the instruction we are about to update.
ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
- DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
- << "From : " << *ConstExpr << '\n');
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
ConstExprInst->eraseFromParent();
if (Offset)
Mat->eraseFromParent();
}
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
}
-/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// Hoist and hide the base constant behind a bitcast and emit
/// materialization code for derived constants.
bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
@@ -720,9 +721,9 @@ bool ConstantHoistingPass::emitBaseConstants() {
Base->setDebugLoc(IP->getDebugLoc());
- DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
- << ") to BB " << IP->getParent()->getName() << '\n'
- << *Base << '\n');
+ LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
// Emit materialization code for all rebased constants.
unsigned Uses = 0;
@@ -765,7 +766,7 @@ bool ConstantHoistingPass::emitBaseConstants() {
return MadeChange;
}
-/// \brief Check all cast instructions we made a copy of and remove them if they
+/// Check all cast instructions we made a copy of and remove them if they
/// have no more users.
void ConstantHoistingPass::deleteDeadCastInst() const {
for (auto const &I : ClonedCastMap)
@@ -773,7 +774,7 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
I.first->eraseFromParent();
}
-/// \brief Optimize expensive integer constants in the given function.
+/// Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BlockFrequencyInfo *BFI,
BasicBlock &Entry) {
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 4fa27891a974..46915889ce7c 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -21,12 +21,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 8f468ebf8949..ea148b728a10 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -28,11 +29,11 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -52,12 +52,14 @@ using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
STATISTIC(NumSelects, "Number of selects propagated");
STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumOverflows, "Number of overflow checks removed");
@@ -77,8 +79,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
}
};
@@ -88,6 +92,7 @@ char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
@@ -101,14 +106,14 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
- Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
- Value *ReplaceWith = S->getOperand(1);
- Value *Other = S->getOperand(2);
+ Value *ReplaceWith = S->getTrueValue();
+ Value *Other = S->getFalseValue();
if (!CI->isOne()) std::swap(ReplaceWith, Other);
if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
@@ -120,7 +125,63 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
return true;
}
-static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+/// %isnull = icmp eq i8* %x, null
+/// br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+/// br label %bb2
+/// bb2:
+/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+/// %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ // Collect incoming constants and initialize possible common value.
+ SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+ Value *CommonValue = nullptr;
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+ IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+ } else if (!CommonValue) {
+ // The potential common value is initialized to the first non-constant.
+ CommonValue = Incoming;
+ } else if (Incoming != CommonValue) {
+ // There can be only one non-constant common value.
+ return false;
+ }
+ }
+
+ if (!CommonValue || IncomingConstants.empty())
+ return false;
+
+ // The common value must be valid in all incoming blocks.
+ BasicBlock *ToBB = P->getParent();
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ if (!DT->dominates(CommonInst, ToBB))
+ return false;
+
+ // We have a phi with exactly 1 variable incoming value and 1 or more constant
+ // incoming values. See if all constant incoming values can be mapped back to
+ // the same incoming variable value.
+ for (auto &IncomingConstant : IncomingConstants) {
+ Constant *C = IncomingConstant.first;
+ BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+ if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+ return false;
+ }
+
+ // All constant incoming values map to the same variable along the incoming
+ // edges of the phi. The phi is unnecessary.
+ P->replaceAllUsesWith(CommonValue);
+ P->eraseFromParent();
+ ++NumPhiCommon;
+ return true;
+}
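A usage sketch of the per-edge test that drives the loop above, with LVI's getConstantOnEdge interface as it exists at this revision; the helper name is hypothetical:

```cpp
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool constantMatchesEdge(LazyValueInfo *LVI, Value *CommonValue,
                                Constant *C, BasicBlock *IncomingBB,
                                PHINode *P) {
  // The incoming constant is redundant iff LVI proves CommonValue equals it
  // along the same incoming edge.
  return C == LVI->getConstantOnEdge(CommonValue, IncomingBB, P->getParent(),
                                     /*CxtI=*/P);
}
```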
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
const SimplifyQuery &SQ) {
bool Changed = false;
@@ -168,7 +229,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
V = SI->getTrueValue();
}
- DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
}
P->setIncomingValue(i, V);
@@ -181,6 +242,9 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
Changed = true;
}
+ if (!Changed)
+ Changed = simplifyCommonValuePhi(P, LVI, DT);
+
if (Changed)
++NumPhis;
@@ -243,7 +307,7 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
/// that cannot fire no matter what the incoming edge can safely be removed. If
/// a case fires on every incoming edge then the entire switch can be removed
/// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
@@ -258,6 +322,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// Analyse each switch case in turn.
bool Changed = false;
+ DenseMap<BasicBlock*, int> SuccessorsCount;
+ for (auto *Succ : successors(BB))
+ SuccessorsCount[Succ]++;
+
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
@@ -292,7 +360,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI->getCaseSuccessor()->removePredecessor(BB);
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
CI = SI->removeCase(CI);
CE = SI->case_end();
@@ -302,6 +371,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
+ if (--SuccessorsCount[Succ] == 0)
+ DT->deleteEdge(BB, Succ);
continue;
}
if (State == LazyValueInfo::True) {
@@ -318,10 +389,14 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++CI;
}
- if (Changed)
+ if (Changed) {
// If the switch has been simplified to the point where it can be replaced
// by a branch then do so now.
- ConstantFoldTerminator(BB);
+ DeferredDominance DDT(*DT);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+ /*TLI = */ nullptr, &DDT);
+ DDT.flush();
+ }
return Changed;
}
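The SuccessorsCount bookkeeping is needed because several cases of one switch can share a successor: the BB->Succ edge only leaves the CFG (and hence the DominatorTree) when the last such case is removed. A standalone model, with basic blocks as strings:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> CaseTargets = {"bb1", "bb2", "bb1"};
  std::map<std::string, int> SuccessorsCount;
  for (const auto &Succ : CaseTargets)
    SuccessorsCount[Succ]++;
  // Drop both dead cases that jump to bb1; only the second removal actually
  // deletes the CFG (and DT) edge.
  for (int i = 0; i < 2; ++i)
    if (--SuccessorsCount["bb1"] == 0)
      std::printf("deleteEdge(BB, bb1)\n"); // fires exactly once
}
```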
@@ -430,9 +505,50 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
return true;
}
+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::UDiv ||
+ Instr->getOpcode() == Instruction::URem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
+ auto OrigWidth = Instr->getType()->getIntegerBitWidth();
+ ConstantRange OperandRange(OrigWidth, /*isFullSet=*/false);
+ for (Value *Operand : Instr->operands()) {
+ OperandRange = OperandRange.unionWith(
+ LVI->getConstantRange(Operand, Instr->getParent()));
+ }
+ // Don't shrink below 8 bits wide.
+ unsigned NewWidth = std::max<unsigned>(
+ PowerOf2Ceil(OperandRange.getUnsignedMax().getActiveBits()), 8);
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
+ if (NewWidth >= OrigWidth)
+ return false;
+
+ ++NumUDivs;
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc", Instr);
+ auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc", Instr);
+ auto *BO =
+ BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr);
+ auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(),
+ Instr->getName() + ".zext", Instr);
+ if (BO->getOpcode() == Instruction::UDiv)
+ BO->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Zext);
+ Instr->eraseFromParent();
+ return true;
+}
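The width selection boils down to: take the power-of-two ceiling of the operands' active bits, clamp to at least 8, and only rewrite if that actually shrinks the type. A standalone sketch of the same computation (the pass performs it on the APInt range LVI reports):

```cpp
#include <cstdint>
#include <cstdio>

static unsigned activeBits(uint64_t V) {   // position of the highest set bit
  unsigned N = 0;
  while (V) { ++N; V >>= 1; }
  return N;
}

static unsigned powerOf2Ceil(unsigned X) { // smallest power of 2 >= X
  unsigned R = 1;
  while (R < X) R <<= 1;
  return R;
}

static unsigned narrowedWidth(uint64_t UnsignedMax, unsigned OrigWidth) {
  unsigned NewWidth = powerOf2Ceil(activeBits(UnsignedMax));
  if (NewWidth < 8) NewWidth = 8;          // don't shrink below i8
  return NewWidth < OrigWidth ? NewWidth : OrigWidth; // never widen
}

int main() {
  // An i32 udiv whose operands provably fit in 10 bits shrinks to i16.
  std::printf("%u\n", narrowedWidth(/*UnsignedMax=*/1023, /*OrigWidth=*/32));
}
```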
+
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSRems;
@@ -440,6 +556,10 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->getName(), SDI);
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+
+ // Try to process our new urem.
+ processUDivOrURem(BO, LVI);
+
return true;
}
@@ -449,8 +569,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSDivs;
@@ -460,6 +579,9 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+ // Try to simplify our new udiv.
+ processUDivOrURem(BO, LVI);
+
return true;
}
@@ -559,7 +681,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
ConstantInt::getFalse(C->getContext());
}
-static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
bool FnChanged = false;
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
@@ -575,7 +698,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -595,6 +718,10 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::UDiv:
+ case Instruction::URem:
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ break;
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
@@ -607,7 +734,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
@@ -636,18 +763,22 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
-
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index fa4806e884c3..6078967a0f94 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -20,11 +20,11 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "dce"
@@ -50,6 +50,7 @@ namespace {
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = &*DI++;
if (isInstructionTriviallyDead(Inst, TLI)) {
+ salvageDebugInfo(*Inst);
Inst->eraseFromParent();
Changed = true;
++DIEEliminated;
@@ -76,6 +77,8 @@ static bool DCEInstruction(Instruction *I,
SmallSetVector<Instruction *, 16> &WorkList,
const TargetLibraryInfo *TLI) {
if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
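Both DCE paths now share the same delete-with-salvage pattern. A minimal sketch, assuming llvm::salvageDebugInfo from Transforms/Utils/Local.h as declared at this revision:

```cpp
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void eraseButKeepDebugInfo(Instruction *I) {
  // Rewrite any dbg.value users of I in terms of I's operands where
  // possible, so variable locations survive the deletion.
  salvageDebugInfo(*I);
  I->eraseFromParent();
}
```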
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index e703014bb0e6..dd1a2a6adb82 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,11 +57,10 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
-#include <cstdint>
#include <cstddef>
+#include <cstdint>
#include <iterator>
#include <map>
#include <utility>
@@ -115,6 +115,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
Instruction *DeadInst = NowDeadInsts.pop_back_val();
++NumFastOther;
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+
// This instruction is dead, zap it, in stages. Start by removing it from
// MemDep, which needs to know the operands and needs it to be in the
// function.
@@ -146,7 +149,8 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
/// Does this instruction write some memory? This only returns true for things
/// that we can analyze with other helpers below.
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+ const TargetLibraryInfo &TLI) {
if (isa<StoreInst>(I))
return true;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -156,6 +160,9 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::init_trampoline:
case Intrinsic::lifetime_end:
return true;
@@ -180,43 +187,45 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
/// Return a Location stored to by the specified instruction. If isRemovable
/// returns true, this function and getLocForRead completely describe the memory
/// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+static MemoryLocation getLocForWrite(Instruction *Inst) {
+
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return MemoryLocation::get(SI);
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
// memcpy/memmove/memset.
MemoryLocation Loc = MemoryLocation::getForDest(MI);
return Loc;
}
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return MemoryLocation();
-
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
- // FIXME: We don't know the size of the trampoline, so we can't really
- // handle it here.
- return MemoryLocation(II->getArgOperand(0));
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
+ return MemoryLocation(II->getArgOperand(0));
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
}
+ if (auto CS = CallSite(Inst))
+ // All the supported TLI functions so far happen to have dest as their
+ // first argument.
+ return MemoryLocation(CS.getArgument(0));
+ return MemoryLocation();
}
-/// Return the location read by the specified "hasMemoryWrite" instruction if
-/// any.
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
static MemoryLocation getLocForRead(Instruction *Inst,
const TargetLibraryInfo &TLI) {
- assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+ assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
// The only instructions that both read and write are the mem transfer
// instructions (memcpy/memmove).
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
return MemoryLocation::getForSource(MTI);
return MemoryLocation();
}
@@ -230,7 +239,7 @@ static bool isRemovable(Instruction *I) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
case Intrinsic::lifetime_end:
// Never remove dead lifetime_end's, e.g. because it is followed by a
// free.
@@ -243,9 +252,14 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::memcpy:
// Don't remove volatile memory intrinsics.
return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ return true;
}
}
+ // Note: we only get here for calls with analyzable writes, i.e. libcalls.
if (auto CS = CallSite(I))
return CS.getInstruction()->use_empty();
@@ -264,6 +278,8 @@ static bool isShortenableAtTheEnd(Instruction *I) {
default: return false;
case Intrinsic::memset:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
// Do shorten memory intrinsics.
// FIXME: Add memmove if it's also safe to transform.
return true;
@@ -280,35 +296,27 @@ static bool isShortenableAtTheEnd(Instruction *I) {
static bool isShortenableAtTheBeginning(Instruction *I) {
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
// easily done by offsetting the source address.
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return II && II->getIntrinsicID() == Intrinsic::memset;
+ return isa<AnyMemSetInst>(I);
}
/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return MI->getDest();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::init_trampoline:
- return II->getArgOperand(0);
- }
- }
-
- CallSite CS(I);
- // All the supported functions so far happen to have dest as their first
- // argument.
- return CS.getArgument(0);
+ // TODO: Factor this to reuse getLocForWrite.
+ MemoryLocation Loc = getLocForWrite(I);
+ assert(Loc.Ptr &&
+ "unable to find pointer written for analyzable instruction?");
+ // TODO: most APIs don't expect const Value *
+ return const_cast<Value*>(Loc.Ptr);
}
static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ const Function *F) {
uint64_t Size;
- if (getObjectSize(V, Size, DL, &TLI))
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize = NullPointerIsDefined(F);
+
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
@@ -338,7 +346,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
const TargetLibraryInfo &TLI,
int64_t &EarlierOff, int64_t &LaterOff,
Instruction *DepWrite,
- InstOverlapIntervalsTy &IOL) {
+ InstOverlapIntervalsTy &IOL,
+ AliasAnalysis &AA,
+ const Function *F) {
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
@@ -349,7 +359,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the start pointers are the same, we just have to compare sizes to see if
// the later store was larger than the earlier store.
- if (P1 == P2) {
+ if (P1 == P2 || AA.isMustAlias(P1, P2)) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
return OW_Complete;
@@ -367,7 +377,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
return OW_Complete;
@@ -415,9 +425,10 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// Insert our part of the overlap into the map.
auto &IM = IOL[DepWrite];
- DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
- LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
+ << ", " << int64_t(EarlierOff + Earlier.Size)
+ << ") Later [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// Make sure that we only insert non-overlapping intervals and combine
// adjacent intervals. The intervals are stored in the map with the ending
@@ -454,11 +465,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
ILI = IM.begin();
if (ILI->second <= EarlierOff &&
ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
- DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
- EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) <<
- ") Composite Later [" <<
- ILI->second << ", " << ILI->first << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") Composite Later [" << ILI->second << ", "
+ << ILI->first << ")\n");
++NumCompletePartials;
return OW_Complete;
}
@@ -469,10 +480,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
int64_t(EarlierOff + Earlier.Size) > LaterOff &&
uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
- DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff
- << ", " << int64_t(EarlierOff + Earlier.Size)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// TODO: Maybe come up with a better name?
return OW_PartialEarlierWithFullLater;
}
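A standalone model of the "full overwrite from partials" check: the earlier store dies once the recorded later-write intervals coalesce into a single interval covering it. Intervals are stored end -> start, as in the pass; the merge here is deliberately naive (one neighbor):

```cpp
#include <algorithm>
#include <cstdio>
#include <map>

int main() {
  std::map<long, long> IM; // interval end -> interval start
  auto Insert = [&](long Begin, long End) {
    auto It = IM.lower_bound(Begin); // first interval ending at/after Begin
    if (It != IM.end() && It->second <= End) { // overlaps or touches: merge
      Begin = std::min(Begin, It->second);
      End = std::max(End, It->first);
      IM.erase(It);
    }
    IM[End] = Begin;
  };
  Insert(0, 4);
  Insert(4, 8); // coalesces with [0, 4) into [0, 8)
  long EarlierOff = 0, EarlierSize = 8;
  auto ILI = IM.begin();
  if (ILI->second <= EarlierOff &&
      ILI->first >= EarlierOff + EarlierSize)
    std::printf("OW_Complete\n"); // earlier [0, 8) is fully covered
}
```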
@@ -514,8 +526,8 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
/// memory region into an identical pointer) then it doesn't actually make its
/// input dead in the traditional sense. Consider this case:
///
-/// memcpy(A <- B)
-/// memcpy(A <- A)
+/// memmove(A <- B)
+/// memmove(A <- A)
///
/// In this case, the second store to A does not make the first store to A dead.
/// The usual situation isn't an explicit A<-A store like this (which can be
@@ -531,24 +543,35 @@ static bool isPossibleSelfRead(Instruction *Inst,
// Self reads can only happen for instructions that read memory. Get the
// location read.
MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
- if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+ if (!InstReadLoc.Ptr)
+ return false; // Not a reading instruction.
// If the read and written loc obviously don't alias, it isn't a read.
- if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
-
- // Okay, 'Inst' may copy over itself. However, we can still remove a the
- // DepWrite instruction if we can prove that it reads from the same location
- // as Inst. This handles useful cases like:
- // memcpy(A <- B)
- // memcpy(A <- B)
- // Here we don't know if A/B may alias, but we do know that B/B are must
- // aliases, so removing the first memcpy is safe (assuming it writes <= #
- // bytes as the second one.
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
-
- if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
return false;
+ if (isa<AnyMemCpyInst>(Inst)) {
+ // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+ // but in practice memcpy(A <- B) either means that A and B are disjoint or
+ // are equal (i.e. there are no partial overlaps). Given that, if we have:
+ //
+ // memcpy/memmove(A <- B) // DepWrite
+ // memcpy(A <- B) // Inst
+ //
+ // with Inst reading/writing a size >= DepWrite's, we can reason as
+ // follows:
+ //
+ // - If A == B then both the copies are no-ops, so the DepWrite can be
+ // removed.
+ // - If A != B then A and B are disjoint locations in Inst. Since
+ // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
+ // Therefore DepWrite can be removed.
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+ }
+
// If DepWrite doesn't read memory or if we can't prove it is a must alias,
// then it can't be considered dead.
return true;
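A concrete instance of the DepWrite-removal case described above, in plain C++ rather than IR:

```cpp
#include <cstdio>
#include <cstring>

int main() {
  char A[4], B[4] = "abc";
  std::memcpy(A, B, sizeof(A)); // removable: fully rewritten by the next
                                // copy from the same must-alias source
  std::memcpy(A, B, sizeof(A));
  std::puts(A); // "abc" either way
}
```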
@@ -650,7 +673,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
- if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
+ !isRemovable(Dependency))
break;
Value *DepPointer =
@@ -660,8 +684,9 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
break;
- DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
- << *Dependency << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
// DCE instructions only used to calculate that store.
BasicBlock::iterator BBI(Dependency);
@@ -690,7 +715,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
SmallSetVector<Value *, 16> &DeadStackObjects,
const DataLayout &DL, AliasAnalysis *AA,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ const Function *F) {
const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
// A constant can't be in the dead pointer set.
@@ -707,7 +733,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
// Remove objects that could alias LoadedLoc.
DeadStackObjects.remove_if([&](Value *I) {
// See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
return !AA->isNoAlias(StackLoc, LoadedLoc);
});
}
@@ -754,7 +780,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
--BBI;
// If we find a store, check to see if it points into a dead stack value.
- if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<Value *, 4> Pointers;
GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
@@ -770,15 +796,16 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
if (AllDead) {
Instruction *Dead = &*BBI;
- DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Objects: ";
- for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
- E = Pointers.end(); I != E; ++I) {
- dbgs() << **I;
- if (std::next(I) != E)
- dbgs() << ", ";
- }
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end();
+ I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ } dbgs()
+ << '\n');
// DCE instructions only used to calculate that store.
deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
@@ -790,8 +817,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(&*BBI, TLI)) {
- DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
- << *&*BBI << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
@@ -820,7 +847,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// the call is live.
DeadStackObjects.remove_if([&](Value *I) {
// See if the call site touches the value.
- return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
+ return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI,
+ BB.getParent())));
});
// If all of the allocas were clobbered by the call then we're not going
@@ -848,8 +876,6 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
LoadedLoc = MemoryLocation::get(L);
} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
LoadedLoc = MemoryLocation::get(V);
- } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
- LoadedLoc = MemoryLocation::getForSource(MTI);
} else if (!BBI->mayReadFromMemory()) {
// Instruction doesn't read memory. Note that stores that weren't removed
// above will hit this case.
@@ -861,7 +887,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any allocas from the DeadPointer set that are loaded, as this
// makes any stores above the access live.
- removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
// If all of the allocas were clobbered by the access then we're not going
// to find anything else to process.
@@ -881,8 +907,8 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
// Power of 2 vector writes are probably always a bad idea to optimize
// as any store/memset/memcpy is likely using vector instructions so
// shortening it to not vector size is likely to be slower
- MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
if (!IsOverwriteEnd)
LaterOffset = int64_t(LaterOffset + LaterSize);
@@ -890,15 +916,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
!((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
return false;
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
- << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
- << ")\n");
-
int64_t NewLength = IsOverwriteEnd
? LaterOffset - EarlierOffset
: EarlierSize - (LaterOffset - EarlierOffset);
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *EarlierWrite << "\n KILLER (offset " << LaterOffset
+ << ", " << EarlierSize << ")\n");
+
Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
ConstantInt::get(EarlierWriteLength->getType(), NewLength);
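The new guard keeps trimmed atomic intrinsics well-formed: the shortened length must remain a whole number of elements. A standalone check illustrating the rule:

```cpp
#include <cstdint>
#include <cstdio>

static bool canTrimAtomicTo(uint64_t NewLength, uint32_t ElementSize) {
  return NewLength % ElementSize == 0;
}

int main() {
  // A 16-byte atomic copy with 4-byte elements may be trimmed to 12 bytes,
  // never to 10.
  std::printf("%d %d\n", canTrimAtomicTo(12, 4), canTrimAtomicTo(10, 4));
  // prints "1 0"
}
```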
@@ -966,7 +1000,7 @@ static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
bool Changed = false;
for (auto OI : IOL) {
Instruction *EarlierWrite = OI.first;
- MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ MemoryLocation Loc = getLocForWrite(EarlierWrite);
assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
@@ -1002,8 +1036,9 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
- DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
- << *DepLoad << "\n STORE: " << *SI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumRedundantStores;
@@ -1019,7 +1054,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
@@ -1067,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
}
// Check to see if Inst writes to memory. If not, continue.
- if (!hasMemoryWrite(Inst, *TLI))
+ if (!hasAnalyzableMemoryWrite(Inst, *TLI))
continue;
// eliminateNoopStore will update in iterator, if necessary.
@@ -1085,7 +1120,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
continue;
// Figure out what location is being stored to.
- MemoryLocation Loc = getLocForWrite(Inst, *AA);
+ MemoryLocation Loc = getLocForWrite(Inst);
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
@@ -1107,7 +1142,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
//
// Find out what memory location the dependent instruction stores.
Instruction *DepWrite = InstDep.getInst();
- MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+ break;
+ MemoryLocation DepLoc = getLocForWrite(DepWrite);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
@@ -1145,12 +1182,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR =
- isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
- DepWrite, IOL);
+ OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
+ InstWriteOffset, DepWrite, IOL, *AA,
+ BB.getParent());
if (OR == OW_Complete) {
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
- << *DepWrite << "\n KILLER: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
+ << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
@@ -1176,7 +1213,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
auto *Earlier = dyn_cast<StoreInst>(DepWrite);
auto *Later = dyn_cast<StoreInst>(Inst);
if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
- Later && isa<ConstantInt>(Later->getValueOperand())) {
+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
+ memoryIsNotModifiedBetween(Earlier, Later, AA)) {
// If the store we find is:
// a) partially overwritten by the store to 'Loc'
// b) the later store is fully contained in the earlier one and
@@ -1207,9 +1245,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// store, shifted appropriately.
APInt Merged =
(EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
- << "\n Later: " << *Inst
- << "\n Merged Value: " << Merged << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
+ << "\n Later: " << *Inst
+ << "\n Merged Value: " << Merged << '\n');
auto *SI = new StoreInst(
ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp
index e383af89a384..e1bc590c5c9a 100644
--- a/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -48,7 +50,10 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
// Insert all divide and remainder instructions into maps keyed by their
// operands and opcode (signed or unsigned).
- DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap;
+ DenseMap<DivRemMapKey, Instruction *> DivMap;
+ // Use a MapVector for RemMap so that instructions are moved/inserted in a
+ // deterministic order.
+ MapVector<DivRemMapKey, Instruction *> RemMap;
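+  // RemMap is iterated below while the IR is being mutated; with a DenseMap
+  // the visit order depended on pointer hashing, so the rewritten output
+  // (e.g. instruction order) could vary from run to run.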
for (auto &BB : F) {
for (auto &I : BB) {
if (I.getOpcode() == Instruction::SDiv)
@@ -67,14 +72,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
// rare than division.
for (auto &RemPair : RemMap) {
// Find the matching division instruction from the division map.
- Instruction *DivInst = DivMap[RemPair.getFirst()];
+ Instruction *DivInst = DivMap[RemPair.first];
if (!DivInst)
continue;
// We have a matching pair of div/rem instructions. If one dominates the
// other, hoist and/or replace one.
NumPairs++;
- Instruction *RemInst = RemPair.getSecond();
+ Instruction *RemInst = RemPair.second;
bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 5798e1c4ee99..565745d12e99 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -49,10 +50,10 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <deque>
#include <memory>
@@ -70,13 +71,16 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
+DEBUG_COUNTER(CSECounter, "early-cse",
+ "Controls which instructions are removed");
+
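+// With the generic DebugCounter bisection syntax (assumed here), e.g.
+//   opt -early-cse -debug-counter=early-cse-skip=10,early-cse-count=1 ...
+// only the eleventh candidate transformation is performed, which helps
+// isolate a miscompile to a single CSE step.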
//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//
namespace {
-/// \brief Struct representing the available values in the scoped hash table.
+/// Struct representing the available values in the scoped hash table.
struct SimpleValue {
Instruction *Inst;
@@ -151,12 +155,15 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor;
// TODO: We should also detect FP min/max.
if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
- SPF == SPF_UMIN || SPF == SPF_UMAX ||
- SPF == SPF_ABS || SPF == SPF_NABS) {
+ SPF == SPF_UMIN || SPF == SPF_UMAX) {
if (A > B)
std::swap(A, B);
return hash_combine(Inst->getOpcode(), SPF, A, B);
}
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ // ABS/NABS always puts the input in A and its negation in B.
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
if (CastInst *CI = dyn_cast<CastInst>(Inst))
return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
@@ -226,8 +233,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
LSPF == SPF_ABS || LSPF == SPF_NABS) {
Value *RHSA, *RHSB;
SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor;
- return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) ||
- (LHSA == RHSB && LHSB == RHSA)));
+ if (LSPF == RSPF) {
+ // Abs results are placed in a defined order by matchSelectPattern.
+ if (LSPF == SPF_ABS || LSPF == SPF_NABS)
+ return LHSA == RHSA && LHSB == RHSB;
+ return ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA));
+ }
}
return false;
@@ -239,7 +251,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
namespace {
-/// \brief Struct representing the available call values in the scoped hash
+/// Struct representing the available call values in the scoped hash
/// table.
struct CallValue {
Instruction *Inst;
@@ -305,7 +317,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
@@ -329,7 +341,7 @@ public:
ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
AllocatorTy>;
- /// \brief A scoped hash table of the current values of all of our simple
+ /// A scoped hash table of the current values of all of our simple
/// scalar expressions.
///
/// As we walk down the domtree, we look to see if instructions are in this:
@@ -337,8 +349,8 @@ public:
/// that dominated values can succeed in their lookup.
ScopedHTType AvailableValues;
- /// A scoped hash table of the current values of previously encounted memory
- /// locations.
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
///
/// This allows us to get efficient access to dominating loads or stores when
/// we have a fully redundant load. In addition to the most recent load, we
@@ -356,13 +368,12 @@ public:
unsigned Generation = 0;
int MatchingId = -1;
bool IsAtomic = false;
- bool IsInvariant = false;
LoadValue() = default;
LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
- bool IsAtomic, bool IsInvariant)
+ bool IsAtomic)
: DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
+ IsAtomic(IsAtomic) {}
};
using LoadMapAllocator =
@@ -373,8 +384,19 @@ public:
LoadMapAllocator>;
LoadHTType AvailableLoads;
+
+  /// A scoped hash table mapping memory locations (represented as typed
+  /// addresses) to generation numbers at which that memory location became
+  /// (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
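+  // E.g. if a location became invariant at generation 3, a load of it whose
+  // available value was recorded at generation 3 or later can be reused
+  // unconditionally; a value recorded at generation 2 may predate the final
+  // write and cannot.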
- /// \brief A scoped hash table of the current values of read-only call
+ /// A scoped hash table of the current values of read-only call
/// values.
///
/// It uses the same generation count as loads.
@@ -382,10 +404,10 @@ public:
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;
- /// \brief This is the current generation of the memory value.
+ /// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;
- /// \brief Set up the EarlyCSE runner for a particular function.
+ /// Set up the EarlyCSE runner for a particular function.
EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC, MemorySSA *MSSA)
@@ -401,15 +423,16 @@ private:
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
};
@@ -420,10 +443,13 @@ private:
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end)
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+ DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
{}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
@@ -455,7 +481,7 @@ private:
bool Processed = false;
};
- /// \brief Wrapper class to handle memory instructions, including loads,
+ /// Wrapper class to handle memory instructions, including loads,
/// stores and intrinsic loads and stores defined by the target.
class ParseMemoryInst {
public:
@@ -532,12 +558,7 @@ private:
Value *getPointerOperand() const {
if (IsTargetMemInst) return Info.PtrVal;
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->getPointerOperand();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->getPointerOperand();
- }
- return nullptr;
+ return getLoadStorePointerOperand(Inst);
}
bool mayReadFromMemory() const {
@@ -558,6 +579,9 @@ private:
bool processNode(DomTreeNode *Node);
+ bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ const BasicBlock *BB, const BasicBlock *Pred);
+
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
@@ -568,6 +592,10 @@ private:
ExpectedType);
}
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
@@ -661,6 +689,79 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
return MSSA->dominates(LaterDef, EarlierMA);
}
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "target" intrinsic forms of loads aren't currently known to
+    // MemoryLocation::get. TODO: add support for target memory intrinsics.
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+  // Did this location become invariant at or before the generation we are
+  // asking about?
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
+bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
+ const BranchInst *BI, const BasicBlock *BB,
+ const BasicBlock *Pred) {
+ assert(BI->isConditional() && "Should be a conditional branch!");
+ assert(BI->getCondition() == CondInst && "Wrong condition?");
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ auto MatchBinOp = [](Instruction *I, unsigned Opcode) {
+ if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(I))
+ return BOp->getOpcode() == Opcode;
+ return false;
+ };
+ // If the condition is AND operation, we can propagate its operands into the
+ // true branch. If it is OR operation, we can propagate them into the false
+ // branch.
+ unsigned PropagateOpcode =
+ (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
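+  // E.g. given (hypothetical IR):
+  //   %c = and i1 %a, %b
+  //   br i1 %c, label %bb, label %other
+  // on the edge into %bb both %a and %b (and the operands of any nested
+  // 'and') must be true, so they can be replaced with 'true' there; dually,
+  // the operands of an 'or' feeding the false edge are all false.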
+
+ bool MadeChanges = false;
+ SmallVector<Instruction *, 4> WorkList;
+ SmallPtrSet<Instruction *, 4> Visited;
+ WorkList.push_back(CondInst);
+ while (!WorkList.empty()) {
+ Instruction *Curr = WorkList.pop_back_val();
+
+ AvailableValues.insert(Curr, TorF);
+ LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << Curr->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
+ BasicBlockEdge(Pred, BB))) {
+ NumCSECVP += Count;
+ MadeChanges = true;
+ }
+ }
+
+ if (MatchBinOp(Curr, PropagateOpcode))
+ for (auto &Op : cast<BinaryOperator>(Curr)->operands())
+ if (Instruction *OPI = dyn_cast<Instruction>(Op))
+ if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
+ WorkList.push_back(OPI);
+ }
+
+ return MadeChanges;
+}
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
@@ -684,22 +785,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
if (BI && BI->isConditional()) {
auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
- if (CondInst && SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *TorF = (BI->getSuccessor(0) == BB)
- ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, TorF);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *TorF << " in "
- << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count = replaceDominatedUsesWith(
- CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP += Count;
- }
- }
+ if (CondInst && SimpleValue::canHandle(CondInst))
+ Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
}
}
@@ -716,7 +803,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Dead instructions should just be removed.
if (isInstructionTriviallyDead(Inst, &TLI)) {
- DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ salvageDebugInfo(*Inst);
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -732,31 +824,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
if (CondI && SimpleValue::canHandle(CondI)) {
- DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst
+ << '\n');
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
} else
- DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
continue;
}
// Skip sideeffect intrinsics, for the same reason as assume intrinsics.
if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
continue;
}
- // Skip invariant.start intrinsics since they only read memory, and we can
- // forward values across it. Also, we dont need to consume the last store
- // since the semantics of invariant.start allow us to perform DSE of the
- // last store, if there was a store following invariant.start. Consider:
+ // We can skip all invariant.start intrinsics since they only read memory,
+ // and we can forward values across it. For invariant starts without
+ // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which is true for all future
+    // generations. Also, we don't need to consume the last store since the
+ // semantics of invariant.start allow us to perform DSE of the last
+ // store, if there was a store following invariant.start. Consider:
//
// store 30, i8* p
// invariant.start(p)
// store 40, i8* p
// We can DSE the store to 30, since the store 40 to invariant location p
// causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+      // If the result has uses, a matching invariant.end may consume it and
+      // end the scope, so we cannot assume invariance holds forever.
+ if (!Inst->use_empty())
+ continue;
+ auto *CI = cast<CallInst>(Inst);
+ MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+      // Don't start a scope if we already have a better (earlier) one pushed.
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
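+      // From here on, e.g. (hypothetical IR):
+      //   call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p) ; result unused
+      //   %v1 = load i8, i8* %p
+      //   call void @clobber()
+      //   %v2 = load i8, i8* %p
+      // %v2 can be CSE'd to %v1 despite the generation bump at @clobber(),
+      // because %p can never legally be written again.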
continue;
+ }
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
@@ -767,7 +872,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Is the condition known to be true?
if (isa<ConstantInt>(KnownCond) &&
cast<ConstantInt>(KnownCond)->isOne()) {
- DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs()
+ << "EarlyCSE removing guard: " << *Inst << '\n');
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -792,29 +898,39 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
if (Value *V = SimplifyInstruction(Inst, SQ)) {
- DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
- bool Killed = false;
- if (!Inst->use_empty()) {
- Inst->replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(Inst, &TLI)) {
- removeMSSA(Inst);
- Inst->eraseFromParent();
- Changed = true;
- Killed = true;
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ bool Killed = false;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
}
- if (Changed)
- ++NumSimplify;
- if (Killed)
- continue;
}
// If this is a simple instruction that we can value number, process it.
if (SimpleValue::canHandle(Inst)) {
// See if the instruction has an available value. If so, use it.
if (Value *V = AvailableValues.lookup(Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (auto *I = dyn_cast<Instruction>(V))
I->andIRFlags(Inst);
Inst->replaceAllUsesWith(V);
@@ -840,6 +956,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++CurrentGeneration;
}
+ if (MemInst.isInvariantLoad()) {
+ // If we pass an invariant load, we know that memory location is
+ // indefinitely constant from the moment of first dereferenceability.
+ // We conservatively treat the invariant_load as that moment. If we
+      // pass an invariant load after already establishing a scope, don't
+ // restart it since we want to preserve the earliest point seen.
+ auto MemLoc = MemoryLocation::get(Inst);
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ }
+
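+      // E.g. a load of %p tagged !invariant.load at generation 5 opens a
+      // scope; any later load of %p can then reuse an available value
+      // recorded at generation >= 5 via isOperatingOnInvariantMemAt().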
// If we have an available version of this load, and if it is the right
// generation or the load is known to be from an invariant location,
// replace this instruction.
@@ -854,13 +981,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
if (Op != nullptr) {
- DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.DefInst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
removeMSSA(Inst);
@@ -875,7 +1006,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), MemInst.isInvariantLoad()));
+ MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
@@ -898,8 +1029,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (InVal.first != nullptr &&
isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
- << " to: " << *InVal.first << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
removeMSSA(Inst);
@@ -938,8 +1073,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst)) {
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
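+        // The invariant case covers a store that writes back a value just
+        // loaded from an invariant location: the location still holds that
+        // value regardless of intervening generation bumps, so the store is
+        // a no-op.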
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
@@ -949,7 +1085,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
MemInst.getPointerOperand() ||
MSSA) &&
"can't have an intervening store if not using MemorySSA!");
- DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -980,13 +1120,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!LastStoreMemInst.isVolatile() &&
"Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
- DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << *Inst << '\n');
- removeMSSA(LastStore);
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
+ LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ removeMSSA(LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
}
// fallthrough - we can exploit information about this store
}
@@ -999,7 +1143,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), /*IsInvariant=*/false));
+ MemInst.isAtomic()));
// Remember that this was the last unordered store we saw for DSE. We
// don't yet handle DSE on ordered or volatile stores since we don't
@@ -1031,8 +1175,9 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
- DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
@@ -1056,9 +1201,9 @@ bool EarlyCSE::run() {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- NodeToProcess->childGeneration(), child, child->begin(),
- child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
@@ -1097,7 +1242,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 063df779a30b..117b19fb8a42 100644
--- a/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "flattencfg"
diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp
index b105ece8dc7c..f2828e80bc58 100644
--- a/lib/Transforms/Scalar/Float2Int.cpp
+++ b/lib/Transforms/Scalar/Float2Int.cpp
@@ -138,7 +138,7 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
// Helper - mark I as having been traversed, having range R.
void Float2IntPass::seen(Instruction *I, ConstantRange R) {
- DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
auto IT = SeenInsts.find(I);
if (IT != SeenInsts.end())
IT->second = std::move(R);
@@ -359,7 +359,7 @@ bool Float2IntPass::validateAndTransform() {
for (User *U : I->users()) {
Instruction *UI = dyn_cast<Instruction>(U);
if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
- DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
Fail = true;
break;
}
@@ -380,7 +380,7 @@ bool Float2IntPass::validateAndTransform() {
// lower limits, plus one so it can be signed.
unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
R.getUpper().getMinSignedBits()) + 1;
- DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
// If we've run off the realms of the exactly representable integers,
// the floating point result will differ from an integer approximation.
@@ -391,11 +391,12 @@ bool Float2IntPass::validateAndTransform() {
unsigned MaxRepresentableBits
= APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
if (MinBW > MaxRepresentableBits) {
- DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
continue;
}
if (MinBW > 64) {
- DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ LLVM_DEBUG(
+ dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
continue;
}
@@ -490,7 +491,7 @@ void Float2IntPass::cleanup() {
}
bool Float2IntPass::runImpl(Function &F) {
- DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
// Clear out all state.
ECs = EquivalenceClasses<Instruction*>();
SeenInsts.clear();
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index e2c1eaf58e43..1e0a22cb14b3 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -38,7 +38,9 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -69,7 +71,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -765,6 +766,15 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (SSAUpdate.HasValueForBlock(BB))
continue;
+  // If the value is the load that we will be eliminating, and the block it's
+  // available in is the block that the load is in, then don't add it:
+  // SSAUpdater will resolve the value to the relevant phi, which may let it
+  // avoid phi construction entirely if there's actually only one value.
+ if (BB == LI->getParent() &&
+ ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
+ (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
+ continue;
+
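+  // E.g. when the only other available value flows in from a single
+  // predecessor, skipping LI's own block lets SSAUpdater hand back that
+  // value directly instead of materializing a one-input PHI.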
SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
}
@@ -783,9 +793,10 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Res->getType() != LoadTy) {
Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
- << *getSimpleValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
+ << " " << *getSimpleValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isCoercedLoadValue()) {
LoadInst *Load = getCoercedLoadValue();
@@ -799,20 +810,21 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
// but then there all of the operations based on it would need to be
// rehashed. Just leave the dead load around.
gvn.getMemDep().removeInstruction(Load);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
- << *getCoercedLoadValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
+ << " " << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
- << " " << *getMemIntrinValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
} else {
assert(isUndefValue() && "Should be UndefVal");
- DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
return UndefValue::get(LoadTy);
}
assert(Res && "failed to materialize?");
@@ -825,7 +837,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
return false;
}
-/// \brief Try to locate the three instruction involved in a missed
+/// Try to locate the three instruction involved in a missed
/// load-elimination case that is due to an intervening store.
static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
DominatorTree *DT,
@@ -914,13 +926,11 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
}
// Nothing known about this clobber, have to be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- Instruction *I = DepInfo.getInst();
- dbgs() << " is clobbered by " << *I << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';);
if (ORE->allowExtraAnalysis(DEBUG_TYPE))
reportMayClobberedLoad(LI, DepInfo, DT, ORE);
@@ -978,12 +988,10 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
// Unknown def - must be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown def " << *DepInst << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';);
return false;
}
@@ -1065,7 +1073,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// It is illegal to move the array access to any point above the guard,
// because if the index is out of bounds we should deoptimize rather than
// access the array.
- // Check that there is no guard in this block above our intruction.
+ // Check that there is no guard in this block above our instruction.
if (!IsSafeToSpeculativelyExecute) {
auto It = FirstImplicitControlFlowInsts.find(TmpBB);
if (It != FirstImplicitControlFlowInsts.end()) {
@@ -1113,9 +1121,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If any predecessor block is an EH pad that does not allow non-PHI
// instructions before the terminator, we can't PRE the load.
if (Pred->getTerminator()->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1125,15 +1133,16 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (Pred->getTerminator()->getNumSuccessors() != 1) {
if (isa<IndirectBrInst>(Pred->getTerminator())) {
- DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
if (LoadBB->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1161,8 +1170,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
PredLoads[NewPred] = nullptr;
- DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
- << LoadBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
}
// Check if the load can safely be moved to all the unavailable predecessors.
@@ -1186,8 +1195,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If we couldn't find or insert a computation of this phi translated value,
// we fail PRE.
if (!LoadPtr) {
- DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
- << *LI->getPointerOperand() << "\n");
+ LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
CanDoPRE = false;
break;
}
@@ -1208,10 +1217,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Okay, we can eliminate this load by inserting a reload in the predecessor
// and using PHI construction to get the value in the other predecessors, do
// it.
- DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
- DEBUG(if (!NewInsts.empty())
- dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
- << *NewInsts.back() << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
+ << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
+ << '\n');
// Assign value numbers to the new instructions.
for (Instruction *I : NewInsts) {
@@ -1262,7 +1271,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
NewLoad));
MD->invalidateCachedPointerInfo(LoadPtr);
- DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
}
// Perform PHI construction.
@@ -1320,11 +1329,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// clobber in the current block. Reject this early.
if (NumDeps == 1 &&
!Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
- DEBUG(
- dbgs() << "GVN: non-local load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown dependencies\n";
- );
+ LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";);
return false;
}
@@ -1353,7 +1359,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// load, then it is fully redundant and we can use PHI insertion to compute
// its value. Insert PHIs and remove the fully redundant value now.
if (UnavailableBlocks.empty()) {
- DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
// Perform PHI construction.
Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
@@ -1506,12 +1512,10 @@ bool GVN::processLoad(LoadInst *L) {
// Only handle the local case below
if (!Dep.isDef() && !Dep.isClobber()) {
// This might be a NonFuncLocal or an Unknown
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- L->printAsOperand(dbgs());
- dbgs() << " has unknown dependence\n";
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";);
return false;
}
@@ -1695,8 +1699,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
if (it != ReplaceWithConstMap.end()) {
assert(!isa<Constant>(Operand) &&
"Replacing constants with constants is invalid");
- DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
- << " in instruction " << *Instr << '\n');
+ LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
+ << *it->second << " in instruction " << *Instr << '\n');
Instr->setOperand(OpNum, it->second);
Changed = true;
}
@@ -2038,7 +2042,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
unsigned Iteration = 0;
while (ShouldContinue) {
- DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
ShouldContinue = iterateOnFunction(F);
Changed |= ShouldContinue;
++Iteration;
@@ -2104,9 +2108,10 @@ bool GVN::processBlock(BasicBlock *BB) {
const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB);
for (auto *I : InstrsToErase) {
assert(I->getParent() == BB && "Removing instruction from wrong block?");
- DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageDebugInfo(*I);
if (MD) MD->removeInstruction(I);
- DEBUG(verifyRemoved(I));
+ LLVM_DEBUG(verifyRemoved(I));
if (MaybeFirstICF == I) {
// We have erased the first ICF in block. The map needs to be updated.
InvalidateImplicitCF = true;
@@ -2288,7 +2293,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
PREInstr = CurInst->clone();
if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
- DEBUG(verifyRemoved(PREInstr));
+ LLVM_DEBUG(verifyRemoved(PREInstr));
PREInstr->deleteValue();
return false;
}
@@ -2326,10 +2331,10 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
VN.erase(CurInst);
removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
- DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
if (MD)
MD->removeInstruction(CurInst);
- DEBUG(verifyRemoved(CurInst));
+ LLVM_DEBUG(verifyRemoved(CurInst));
bool InvalidateImplicitCF =
FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst;
// FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index c0cd1ea74a74..6d2b25cf6013 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -72,7 +73,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -534,7 +534,7 @@ private:
if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
- if (firstInBB(NewPt, UD->getMemoryInst()))
+ if (!firstInBB(UD->getMemoryInst(), NewPt))
// Cannot move the load or store to NewPt above its definition in D.
return false;
@@ -570,7 +570,7 @@ private:
// The idea is inspired by:
// "Partial Redundancy Elimination in SSA Form"
// ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
- // They use similar idea in the forward graph to to find fully redundant and
+  // They use a similar idea in the forward graph to find fully redundant and
// partially redundant expressions, here it is used in the inverse graph to
// find fully anticipable instructions at merge point (post-dominator in
// the inverse CFG).
@@ -578,7 +578,7 @@ private:
// Returns true when the values are flowing out to each edge.
bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
- if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+ if (TI->getNumSuccessors() > (unsigned)size(C))
return false; // Not enough args in this CHI.
for (auto CHI : C) {
@@ -622,7 +622,7 @@ private:
// Iterate in reverse order to keep lower ranked values on the top.
for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
// Get the value of instruction I
- DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
RenameStack[VI.first].push_back(VI.second);
}
}
@@ -636,7 +636,7 @@ private:
if (P == CHIBBs.end()) {
continue;
}
- DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
+ LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
// A CHI is found (BB -> Pred is an edge in the CFG)
// Pop the stack until Top(V) = Ve.
auto &VCHI = P->second;
@@ -648,12 +648,12 @@ private:
// track in a CHI. In the PDom walk, there can be values in the
// stack which are not control dependent e.g., nested loop.
if (si != RenameStack.end() && si->second.size() &&
- DT->dominates(Pred, si->second.back()->getParent())) {
+ DT->properlyDominates(Pred, si->second.back()->getParent())) {
C.Dest = BB; // Assign the edge
C.I = si->second.pop_back_val(); // Assign the argument
- DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
- << *C.I << ", VN: " << C.VN.first << ", "
- << C.VN.second);
+ LLVM_DEBUG(dbgs()
+ << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I
+ << ", VN: " << C.VN.first << ", " << C.VN.second);
}
// Move to next CHI of a different value
It = std::find_if(It, VCHI.end(),
@@ -748,11 +748,11 @@ private:
// TODO: Remove fully-redundant expressions.
// Get instruction from the Map, assume that all the Instructions
// with same VNs have same rank (this is an approximation).
- std::sort(Ranks.begin(), Ranks.end(),
- [this, &Map](const VNType &r1, const VNType &r2) {
- return (rank(*Map.lookup(r1).begin()) <
- rank(*Map.lookup(r2).begin()));
- });
+ llvm::sort(Ranks.begin(), Ranks.end(),
+ [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) <
+ rank(*Map.lookup(r2).begin()));
+ });
// - Sort VNs according to their rank, and start with lowest ranked VN
// - Take a VN and for each instruction with same VN
@@ -798,8 +798,8 @@ private:
// Ignore spurious PDFs.
if (DT->properlyDominates(IDFB, V[i]->getParent())) {
OutValue[IDFB].push_back(C);
- DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
- << ", for Insn: " << *V[i]);
+ LLVM_DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
+ << ", for Insn: " << *V[i]);
}
}
}
@@ -1200,6 +1200,7 @@ INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
"Early GVN Hoisting of Expressions", false, false)
diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp
index bf92e43c4715..28c5940db1e0 100644
--- a/lib/Transforms/Scalar/GVNSink.cpp
+++ b/lib/Transforms/Scalar/GVNSink.cpp
@@ -48,6 +48,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -71,7 +72,6 @@
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -239,7 +239,7 @@ public:
SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
- std::sort(Ops.begin(), Ops.end());
+ llvm::sort(Ops.begin(), Ops.end());
for (auto &P : Ops) {
Blocks.push_back(P.first);
Values.push_back(P.second);
@@ -361,7 +361,7 @@ public:
for (auto &U : I->uses())
op_push_back(U.getUser());
- std::sort(op_begin(), op_end());
+ llvm::sort(op_begin(), op_end());
}
void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
@@ -561,7 +561,8 @@ public:
GVNSink() = default;
bool run(Function &F) {
- DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
+ << "\n");
unsigned NumSunk = 0;
ReversePostOrderTraversal<Function*> RPOT(&F);
@@ -592,12 +593,8 @@ private:
/// Create a ModelledPHI for each PHI in BB, adding to PHIs.
void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
SmallPtrSetImpl<Value *> &PHIContents) {
- for (auto &I : *BB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- return;
-
- auto MPHI = ModelledPHI(PN);
+ for (PHINode &PN : BB->phis()) {
+ auto MPHI = ModelledPHI(&PN);
PHIs.insert(MPHI);
for (auto *V : MPHI.getValues())
PHIContents.insert(V);
@@ -633,15 +630,15 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
auto Insts = *LRI;
- DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
- : Insts) {
+ LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
I->dump();
} dbgs() << " ]\n";);
DenseMap<uint32_t, unsigned> VNums;
for (auto *I : Insts) {
uint32_t N = VN.lookupOrAdd(I);
- DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
if (N == ~0U)
return None;
VNums[N]++;
@@ -753,8 +750,8 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
}
unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
- DEBUG(dbgs() << "GVNSink: running on basic block ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
SmallVector<BasicBlock *, 4> Preds;
for (auto *B : predecessors(BBEnd)) {
auto *T = B->getTerminator();
@@ -765,7 +762,7 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
}
if (Preds.size() < 2)
return 0;
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
unsigned NumOrigPreds = Preds.size();
// We can only sink instructions through unconditional branches.
@@ -798,23 +795,23 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
Candidates.begin(), Candidates.end(),
[](const SinkingInstructionCandidate &A,
const SinkingInstructionCandidate &B) { return A > B; });
- DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
- : Candidates) dbgs()
- << " " << C << "\n";);
+ LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
// Pick the top candidate, as long it is positive!
if (Candidates.empty() || Candidates.front().Cost <= 0)
return 0;
auto C = Candidates.front();
- DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
BasicBlock *InsertBB = BBEnd;
if (C.Blocks.size() < NumOrigPreds) {
- DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
if (!InsertBB) {
- DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
// Edge couldn't be split.
return 0;
}
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
index c4aeccb85ca7..ad1598d7b8bf 100644
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -40,9 +40,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include <functional>
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
@@ -53,6 +55,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -62,9 +65,14 @@ namespace {
class GuardWideningImpl {
DominatorTree &DT;
- PostDominatorTree &PDT;
+ PostDominatorTree *PDT;
LoopInfo &LI;
+ /// Together, these describe the region of interest. This might be all of
+ /// the blocks within a function, or only a given loop's blocks and preheader.
+ DomTreeNode *Root;
+ std::function<bool(BasicBlock*)> BlockFilter;
+
/// The set of guards whose conditions have been widened into dominating
/// guards.
SmallVector<IntrinsicInst *, 16> EliminatedGuards;
@@ -205,39 +213,15 @@ class GuardWideningImpl {
}
public:
- explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
- LoopInfo &LI)
- : DT(DT), PDT(PDT), LI(LI) {}
+
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
+ LoopInfo &LI, DomTreeNode *Root,
+ std::function<bool(BasicBlock*)> BlockFilter)
+ : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {}
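+  // E.g. a whole-function run would pass Root = DT.getRootNode() and a
+  // BlockFilter accepting every block, while a loop-scoped run would
+  // restrict both to one loop's blocks and preheader (illustrative; the
+  // actual call sites live elsewhere in this patch).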
/// The entry point for this pass.
bool run();
};
-
-struct GuardWideningLegacyPass : public FunctionPass {
- static char ID;
- GuardWideningPass Impl;
-
- GuardWideningLegacyPass() : FunctionPass(ID) {
- initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- return GuardWideningImpl(
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
- getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- }
-};
-
}
bool GuardWideningImpl::run() {
@@ -246,9 +230,12 @@ bool GuardWideningImpl::run() {
DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
bool Changed = false;
- for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ for (auto DFI = df_begin(Root), DFE = df_end(Root);
DFI != DFE; ++DFI) {
auto *BB = (*DFI)->getBlock();
+ if (!BlockFilter(BB))
+ continue;
+
auto &CurrentList = GuardsInBlock[BB];
for (auto &I : *BB)
@@ -259,6 +246,7 @@ bool GuardWideningImpl::run() {
Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
}
+ assert(EliminatedGuards.empty() || Changed);
for (auto *II : EliminatedGuards)
if (!WidenedGuards.count(II))
II->eraseFromParent();
@@ -278,6 +266,8 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
// for the most profit.
for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
auto *CurBB = DFSI.getPath(i)->getBlock();
+ if (!BlockFilter(CurBB))
+ break;
auto *CurLoop = LI.getLoopFor(CurBB);
assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
@@ -312,9 +302,9 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
for (auto *Candidate : make_range(I, E)) {
auto Score =
computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
- DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
- << " and " << *Candidate->getArgOperand(0) << " is "
- << scoreTypeToString(Score) << "\n");
+ LLVM_DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
if (Score > BestScoreSoFar) {
BestScoreSoFar = Score;
BestSoFar = Candidate;
@@ -323,15 +313,16 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
}
if (BestScoreSoFar == WS_IllegalOrNegative) {
- DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
return false;
}
assert(BestSoFar != GuardInst && "Should have never visited same guard!");
assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
- DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
- << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ LLVM_DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar)
+ << "\n");
widenGuard(BestSoFar, GuardInst->getArgOperand(0));
GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
EliminatedGuards.push_back(GuardInst);
@@ -345,6 +336,8 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
bool HoistingOutOfLoop = false;
if (DominatingGuardLoop != DominatedGuardLoop) {
+ // Be conservative and don't widen into a sibling loop. TODO: If the
+ // sibling is colder, we should consider allowing this.
if (DominatingGuardLoop &&
!DominatingGuardLoop->contains(DominatedGuardLoop))
return WS_IllegalOrNegative;
@@ -355,9 +348,14 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
return WS_IllegalOrNegative;
- bool HoistingOutOfIf =
- !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
-
+ // If the guard was conditionally executed, it may never be reached
+ // dynamically. There are two potential downsides to hoisting it out of the
+ // conditionally executed region: 1) we may spuriously deopt when it was
+ // never needed, and 2) we pay the extra cost of computing the guard
+ // condition in the common case. At the moment, our heuristic here only
+ // considers the second. TODO: evaluate a cost model for spurious deopts.
+ // NOTE: As written, this also lets us hoist right over another guard, which
+ // is essentially just another spelling of control flow.
if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
DominatingGuard->getArgOperand(0)))
return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
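
A hypothetical illustration of the profitable case: if the dominating guard checks i u< 20 and the dominated guard checks i u< 10, their conjunction simplifies back to i u< 10, so widening replaces two guard evaluations with one at no extra cost. isWideningCondProfitable is what detects that the combined condition folds.
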
@@ -365,7 +363,26 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (HoistingOutOfLoop)
return WS_Positive;
- return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+ // Returns true if we might be hoisting above explicit control flow. Note
+ // that this completely ignores implicit control flow (guards, calls which
+ // throw, etc.). That choice appears arbitrary.
+ auto MaybeHoistingOutOfIf = [&]() {
+ auto *DominatingBlock = DominatingGuard->getParent();
+ auto *DominatedBlock = DominatedGuard->getParent();
+
+ // Same Block?
+ if (DominatedBlock == DominatingBlock)
+ return false;
+ // Obvious successor (common loop header/preheader case)
+ if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
+ return false;
+ // TODO: diamond, triangle cases
+ if (!PDT) return true;
+ return !PDT->dominates(DominatedGuard->getParent(),
+ DominatingGuard->getParent());
+ };
+
+ return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
}
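
The common case the two early returns cover is a guard in a loop preheader plus a guard in the header: the header is the preheader's unique successor, so widening across that edge is recognized as not hoisting over any branch even when no PostDominatorTree is available. When PDT is absent (the loop-pass variant below may run without it), the lambda conservatively answers true for everything else.
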
bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
@@ -581,9 +598,9 @@ bool GuardWideningImpl::combineRangeChecks(
// CurrentChecks.size() will typically be 3 here, but so far there has been
// no need to hard-code that fact.
- std::sort(CurrentChecks.begin(), CurrentChecks.end(),
- [&](const GuardWideningImpl::RangeCheck &LHS,
- const GuardWideningImpl::RangeCheck &RHS) {
+ llvm::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
return LHS.getOffsetValue().slt(RHS.getOffsetValue());
});
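
llvm::sort is a thin wrapper over std::sort; in EXPENSIVE_CHECKS builds it shuffles the range first, so comparator ties that would produce non-deterministic output get exposed instead of silently depending on input order.
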
@@ -651,19 +668,6 @@ bool GuardWideningImpl::combineRangeChecks(
return RangeChecksOut.size() != OldCount;
}
-PreservedAnalyses GuardWideningPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, PDT, LI).run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
#ifndef NDEBUG
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
switch (WS) {
@@ -681,7 +685,82 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
}
#endif
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+/// Same as above, but restricted to a single loop at a time. Can be
+/// scheduled with other loop passes without breaking out of LPM.
+struct LoopGuardWideningLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopGuardWideningLegacyPass() : LoopPass(ID) {
+ initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ BasicBlock *RootBB = L->getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L->getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L->contains(BB);
+ };
+ return GuardWideningImpl(DT, PDT, LI,
+ DT.getNode(RootBB), BlockFilter).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+};
+}
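
Note on the new loop-pass flavor: it re-roots the depth-first walk at the loop predecessor (falling back to the header), and its BlockFilter accepts exactly RootBB plus the blocks of L, so the run() loop above skips every other dominator-tree node and the per-guard hoisting walk stops at the first block outside the filter. PostDominatorTree is consulted only if some earlier pass already computed it.
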
+
char GuardWideningLegacyPass::ID = 0;
+char LoopGuardWideningLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
@@ -691,6 +770,20 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
+INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+
FunctionPass *llvm::createGuardWideningPass() {
return new GuardWideningLegacyPass();
}
+
+Pass *llvm::createLoopGuardWideningPass() {
+ return new LoopGuardWideningLegacyPass();
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 74d6014d3e3d..8656e88b79cb 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -43,6 +43,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
@@ -77,7 +78,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
@@ -210,8 +210,8 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
if (FromBase == ToBase)
return true;
- DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " << *FromBase
+ << " != " << *ToBase << "\n");
return false;
}
@@ -485,9 +485,8 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
BasicBlock *Header = L->getHeader();
SmallVector<WeakTrackingVH, 8> PHIs;
- for (BasicBlock::iterator I = Header->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PHIs.push_back(PN);
+ for (PHINode &PN : Header->phis())
+ PHIs.push_back(&PN);
for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
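
BasicBlock::phis() yields the block's leading PHI nodes as a range, replacing the older idiom of iterating instructions and dyn_cast'ing until the first non-PHI; the same mechanical conversion appears in several hunks below and in InductiveRangeCheckElimination.cpp.
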
@@ -654,8 +653,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
Value *ExitVal =
expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
- DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
- << " LoopVal = " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+ << '\n'
+ << " LoopVal = " << *Inst << "\n");
if (!isValidRewrite(Inst, ExitVal)) {
DeadInsts.push_back(ExitVal);
@@ -724,13 +724,12 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
assert(LoopHeader && "Invalid loop");
for (auto *ExitBB : ExitBlocks) {
- BasicBlock::iterator BBI = ExitBB->begin();
// If there are no more PHI nodes in this exit block, then no more
// values defined inside the loop are used on this path.
- while (auto *PN = dyn_cast<PHINode>(BBI++)) {
- for (unsigned IncomingValIdx = 0, E = PN->getNumIncomingValues();
- IncomingValIdx != E; ++IncomingValIdx) {
- auto *IncomingBB = PN->getIncomingBlock(IncomingValIdx);
+ for (PHINode &PN : ExitBB->phis()) {
+ for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
// We currently only support loop exits from loop header. If the
// incoming block is not loop header, we need to recursively check
@@ -755,8 +754,7 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
if (!L->isLoopInvariant(Cond))
continue;
- auto *ExitVal =
- dyn_cast<PHINode>(PN->getIncomingValue(IncomingValIdx));
+ auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
// Only deal with PHIs.
if (!ExitVal)
@@ -771,8 +769,8 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
if (PreheaderIdx != -1) {
assert(ExitVal->getParent() == LoopHeader &&
"ExitVal must be in loop header");
- PN->setIncomingValue(IncomingValIdx,
- ExitVal->getIncomingValue(PreheaderIdx));
+ PN.setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
}
}
}
@@ -1087,7 +1085,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
// Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
// about the narrow operand yet so must insert a [sz]ext. It is probably loop
@@ -1118,7 +1116,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
@@ -1318,8 +1316,8 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
/// This IV user cannot be widened. Replace this use of the original narrow IV
/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
- DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
- << " for user " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user "
+ << *DU.NarrowUse << "\n");
IRBuilder<> Builder(
getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
@@ -1399,8 +1397,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
UsePhi->replaceAllUsesWith(Trunc);
DeadInsts.emplace_back(UsePhi);
- DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
- << " to " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to "
+ << *WidePhi << "\n");
}
return nullptr;
}
@@ -1431,15 +1429,16 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// A wider extend was hidden behind a narrower one. This may induce
// another round of IV widening in which the intermediate IV becomes
// dead. It should be very rare.
- DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
- << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse
+ << "\n");
DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
NewDef = DU.NarrowUse;
}
}
if (NewDef != DU.NarrowUse) {
- DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
- << " replaced by " << *DU.WideDef << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
++NumElimExt;
DU.NarrowUse->replaceAllUsesWith(NewDef);
DeadInsts.emplace_back(DU.NarrowUse);
@@ -1494,8 +1493,9 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// absolutely guarantee it. Hence the following failsafe check. In rare cases
// where it fails, we simply throw away the newly created wide use.
if (WideAddRec.first != SE->getSCEV(WideUse)) {
- DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
- << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
+ LLVM_DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": "
+ << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first
+ << "\n");
DeadInsts.emplace_back(WideUse);
return nullptr;
}
@@ -1600,7 +1600,7 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
WideInc->setDebugLoc(OrigInc->getDebugLoc());
}
- DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
++NumWidened;
// Traverse the def-use chain using a worklist starting at the original IV.
@@ -2234,12 +2234,12 @@ linearFunctionTestReplace(Loop *L,
else
P = ICmpInst::ICMP_EQ;
- DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
- << " LHS:" << *CmpIndVar << '\n'
- << " op:\t"
- << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
- << " RHS:\t" << *ExitCnt << "\n"
- << " IVCount:\t" << *IVCount << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
+ << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
IRBuilder<> Builder(BI);
@@ -2275,7 +2275,7 @@ linearFunctionTestReplace(Loop *L,
NewLimit = Start + Count;
ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
- DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ LLVM_DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
} else {
// We try to extend trip count first. If that doesn't work we truncate IV.
// Zext(trunc(IV)) == IV implies equivalence of the following two:
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 5c4d55bfbb2b..e2f29705f2dd 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -43,6 +43,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
@@ -52,6 +53,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -179,10 +181,7 @@ public:
OS << " Step: ";
Step->print(OS);
OS << " End: ";
- if (End)
- End->print(OS);
- else
- OS << "(null)";
+ End->print(OS);
OS << "\n CheckUse: ";
getCheckUse()->getUser()->print(OS);
OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
@@ -196,7 +195,7 @@ public:
Use *getCheckUse() const { return CheckUse; }
/// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
- /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+ /// R.getEnd() le R.getBegin(), then R denotes the empty range.
class Range {
const SCEV *Begin;
@@ -238,17 +237,31 @@ public:
/// checks, and hence don't end up in \p Checks.
static void
extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
+ BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks);
};
-class InductiveRangeCheckElimination : public LoopPass {
+class InductiveRangeCheckElimination {
+ ScalarEvolution &SE;
+ BranchProbabilityInfo *BPI;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
+public:
+ InductiveRangeCheckElimination(ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI, DominatorTree &DT,
+ LoopInfo &LI)
+ : SE(SE), BPI(BPI), DT(DT), LI(LI) {}
+
+ bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
+};
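
The restructuring here follows the usual recipe for porting a pass to the new pass manager: the transform itself moves into a plain class (run() plus the analyses handed to the constructor), while IRCELegacyPass below and the new-PM IRCEPass provide the glue. The LPMAddNewLoop callback abstracts the one pass-manager-specific side effect, namely reporting cloned pre/post loops back to the driving pass manager.
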
+
+class IRCELegacyPass : public LoopPass {
public:
static char ID;
- InductiveRangeCheckElimination() : LoopPass(ID) {
- initializeInductiveRangeCheckEliminationPass(
- *PassRegistry::getPassRegistry());
+ IRCELegacyPass() : LoopPass(ID) {
+ initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -261,14 +274,14 @@ public:
} // end anonymous namespace
-char InductiveRangeCheckElimination::ID = 0;
+char IRCELegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
"Inductive range check elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
- "Inductive range check elimination", false, false)
+INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
+ false, false)
StringRef InductiveRangeCheck::rangeCheckKindToStr(
InductiveRangeCheck::RangeCheckKind RCK) {
@@ -299,13 +312,8 @@ InductiveRangeCheck::RangeCheckKind
InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
Value *&Length, bool &IsSigned) {
- auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
- const SCEV *S = SE.getSCEV(V);
- if (isa<SCEVCouldNotCompute>(S))
- return false;
-
- return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
- SE.isKnownNonNegative(S);
+ auto IsLoopInvariant = [&SE, L](Value *V) {
+ return SE.isLoopInvariant(SE.getSCEV(V), L);
};
ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -337,7 +345,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
return RANGE_CHECK_LOWER;
}
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_UPPER;
@@ -349,7 +357,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
IsSigned = false;
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_BOTH;
@@ -394,8 +402,23 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
if (!IsAffineIndex)
return;
+ const SCEV *End = nullptr;
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Length)
+ End = SE.getSCEV(Length);
+ else {
+ assert(RCKind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ // So far we can only reach this point for a signed range check. This may
+ // change in the future; in that case we will need to pick the unsigned max
+ // for an unsigned range check.
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ End = SIntMax;
+ }
+
InductiveRangeCheck IRC;
- IRC.End = Length ? SE.getSCEV(Length) : nullptr;
+ IRC.End = End;
IRC.Begin = IndexAddRec->getStart();
IRC.Step = IndexAddRec->getStepRecurrence(SE);
IRC.CheckUse = &ConditionUse;
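
Concretely, for a lower-bound-only check (Length is null, e.g. a branch on i >= 0), End is now materialized as SINT_MAX, so the check is modeled as the range [0, SINT_MAX) instead of leaving End null for every consumer to special-case; this is what lets the print() hunk above drop its null check.
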
@@ -405,15 +428,15 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
}
void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks) {
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
return;
BranchProbability LikelyTaken(15, 16);
- if (!SkipProfitabilityChecks &&
- BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ if (!SkipProfitabilityChecks && BPI &&
+ BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
return;
SmallPtrSet<Value *, 8> Visited;
@@ -504,9 +527,8 @@ struct LoopStructure {
}
static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
- BranchProbabilityInfo &BPI,
- Loop &,
- const char *&);
+ BranchProbabilityInfo *BPI,
+ Loop &, const char *&);
};
/// This class is used to constrain loops to run within a given iteration space.
@@ -573,7 +595,7 @@ class LoopConstrainer {
// Create the appropriate loop structure needed to describe a cloned copy of
// `Original`. The clone is described by `VM`.
Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM);
+ ValueToValueMapTy &VM, bool IsSubloop);
// Rewrite the iteration space of the loop denoted by (LS, Preheader). The
// iteration space of the rewritten loop ends at ExitLoopAt. The start of the
@@ -625,8 +647,8 @@ class LoopConstrainer {
LLVMContext &Ctx;
ScalarEvolution &SE;
DominatorTree &DT;
- LPPassManager &LPM;
LoopInfo &LI;
+ function_ref<void(Loop *, bool)> LPMAddNewLoop;
// Information about the original loop we started out with.
Loop &OriginalLoop;
@@ -646,12 +668,13 @@ class LoopConstrainer {
LoopStructure MainLoopStructure;
public:
- LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ LoopConstrainer(Loop &L, LoopInfo &LI,
+ function_ref<void(Loop *, bool)> LPMAddNewLoop,
const LoopStructure &LS, ScalarEvolution &SE,
DominatorTree &DT, InductiveRangeCheck::Range R)
: F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R),
- MainLoopStructure(LS) {}
+ SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
+ Range(R), MainLoopStructure(LS) {}
// Entry point for the algorithm. Returns true on success.
bool run();
@@ -666,56 +689,141 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
PN->setIncomingBlock(i, ReplaceBy);
}
-static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Max = Signed ?
- APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Max) &&
- SE.getUnsignedRange(S).contains(Max);
+static bool CannotBeMaxInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Max));
}
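
For illustration: if the loop is entered only under a test such as n <s 100, isLoopEntryGuardedByCond can prove n <s SINT_MAX, so CannotBeMaxInLoop returns true for n; a bound that flows in unconditionally fails the query and keeps the conservative answer.
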
-static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX.
- assert(SE.isKnownNonNegative(S2) &&
- "We expected the 2nd arg to be non-negative!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
- S1, CapForS1);
+/// Given a loop with a decreasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeDecreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ assert(SE.isKnownNegative(Step) && "expecting negative step");
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 &&
+ "LatchBrExitIdx should be either 0 or 1");
+
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
+
+ const SCEV *MinusOne =
+ SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
}
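
Worked numbers for the LatchBrExitIdx == 0 path, assuming i8 and Step = -4: StepPlusOne = -3 and Limit = -128 - (-3) = -125, so the bound must satisfy BoundSCEV >s -125. Bounds in [-128, -125] are rejected because coercing the latch comparison by one step could wrap past SINT_MIN.
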
-static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Min = Signed ?
- APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Min) &&
- SE.getUnsignedRange(S).contains(Min);
+/// Given a loop with an increasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeIncreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
+
+ const SCEV *StepMinusOne =
+ SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
+
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
+ SE.getAddExpr(BoundSCEV, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
}
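
Symmetric worked numbers, assuming i8 and Step = 4: StepMinusOne = 3 and Limit = 127 - 3 = 124, so the bound must satisfy BoundSCEV <s 124; bounds in [124, 127] are rejected since BoundSCEV + Step - 1 could wrap past SINT_MAX. The check of Start against BoundSCEV + Step is the entry-guard analogue of the inline isLoopEntryGuardedByCond calls that this helper replaces below.
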
-static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN.
- assert(SE.isKnownNonPositive(S2) &&
- "We expected the 2nd arg to be non-positive!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT,
- S1, CapForS1);
+static bool CannotBeMinInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Min));
+}
+
+static bool isKnownNonNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, BoundSCEV, Zero);
+}
+
+static bool isKnownNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, BoundSCEV, Zero);
}
Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
- Loop &L, const char *&FailureReason) {
+ BranchProbabilityInfo *BPI, Loop &L,
+ const char *&FailureReason) {
if (!L.isLoopSimplifyForm()) {
FailureReason = "loop not in LoopSimplify form";
return None;
@@ -750,7 +858,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
BranchProbability ExitProbability =
- BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+ BPI ? BPI->getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx)
+ : BranchProbability::getZero();
if (!SkipProfitabilityChecks &&
ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
@@ -816,43 +925,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
};
- // Here we check whether the suggested AddRec is an induction variable that
- // can be handled (i.e. with known constant step), and if yes, calculate its
- // step and identify whether it is increasing or decreasing.
- auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing,
- ConstantInt *&StepCI) {
- if (!AR->isAffine())
- return false;
-
- // Currently we only work with induction variables that have been proved to
- // not wrap. This restriction can potentially be lifted in the future.
-
- if (!HasNoSignedWrap(AR))
- return false;
-
- if (const SCEVConstant *StepExpr =
- dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
- StepCI = StepExpr->getValue();
- assert(!StepCI->isZero() && "Zero step?");
- IsIncreasing = !StepCI->isNegative();
- return true;
- }
-
- return false;
- };
-
// `ICI` is interpreted as taking the backedge if the *next* value of the
// induction variable satisfies some constraint.
const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
- bool IsIncreasing = false;
- bool IsSignedPredicate = true;
- ConstantInt *StepCI;
- if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) {
+ if (!IndVarBase->isAffine()) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ const SCEV *StepRec = IndVarBase->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(StepRec)) {
FailureReason = "LHS in icmp not induction variable";
return None;
}
+ ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
+ if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
+ FailureReason = "LHS in icmp needs nsw for equality predicates";
+ return None;
+ }
+
+ assert(!StepCI->isZero() && "Zero step?");
+ bool IsIncreasing = !StepCI->isNegative();
+ bool IsSignedPredicate = ICmpInst::isSigned(Pred);
const SCEV *StartNext = IndVarBase->getStart();
const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
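
Recall that LeftSCEV describes the IV value tested by the latch, i.e. the value after the increment, so the true starting value is recovered as StartNext - Step. For example, with i starting at 0 and ++i tested in the latch, IndVarBase is {1,+,1} and IndVarStart = 1 - 1 = 0.
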
@@ -870,22 +965,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// If both parts are known non-negative, it is profitable to use
// unsigned comparison in increasing loop. This allows us to make the
// comparison check against "RightSCEV + 1" more optimistic.
- if (SE.isKnownNonNegative(IndVarStart) &&
- SE.isKnownNonNegative(RightSCEV))
+ if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+ isKnownNonNegativeInLoop(RightSCEV, &L, SE))
Pred = ICmpInst::ICMP_ULT;
else
Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (++i == len) ---> if (++i > len - 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_UGT if both are non-negative?
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
+ Pred = ICmpInst::ICMP_UGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ } else if (CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
}
}
@@ -899,36 +1001,18 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- IsSignedPredicate =
- Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
+ IsSignedPredicate = ICmpInst::isSigned(Pred);
if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
FailureReason = "unsigned latch conditions are explicitly prohibited";
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
+ if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe loop bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
- SE.getOne(Step->getType()));
- if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an slt and not an sle.
- FailureReason = "limit may overflow when coercing le to lt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getAddExpr(RightSCEV, Step))) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
-
// We need to increase the right value unless we have already decreased
// it virtually when we replaced EQ with SGT.
if (!DecreasedRightValueByOne) {
@@ -936,10 +1020,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateAdd(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
assert(!DecreasedRightValueByOne &&
"Right value can be decreased only for LatchBrExitIdx == 0!");
}
@@ -955,17 +1035,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// that both operands are non-negative, because it will only pessimize
// our check against "RightSCEV - 1".
Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (--i == len) ---> if (--i < len + 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_ULT if both are non-negative?
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_ULT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ } else if (CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
}
}
@@ -988,27 +1073,13 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+ if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
- if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an sgt and not an sge.
- FailureReason = "limit may overflow when coercing ge to gt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
-
// We need to decrease the right value unless we have already increased
// it virtually when we replaced EQ with SLT.
if (!IncreasedRightValueByOne) {
@@ -1016,10 +1087,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateSub(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
assert(!IncreasedRightValueByOne &&
"Right value can be increased only for LatchBrExitIdx == 0!");
}
@@ -1174,13 +1241,9 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
if (OriginalLoop.contains(SBB))
continue; // not an exit block
- for (Instruction &I : *SBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
- PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ for (PHINode &PN : SBB->phis()) {
+ Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
+ PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
}
}
}
@@ -1327,16 +1390,12 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
// We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
// each of the PHI nodes in the loop header. This feeds into the initial
// value of the same PHI nodes if/when we continue execution.
- for (Instruction &I : *LS.Header) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+ for (PHINode &PN : LS.Header->phis()) {
+ PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
BranchToContinuation);
- NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
- NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
RRI.ExitSelector);
RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
}
@@ -1348,12 +1407,8 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
// The latch exit now has a branch from `RRI.ExitSelector' instead of
// `LS.Latch'. The PHI nodes need to be updated to reflect that.
- for (Instruction &I : *LS.LatchExit) {
- if (PHINode *PN = dyn_cast<PHINode>(&I))
- replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
- else
- break;
- }
+ for (PHINode &PN : LS.LatchExit->phis())
+ replacePHIBlock(&PN, LS.Latch, RRI.ExitSelector);
return RRI;
}
@@ -1362,15 +1417,10 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs(
LoopStructure &LS, BasicBlock *ContinuationBlock,
const LoopConstrainer::RewrittenRangeInfo &RRI) const {
unsigned PHIIndex = 0;
- for (Instruction &I : *LS.Header) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
- if (PN->getIncomingBlock(i) == ContinuationBlock)
- PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
- }
+ for (PHINode &PN : LS.Header->phis())
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i)
+ if (PN.getIncomingBlock(i) == ContinuationBlock)
+ PN.setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
LS.IndVarStart = RRI.IndVarEnd;
}
@@ -1381,14 +1431,9 @@ BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
BranchInst::Create(LS.Header, Preheader);
- for (Instruction &I : *LS.Header) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
- replacePHIBlock(PN, OldPreheader, Preheader);
- }
+ for (PHINode &PN : LS.Header->phis())
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i)
+ replacePHIBlock(&PN, OldPreheader, Preheader);
return Preheader;
}
@@ -1403,13 +1448,14 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
}
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM) {
+ ValueToValueMapTy &VM,
+ bool IsSubloop) {
Loop &New = *LI.AllocateLoop();
if (Parent)
Parent->addChildLoop(&New);
else
LI.addTopLevelLoop(&New);
- LPM.addLoop(New);
+ LPMAddNewLoop(&New, IsSubloop);
// Add all of the blocks in Original to the new loop.
for (auto *BB : Original->blocks())
@@ -1418,7 +1464,7 @@ Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
// Add all of the subloops to the new loop.
for (Loop *SubLoop : *Original)
- createClonedLoopStructure(SubLoop, &New, VM);
+ createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
return &New;
}
@@ -1436,7 +1482,7 @@ bool LoopConstrainer::run() {
bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
if (!MaybeSR.hasValue()) {
- DEBUG(dbgs() << "irce: could not compute subranges\n");
+ LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
return false;
}
@@ -1468,19 +1514,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitPreLoopAtSCEV = *SR.LowLimit;
else {
- if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = "
+ << *(*SR.HighLimit) << "\n");
return false;
}
- ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " preloop exit limit " << *ExitPreLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1494,19 +1543,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitMainLoopAtSCEV = *SR.HighLimit;
else {
- if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = "
+ << *(*SR.LowLimit) << "\n");
return false;
}
- ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " main loop exit limit " << *ExitMainLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1568,13 +1620,15 @@ bool LoopConstrainer::run() {
// LI when LoopSimplifyForm is generated.
Loop *PreL = nullptr, *PostL = nullptr;
if (!PreLoop.Blocks.empty()) {
- PreL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ PreL = createClonedLoopStructure(&OriginalLoop,
+ OriginalLoop.getParentLoop(), PreLoop.Map,
+ /* IsSubLoop */ false);
}
if (!PostLoop.Blocks.empty()) {
- PostL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ PostL =
+ createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+ PostLoop.Map, /* IsSubLoop */ false);
}
// This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
@@ -1640,32 +1694,34 @@ InductiveRangeCheck::computeSafeIterationSpace(
unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
- // Substract Y from X so that it does not go through border of the IV
+ // Subtract Y from X so that it does not go through the border of the IV
// iteration space. Mathematically, it is equivalent to:
//
- // ClampedSubstract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+ // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
//
- // In [1], 'X - Y' is a mathematical substraction (result is not bounded to
+ // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
// any width of bit grid). But after we take min/max, the result is
// guaranteed to be within [INT_MIN, INT_MAX].
//
// In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
// values, depending on the type of latch condition that defines the IV
// iteration space.
- auto ClampedSubstract = [&](const SCEV *X, const SCEV *Y) {
- assert(SE.isKnownNonNegative(X) &&
- "We can only substract from values in [0; SINT_MAX]!");
+ auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+ // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+ // This is required to ensure that SINT_MAX - X does not overflow in the
+ // signed domain and that X - Y does not overflow in the unsigned domain
+ // when Y is negative. Can we lift this restriction and make it work for
+ // negative X as well?
if (IsLatchSigned) {
// X is a number from signed range, Y is interpreted as signed.
// Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
// thing we should care about is that we didn't cross SINT_MAX.
- // So, if Y is positive, we substract Y safely.
+ // So, if Y is positive, we subtract Y safely.
// Rule 1: Y > 0 ---> Y.
- // If 0 <= -Y <= (SINT_MAX - X), we substract Y safely.
+ // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
// Rule 2: Y >=s (X - SINT_MAX) ---> Y.
- // If 0 <= (SINT_MAX - X) < -Y, we can only substract (X - SINT_MAX).
+ // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
// Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
- // It gives us smax(Y, X - SINT_MAX) to substract in all cases.
+ // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
SCEV::FlagNSW);
@@ -1673,29 +1729,45 @@ InductiveRangeCheck::computeSafeIterationSpace(
// X is a number from unsigned range, Y is interpreted as signed.
// Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
// thing we should care about is that we didn't cross zero.
- // So, if Y is negative, we substract Y safely.
+ // So, if Y is negative, we subtract Y safely.
// Rule 1: Y <s 0 ---> Y.
- // If 0 <= Y <= X, we substract Y safely.
+ // If 0 <= Y <= X, we subtract Y safely.
// Rule 2: Y <=s X ---> Y.
- // If 0 <= X < Y, we should stop at 0 and can only substract X.
+ // If 0 <= X < Y, we should stop at 0 and can only subtract X.
// Rule 3: Y >s X ---> X.
- // It gives us smin(X, Y) to substract in all cases.
+ // It gives us smin(X, Y) to subtract in all cases.
return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
};
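
As a sanity check on the signed branch, here is a minimal plain-integer model of the clamp (hypothetical helper name; a wide int stands in for the unbounded mathematical subtraction):

    #include <algorithm>
    #include <cassert>

    // Model of ClampedSubtract's signed case for 8-bit values:
    // X - smax(Y, X - SINT_MAX) clamps the mathematical X - Y at SINT_MAX,
    // given the precondition 0 <= X <= SINT_MAX.
    static int clampedSubtractSigned(int X, int Y) {
      const int SIntMax = 127;
      return X - std::max(Y, X - SIntMax);
    }

    int main() {
      assert(clampedSubtractSigned(100, -50) == 127); // true difference 150, clamped
      assert(clampedSubtractSigned(100, -20) == 120); // fits, exact difference
      assert(clampedSubtractSigned(100, 30) == 70);   // positive Y is always safe
    }
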
const SCEV *M = SE.getMinusSCEV(C, A);
const SCEV *Zero = SE.getZero(M->getType());
- const SCEV *Begin = ClampedSubstract(Zero, M);
- const SCEV *L = nullptr;
- // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
- // We can potentially do much better here.
- if (const SCEV *EndLimit = getEnd())
- L = EndLimit;
- else {
- assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
- L = SIntMax;
- }
- const SCEV *End = ClampedSubstract(L, M);
+ // This lambda returns a SCEV equal to 1 if X is non-negative and 0
+ // otherwise.
+ auto SCEVCheckNonNegative = [&](const SCEV *X) {
+ const Loop *L = IndVar->getLoop();
+ const SCEV *One = SE.getOne(X->getType());
+ // Can we trivially prove that X is a non-negative or negative value?
+ if (isKnownNonNegativeInLoop(X, L, SE))
+ return One;
+ else if (isKnownNegativeInLoop(X, L, SE))
+ return Zero;
+ // If not, we will have to figure it out during the execution.
+ // The expression smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
+ const SCEV *NegOne = SE.getNegativeSCEV(One);
+ return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+ };
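
A minimal model of that select-free expression (hypothetical name), handy for convincing oneself of the two-valued behavior:

    #include <algorithm>
    #include <cassert>

    // smax(smin(X, 0), -1) + 1 evaluates to 1 for X >= 0 and to 0 for X < 0.
    static int isNonNegative(int X) {
      return std::max(std::min(X, 0), -1) + 1;
    }

    int main() {
      assert(isNonNegative(5) == 1);
      assert(isNonNegative(0) == 1);
      assert(isNonNegative(-3) == 0);
    }
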
+ // FIXME: The current implementation of ClampedSubtract implicitly assumes
+ // that X is non-negative (in the sense of a signed value). We need to
+ // re-implement this function so that it correctly handles negative X as
+ // well.
+ // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+ // end up with a negative X and produce wrong results. So currently we ensure
+ // that if getEnd() is negative then both ends of the safe range are zero.
+ // Note that this may pessimize elimination of unsigned range checks against
+ // negative values.
+ const SCEV *REnd = getEnd();
+ const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+ const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+ const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
return InductiveRangeCheck::Range(Begin, End);
}
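
Net effect of the multiplication: when getEnd() is provably negative the factor is 0 and the safe range degenerates to [0, 0), claiming no iteration safe; when it is provably non-negative the factor is 1 and the clamped bounds pass through unchanged; otherwise the factor is computed at run time by the expression modeled above.
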
@@ -1757,26 +1829,56 @@ IntersectUnsignedRange(ScalarEvolution &SE,
return Ret;
}
-bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+PreservedAnalyses IRCEPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function *F = L.getHeader()->getParent();
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ InductiveRangeCheckElimination IRCE(AR.SE, BPI, AR.DT, AR.LI);
+ auto LPMAddNewLoop = [&U](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ U.addSiblingLoops(NL);
+ };
+ bool Changed = IRCE.run(&L, LPMAddNewLoop);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
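
With this in place the pass can be exercised through the new pass manager, e.g. opt -passes='loop(irce)' -S in.ll (assuming the usual registration under the name irce in PassRegistry.def). Note that BranchProbabilityInfo is picked up only if some earlier pass left it cached on the function; when it is absent, the BPI-based profitability filters are simply skipped.
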
+
+bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+ auto LPMAddNewLoop = [&LPM](Loop *NL, bool /* IsSubLoop */) {
+ LPM.addLoop(*NL);
+ };
+ return IRCE.run(L, LPMAddNewLoop);
+}
+
+bool InductiveRangeCheckElimination::run(
+ Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
if (L->getBlocks().size() >= LoopSizeCutoff) {
- DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
return false;
}
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
- DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
return false;
}
LLVMContext &Context = Preheader->getContext();
SmallVector<InductiveRangeCheck, 16> RangeChecks;
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
for (auto BBI : L->getBlocks())
if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
@@ -1794,7 +1896,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
IRC.print(OS);
};
- DEBUG(PrintRecognizedRangeChecks(dbgs()));
+ LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
if (PrintRangeChecks)
PrintRecognizedRangeChecks(errs());
@@ -1803,8 +1905,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
Optional<LoopStructure> MaybeLoopStructure =
LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
if (!MaybeLoopStructure.hasValue()) {
- DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
- << "\n";);
+ LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+ << FailureReason << "\n";);
return false;
}
LoopStructure LS = MaybeLoopStructure.getValue();
@@ -1842,9 +1944,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SafeIterRange.hasValue())
return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
- LS, SE, DT, SafeIterRange.getValue());
+ LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+ SafeIterRange.getValue());
bool Changed = LC.run();
if (Changed) {
@@ -1855,7 +1956,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
L->print(dbgs());
};
- DEBUG(PrintConstrainedLoopInfo());
+ LLVM_DEBUG(PrintConstrainedLoopInfo());
if (PrintChangedLoops)
PrintConstrainedLoopInfo();
@@ -1874,5 +1975,5 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
}
Pass *llvm::createInductiveRangeCheckEliminationPass() {
- return new InductiveRangeCheckElimination;
+ return new IRCELegacyPass();
}
diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 7d66c0f73821..fbbc09eb487f 100644
--- a/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -97,6 +97,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -121,7 +122,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <iterator>
@@ -140,7 +140,7 @@ namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-/// \brief InferAddressSpaces
+/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
  /// Target-specific address space whose uses should be replaced if
  /// possible.
@@ -260,7 +260,10 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:{
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile || !IsVolatile->isZero())
return false;
@@ -289,6 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
case Intrinsic::objectsize:
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
@@ -647,13 +653,13 @@ void InferAddressSpaces::inferAddressSpaces(
// Tries to update the address space of the stack top according to the
// address spaces of its operands.
- DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
if (!NewAS.hasValue())
continue;
    // If any updates are made, push its users onto the worklist because
    // their address spaces may also need to be updated.
- DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
(*InferredAddrSpace)[V] = NewAS.getValue();
for (Value *User : V->users()) {
@@ -779,7 +785,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
B.CreateMemSet(NewV, MSI->getValue(),
- MSI->getLength(), MSI->getAlignment(),
+ MSI->getLength(), MSI->getDestAlignment(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
} else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
@@ -795,14 +801,16 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (isa<MemCpyInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
- B.CreateMemCpy(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemCpy(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, TBAAStruct, ScopeMD, NoAliasMD);
} else {
assert(isa<MemMoveInst>(MTI));
- B.CreateMemMove(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemMove(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
}
@@ -893,15 +901,15 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
if (NewV == nullptr)
continue;
- DEBUG(dbgs() << "Replacing the uses of " << *V
- << "\n with\n " << *NewV << '\n');
+ LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
+ << *NewV << '\n');
if (Constant *C = dyn_cast<Constant>(V)) {
Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
C->getType());
if (C != Replace) {
- DEBUG(dbgs() << "Inserting replacement const cast: "
- << Replace << ": " << *Replace << '\n');
+ LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+ << ": " << *Replace << '\n');
C->replaceAllUsesWith(Replace);
V = Replace;
}
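
The memcpy/memmove rewrites above track an IRBuilder API change: the single shared alignment argument was split into separate destination and source alignments. A sketch of the call-site migration, using only the accessors shown in the hunks; rewriteMemCpyCallSite is a hypothetical helper, not from the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static void rewriteMemCpyCallSite(IRBuilder<> &B, MemTransferInst *MTI,
                                  Value *Dest, Value *Src) {
  // Old overload: one alignment applied to both operands.
  //   B.CreateMemCpy(Dest, Src, MTI->getLength(), MTI->getAlignment(),
  //                  /*isVolatile=*/false);
  // New overload: each operand carries its own alignment.
  B.CreateMemCpy(Dest, MTI->getDestAlignment(), Src,
                 MTI->getSourceAlignment(), MTI->getLength(),
                 /*isVolatile=*/false);
}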
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Scalar/InstSimplifyPass.cpp
index f3d4f2ef38d7..05cd48d83267 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -1,4 +1,4 @@
-//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===//
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,15 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This is a utility pass used for testing the InstructionSimplify analysis.
-// The analysis is applied to every instruction, and if it simplifies then the
-// instruction is replaced by the simplification. If you are looking for a pass
-// that performs serious instruction folding, use the instcombine pass instead.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/SimplifyInstructions.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -27,7 +20,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -84,58 +77,57 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ,
}
namespace {
- struct InstSimplifier : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- InstSimplifier() : FunctionPass(ID) {
- initializeInstSimplifierPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-
- /// runOnFunction - Remove instructions that simplify.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree *DT =
- &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, TLI, DT, AC);
- return runImpl(F, SQ, ORE);
- }
- };
-}
-
-char InstSimplifier::ID = 0;
-INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
+struct InstSimplifyLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifyLegacyPass() : FunctionPass(ID) {
+ initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// runOnFunction - Remove instructions that simplify.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
"Remove redundant instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
"Remove redundant instructions", false, false)
-char &llvm::InstructionSimplifierID = InstSimplifier::ID;
// Public interface to the simplify instructions pass.
-FunctionPass *llvm::createInstructionSimplifierPass() {
- return new InstSimplifier();
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+ return new InstSimplifyLegacyPass();
}
-PreservedAnalyses InstSimplifierPass::run(Function &F,
- FunctionAnalysisManager &AM) {
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
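
Both entry points funnel into the same runImpl(F, SQ, ORE). The tail of the new-pass-manager run() falls outside this excerpt; the following is only a hedged sketch of how it plausibly completes, mirroring the analyses the legacy wrapper collects:

PreservedAnalyses InstSimplifyPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  const DataLayout &DL = F.getParent()->getDataLayout();
  const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
  if (!runImpl(F, SQ, &ORE))
    return PreservedAnalyses::all();
  // The legacy wrapper declares setPreservesCFG(), so preserving the CFG
  // analyses here keeps the two paths consistent.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}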
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 1476f7850cf0..1d66472f93c8 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -131,10 +131,11 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (PrintLVIAfterJumpThreading)
- AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -148,6 +149,7 @@ char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
@@ -164,7 +166,7 @@ JumpThreadingPass::JumpThreadingPass(int T) {
}
// Update branch probability information according to conditional
-// branch probablity. This is usually made possible for cloned branches
+// branch probability. This is usually made possible for cloned branches
// in inline instances by the context specific profile in the caller.
// For instance,
//
@@ -278,8 +280,12 @@ bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DeferredDominance DDT(*DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.hasProfileData();
@@ -289,12 +295,11 @@ bool JumpThreading::runOnFunction(Function &F) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- dbgs());
+ LVI->printLVI(F, *DT, dbgs());
}
return Changed;
}
@@ -302,8 +307,12 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
+ DeferredDominance DDT(DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
@@ -313,25 +322,28 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
return PA;
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
LazyValueInfo *LVI_, AliasAnalysis *AA_,
- bool HasProfileData_,
+ DeferredDominance *DDT_, bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
- DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
AA = AA_;
+ DDT = DDT_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
@@ -345,69 +357,66 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BFI = std::move(BFI_);
}
- // Remove unreachable blocks from function as they may result in infinite
- // loop. We do threading if we found something profitable. Jump threading a
- // branch can create other opportunities. If these opportunities form a cycle
- // i.e. if any jump threading is undoing previous threading in the path, then
- // we will loop forever. We take care of this issue by not jump threading for
- // back edges. This works for normal cases but not for unreachable blocks as
- // they may have cycle with no back edge.
- bool EverChanged = false;
- EverChanged |= removeUnreachableBlocks(F, LVI);
+ // JumpThreading must not process blocks unreachable from entry. It's a
+ // waste of compute time and can potentially lead to hangs.
+ SmallPtrSet<BasicBlock *, 16> Unreachable;
+ DominatorTree &DT = DDT->flush();
+ for (auto &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ Unreachable.insert(&BB);
FindLoopHeaders(F);
+ bool EverChanged = false;
bool Changed;
do {
Changed = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
- BasicBlock *BB = &*I;
- // Thread all of the branches we can over this block.
- while (ProcessBlock(BB))
+ for (auto &BB : F) {
+ if (Unreachable.count(&BB))
+ continue;
+ while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
+ // Stop processing BB if it's the entry or is now deleted. The following
+ // routines attempt to eliminate BB, and locating a suitable replacement
+ // for the entry is non-trivial.
+ if (&BB == &F.getEntryBlock() || DDT->pendingDeletedBB(&BB))
+ continue;
- ++I;
-
- // If the block is trivially dead, zap it. This eliminates the successor
- // edges which simplifies the CFG.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock()) {
- DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
- << "' with terminator: " << *BB->getTerminator() << '\n');
- LoopHeaders.erase(BB);
- LVI->eraseBlock(BB);
- DeleteDeadBlock(BB);
+ if (pred_empty(&BB)) {
+ // When ProcessBlock makes BB unreachable it doesn't bother to fix up
+ // the instructions in it. We must remove BB to prevent invalid IR.
+ LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
+ << "' with terminator: " << *BB.getTerminator()
+ << '\n');
+ LoopHeaders.erase(&BB);
+ LVI->eraseBlock(&BB);
+ DeleteDeadBlock(&BB, DDT);
Changed = true;
continue;
}
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-
- // Can't thread an unconditional jump, but if the block is "almost
- // empty", we can replace uses of it with uses of the successor and make
- // this dead.
- // We should not eliminate the loop header or latch either, because
- // eliminating a loop header or latch might later prevent LoopSimplify
- // from transforming nested loops into simplified form. We will rely on
- // later passes in backend to clean up empty blocks.
+ // ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
+ // is "almost empty", we attempt to merge BB with its sole successor.
+ auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional() &&
- BB != &BB->getParent()->getEntryBlock() &&
- // If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
- !LoopHeaders.count(BI->getSuccessor(0))) {
- // FIXME: It is always conservatively correct to drop the info
- // for a block even if it doesn't get erased. This isn't totally
- // awesome, but it allows us to use AssertingVH to prevent nasty
- // dangling pointer issues within LazyValueInfo.
- LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
- Changed = true;
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(BI->getSuccessor(0)) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DDT)) {
+ // BB is valid for cleanup here because we passed in DDT. F remains
+ // BB's parent until a DDT->flush() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
}
}
EverChanged |= Changed;
} while (Changed);
LoopHeaders.clear();
+ DDT->flush();
+ LVI->enableDT();
return EverChanged;
}
@@ -600,6 +609,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -613,6 +626,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
@@ -630,11 +647,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// Handle Cast instructions. Only see through Cast when the source operand is
- // PHI or Cmp and the source type is i1 to save the compilation time.
+ // PHI or Cmp, to save compilation time.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
- if (!Source->getType()->isIntegerTy(1))
- return false;
if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
return false;
ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
@@ -738,20 +753,36 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
CmpInst::Predicate Pred = Cmp->getPredicate();
PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (!PN)
+ PN = dyn_cast<PHINode>(CmpRHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
- Value *LHS = PN->getIncomingValue(i);
- Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
-
+ Value *LHS, *RHS;
+ if (PN == CmpLHS) {
+ LHS = PN->getIncomingValue(i);
+ RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+ } else {
+ LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+ RHS = PN->getIncomingValue(i);
+ }
Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
+ // A getPredicateOnEdge call makes no sense if LHS is defined in BB.
+ auto LHSInst = dyn_cast<Instruction>(LHS);
+ if (LHSInst && LHSInst->getParent() == BB)
+ continue;
+
LazyValueInfo::Tristate
ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
@@ -775,6 +806,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -803,6 +838,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
@@ -884,6 +923,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// If all else fails, see if LVI can figure out a constant value for us.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
@@ -903,10 +946,10 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
unsigned MinSucc = 0;
BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
// Compute the successor with the minimum number of predecessors.
- unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned MinNumPreds = pred_size(TestBB);
for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
TestBB = BBTerm->getSuccessor(i);
- unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned NumPreds = pred_size(TestBB);
if (NumPreds < MinNumPreds) {
MinSucc = i;
MinNumPreds = NumPreds;
@@ -931,8 +974,8 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock())
+ if (DDT->pendingDeletedBB(BB) ||
+ (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
return false;
// If this block has a single predecessor, and if that pred has a single
@@ -948,7 +991,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
LoopHeaders.insert(BB);
LVI->eraseBlock(SinglePred);
- MergeBasicBlockIntoOnlyPred(BB);
+ MergeBasicBlockIntoOnlyPred(BB, nullptr, DDT);
// Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
// BB code within one basic block `BB`), we need to invalidate the LVI
@@ -977,9 +1020,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// Invalidate LVI information for BB if the LVI is not provably true for
// all of BB.
- if (any_of(*BB, [](Instruction &I) {
- return !isGuaranteedToTransferExecutionToSuccessor(&I);
- }))
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
LVI->eraseBlock(BB);
return true;
}
@@ -1031,18 +1072,23 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// successors to branch to. Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) {
unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+ std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
TerminatorInst *BBTerm = BB->getTerminator();
+ Updates.reserve(BBTerm->getNumSuccessors());
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
- BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+ BasicBlock *Succ = BBTerm->getSuccessor(i);
+ Succ->removePredecessor(BB, true);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
}
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding undef terminator: " << *BBTerm << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
BBTerm->eraseFromParent();
+ DDT->applyUpdates(Updates);
return true;
}
@@ -1050,10 +1096,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// terminator to an unconditional branch. This can occur due to threading in
// other blocks.
if (getKnownConstant(Condition, Preference)) {
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding terminator: " << *BB->getTerminator() << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator()
+ << '\n');
++NumFolds;
- ConstantFoldTerminator(BB, true);
+ ConstantFoldTerminator(BB, true, nullptr, DDT);
return true;
}
@@ -1080,13 +1127,18 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
- CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
+ ToRemoveSucc->removePredecessor(BB, true);
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
CondBr->eraseFromParent();
if (CondCmp->use_empty())
@@ -1104,6 +1156,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
ConstantInt::getFalse(CondCmp->getType());
ReplaceFoldableUses(CondCmp, CI);
}
+ DDT->deleteEdge(BB, ToRemoveSucc);
return true;
}
@@ -1125,8 +1178,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
- if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
- if (SimplifyPartiallyRedundantLoad(LI))
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
@@ -1182,9 +1235,12 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
Optional<bool> Implication =
isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
if (Implication) {
- BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
- BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
+ BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
+ BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
+ RemoveSucc->removePredecessor(BB);
+ BranchInst::Create(KeepSucc, BI);
BI->eraseFromParent();
+ DDT->deleteEdge(BB, RemoveSucc);
return true;
}
CurrentBB = CurrentPred;
@@ -1202,17 +1258,17 @@ static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
return false;
}
-/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
-/// load instruction, eliminate it by replacing it with a PHI node. This is an
-/// important optimization that encourages jump threading, and needs to be run
-/// interlaced with other jump threading tasks.
-bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
+/// redundant load instruction, eliminate it by replacing it with a PHI node.
+/// This is an important optimization that encourages jump threading, and needs
+/// to be run interlaced with other jump threading tasks.
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
- if (!LI->isUnordered()) return false;
+ if (!LoadI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
- BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *LoadBB = LoadI->getParent();
if (LoadBB->getSinglePredecessor())
return false;
@@ -1222,7 +1278,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (LoadBB->isEHPad())
return false;
- Value *LoadedPtr = LI->getOperand(0);
+ Value *LoadedPtr = LoadI->getOperand(0);
  // If the loaded operand is defined in the LoadBB and it's not a phi,
// it can't be available in predecessors.
@@ -1231,26 +1287,27 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
- BasicBlock::iterator BBIt(LI);
+ BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
if (Value *AvailableVal = FindAvailableLoadedValue(
- LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
if (IsLoadCSE) {
- LoadInst *NLI = cast<LoadInst>(AvailableVal);
- combineMetadataForCSE(NLI, LI);
+ LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLoadI, LoadI);
};
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
- if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
- if (AvailableVal->getType() != LI->getType())
- AvailableVal =
- CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
- LI->replaceAllUsesWith(AvailableVal);
- LI->eraseFromParent();
+ if (AvailableVal == LoadI)
+ AvailableVal = UndefValue::get(LoadI->getType());
+ if (AvailableVal->getType() != LoadI->getType())
+ AvailableVal = CastInst::CreateBitOrPointerCast(
+ AvailableVal, LoadI->getType(), "", LoadI);
+ LoadI->replaceAllUsesWith(AvailableVal);
+ LoadI->eraseFromParent();
return true;
}
@@ -1263,7 +1320,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
AAMDNodes AATags;
- LI->getAAMetadata(AATags);
+ LoadI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
@@ -1285,16 +1342,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *PredAvailable = nullptr;
    // NOTE: We don't CSE loads that are volatile or anything stronger than
    // unordered; that should have been checked when we entered the function.
- assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ assert(LoadI->isUnordered() &&
+ "Attempting to CSE volatile or atomic loads");
// If this is a load on a phi pointer, phi-translate it and search
// for available load/store to the pointer in predecessors.
Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
- AA, &IsLoadCSE, &NumScanedInst);
+ Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
+ DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
- // single precessor.
+ // single predecessor.
BasicBlock *SinglePredBB = PredBB;
while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
NumScanedInst < DefMaxInstsToScan) {
@@ -1302,7 +1360,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (SinglePredBB) {
BBIt = SinglePredBB->end();
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
(DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
&NumScanedInst);
}
@@ -1334,15 +1392,15 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If the value is unavailable in one of predecessors, we will end up
// inserting a new instruction into them. It is only valid if all the
- // instructions before LI are guaranteed to pass execution to its successor,
- // or if LI is safe to speculate.
+ // instructions before LoadI are guaranteed to pass execution to its
+ // successor, or if LoadI is safe to speculate.
  // TODO: If this logic becomes more complex and we perform PRE insertion
  // farther than to a predecessor, we need to reuse the code from GVN's PRE.
  // It requires dominator tree analysis, so for this simple case it is
  // overkill.
if (PredsScanned.size() != AvailablePreds.size() &&
- !isSafeToSpeculativelyExecute(LI))
- for (auto I = LoadBB->begin(); &*I != LI; ++I)
+ !isSafeToSpeculativelyExecute(LoadI))
+ for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
return false;
@@ -1381,11 +1439,12 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(
- LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
- LI->getSyncScopeID(), UnavailablePred->getTerminator());
- NewVal->setDebugLoc(LI->getDebugLoc());
+ LoadInst *NewVal =
+ new LoadInst(LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LoadI->getName() + ".pr", false, LoadI->getAlignment(),
+ LoadI->getOrdering(), LoadI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LoadI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1398,10 +1457,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
- PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
&LoadBB->front());
- PN->takeName(LI);
- PN->setDebugLoc(LI->getDebugLoc());
+ PN->takeName(LoadI);
+ PN->setDebugLoc(LoadI->getDebugLoc());
// Insert new entries into the PHI for each predecessor. A single block may
// have multiple entries here.
@@ -1419,19 +1478,19 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
- if (PredV->getType() != LI->getType())
- PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ if (PredV->getType() != LoadI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
P->getTerminator());
PN->addIncoming(PredV, I->first);
}
- for (LoadInst *PredLI : CSELoads) {
- combineMetadataForCSE(PredLI, LI);
+ for (LoadInst *PredLoadI : CSELoads) {
+ combineMetadataForCSE(PredLoadI, LoadI);
}
- LI->replaceAllUsesWith(PN);
- LI->eraseFromParent();
+ LoadI->replaceAllUsesWith(PN);
+ LoadI->eraseFromParent();
return true;
}
@@ -1454,6 +1513,9 @@ FindMostPopularDest(BasicBlock *BB,
if (PredToDest.second)
DestPopularity[PredToDest.second]++;
+ if (DestPopularity.empty())
+ return nullptr;
+
// Find the most popular dest.
DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
BasicBlock *MostPopularDest = DPI->first;
@@ -1513,12 +1575,12 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
- DEBUG(dbgs() << "IN BB: " << *BB;
- for (const auto &PredValue : PredValues) {
- dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
- << *PredValue.first
- << " for pred '" << PredValue.second->getName() << "'.\n";
- });
+ LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName()
+ << "': FOUND condition = " << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
// Decide what we want to thread through. Convert our list of known values to
// a list of known destinations for each pred. This also discards duplicate
@@ -1588,20 +1650,24 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// not thread. By doing so, we do not need to duplicate the current block and
  // also miss potential opportunities in case we don't/can't duplicate.
if (OnlyDest && OnlyDest != MultipleDestSentinel) {
- if (PredWithKnownDest ==
- (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ if (PredWithKnownDest == (size_t)pred_size(BB)) {
bool SeenFirstBranchToOnlyDest = false;
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
for (BasicBlock *SuccBB : successors(BB)) {
- if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
- else
+ } else {
SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ Updates.push_back({DominatorTree::Delete, BB, SuccBB});
+ }
}
// Finally update the terminator.
TerminatorInst *Term = BB->getTerminator();
BranchInst::Create(OnlyDest, Term);
Term->eraseFromParent();
+ DDT->applyUpdates(Updates);
// If the condition is now dead due to the removal of the old terminator,
// erase it.
@@ -1629,8 +1695,20 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// threadable destination (the common case) we can avoid this.
BasicBlock *MostPopularDest = OnlyDest;
- if (MostPopularDest == MultipleDestSentinel)
+ if (MostPopularDest == MultipleDestSentinel) {
+ // Remove any loop headers from the Dest list; ThreadEdge conservatively
+ // won't process them, but we might have other destinations that are
+ // eligible and that we still want to process.
+ erase_if(PredToDestList,
+ [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
+ return LoopHeaders.count(PredToDest.second) != 0;
+ });
+
+ if (PredToDestList.empty())
+ return false;
+
MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+ }
// Now that we know what the most popular destination is, factor all
// predecessors that will jump to it into a single predecessor.
@@ -1800,11 +1878,10 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
BasicBlock *OldPred,
BasicBlock *NewPred,
DenseMap<Instruction*, Value*> &ValueMap) {
- for (BasicBlock::iterator PNI = PHIBB->begin();
- PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) {
+ for (PHINode &PN : PHIBB->phis()) {
// Ok, we have a PHI node. Figure out what the incoming value was for the
// DestBlock.
- Value *IV = PN->getIncomingValueForBlock(OldPred);
+ Value *IV = PN.getIncomingValueForBlock(OldPred);
// Remap the value if necessary.
if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
@@ -1813,7 +1890,7 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
IV = I->second;
}
- PN->addIncoming(IV, NewPred);
+ PN.addIncoming(IV, NewPred);
}
}
@@ -1825,15 +1902,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
- DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
return false;
}
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- DEBUG({
+ LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
@@ -1847,8 +1924,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << JumpThreadCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
}
@@ -1857,17 +1934,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
- DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
- << SuccBB->getName() << "' with cost: " << JumpThreadCost
- << ", across block:\n "
- << *BB << "\n");
-
+ LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
+ << "' to '" << SuccBB->getName()
+ << "' with cost: " << JumpThreadCost
+ << ", across block:\n " << *BB << "\n");
+
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LVI->threadEdge(PredBB, BB, SuccBB);
// We are going to have to map operands from the original BB block to the new
@@ -1917,15 +1998,32 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// PHI nodes for NewBB now.
AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+ // Update the terminator of PredBB to jump to NewBB instead of BB. This
+ // eliminates predecessors from BB, which requires us to simplify any PHI
+ // nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // Enqueue required DT updates.
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Delete, PredBB, BB}});
+
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI derived value. This can require arbitrary
// PHI insertion, of which we are prepared to do, clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use*, 16> UsesToRename;
+
for (Instruction &I : *BB) {
- // Scan all uses of this instruction to see if it is used outside of its
- // block, and if so, record them in UsesToRename.
+ // Scan all uses of this instruction to see if they are no longer
+ // dominated by the previous def and, if so, record them in UsesToRename.
+ // Also, skip phi operands from PredBB - we'll remove them anyway.
for (Use &U : I.uses()) {
Instruction *User = cast<Instruction>(U.getUser());
if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
@@ -1940,8 +2038,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// If there are no uses outside the block, we're done with this instruction.
if (UsesToRename.empty())
continue;
-
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -1952,19 +2049,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
- // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
- // NewBB instead of BB. This eliminates predecessors from BB, which requires
- // us to simplify any PHI nodes in BB.
- TerminatorInst *PredTerm = PredBB->getTerminator();
- for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
- if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB, true);
- PredTerm->setSuccessor(i, NewBB);
- }
-
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
@@ -1984,20 +2071,42 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
+ SmallVector<BasicBlock *, 2> NewBBs;
+
// Collect the frequencies of all predecessors of BB, which will be used to
- // update the edge weight on BB->SuccBB.
- BlockFrequency PredBBFreq(0);
+ // update the edge weight of the result of splitting predecessors.
+ DenseMap<BasicBlock *, BlockFrequency> FreqMap;
if (HasProfileData)
for (auto Pred : Preds)
- PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+ FreqMap.insert(std::make_pair(
+ Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
+
+ // In the case when BB is a LandingPad block we create 2 new predecessors
+ // instead of just one.
+ if (BB->isLandingPad()) {
+ std::string NewName = std::string(Suffix) + ".split-lp";
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
+ } else {
+ NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
+ }
- BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * Preds.size()) + NewBBs.size());
+ for (auto NewBB : NewBBs) {
+ BlockFrequency NewBBFreq(0);
+ Updates.push_back({DominatorTree::Insert, NewBB, BB});
+ for (auto Pred : predecessors(NewBB)) {
+ Updates.push_back({DominatorTree::Delete, Pred, BB});
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ if (HasProfileData) // Update frequencies between Pred -> NewBB.
+ NewBBFreq += FreqMap.lookup(Pred);
+ }
+ if (HasProfileData) // Apply the summed frequency to NewBB.
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
- // Set the block frequency of the newly created PredBB, which is the sum of
- // frequencies of Preds.
- if (HasProfileData)
- BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
- return PredBB;
+ DDT->applyUpdates(Updates);
+ return NewBBs[0];
}
bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
@@ -2126,42 +2235,49 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// cause us to transform this into an irreducible loop, don't do this.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
- DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
- << "' into predecessor block '" << PredBBs[0]->getName()
- << "' - it might create an irreducible loop!\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
return false;
}
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
- << "' - Cost is too high: " << DuplicationCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
return false;
}
// And finally, do it! Start by factoring the predecessors if needed.
+ std::vector<DominatorTree::UpdateType> Updates;
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
// Okay, we decided to do this! Clone all the instructions in BB onto the end
// of PredBB.
- DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
- << PredBB->getName() << "' to eliminate branch on phi. Cost: "
- << DuplicationCost << " block is:" << *BB << "\n");
+ LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
+ << "' into end of '" << PredBB->getName()
+ << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- PredBB = SplitEdge(PredBB, BB);
+ BasicBlock *OldPredBB = PredBB;
+ PredBB = SplitEdge(OldPredBB, BB);
+ Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
+ Updates.push_back({DominatorTree::Insert, PredBB, BB});
+ Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
@@ -2203,6 +2319,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ // Queue DT insertions for any successor blocks appearing among New's operands.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
}
}
@@ -2238,7 +2358,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
if (UsesToRename.empty())
continue;
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -2249,7 +2369,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
@@ -2258,6 +2378,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
+ DDT->applyUpdates(Updates);
++NumDupes;
return true;
@@ -2300,6 +2421,10 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold, those
// cases will be threaded in any case.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
@@ -2330,6 +2455,8 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// The select is now dead.
SI->eraseFromParent();
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, BB},
+ {DominatorTree::Insert, Pred, NewBB}});
// Update any other PHI nodes in BB.
for (BasicBlock::iterator BI = BB->begin();
PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
@@ -2395,7 +2522,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
break;
}
} else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
- // Look for a Select in BB that uses PN as condtion.
+ // Look for a Select in BB that uses PN as condition.
if (isUnfoldCandidate(SelectI, U.get())) {
SI = SelectI;
break;
@@ -2408,11 +2535,25 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// Expand the select.
TerminatorInst *Term =
SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ BasicBlock *SplitBB = SI->getParent();
+ BasicBlock *NewBB = Term->getParent();
PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
NewPN->addIncoming(SI->getFalseValue(), BB);
SI->replaceAllUsesWith(NewPN);
SI->eraseFromParent();
+ // NewBB and SplitBB are newly created blocks which require insertion.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
+ Updates.push_back({DominatorTree::Insert, BB, SplitBB});
+ Updates.push_back({DominatorTree::Insert, BB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
+ // BB's successors were moved to SplitBB; update DDT accordingly.
+ for (auto *Succ : successors(SplitBB)) {
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
+ }
+ DDT->applyUpdates(Updates);
return true;
}
return false;
@@ -2499,8 +2640,8 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
if (!TrueDestIsSafe && !FalseDestIsSafe)
return false;
- BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
- BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
@@ -2509,18 +2650,29 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
return false;
// Duplicate all instructions before the guard and the guard itself to the
// branch where implication is not proved.
- GuardedBlock = DuplicateInstructionsInSplitBetween(
- BB, GuardedBlock, AfterGuard, GuardedMapping);
+ BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredGuardedBlock, AfterGuard, GuardedMapping);
assert(GuardedBlock && "Could not create the guarded block?");
// Duplicate all instructions before the guard in the unguarded branch.
// Since we have successfully duplicated the guarded block and this block
// has fewer instructions, we expect it to succeed.
- UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
- Guard, UnguardedMapping);
+ BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredUnguardedBlock, Guard, UnguardedMapping);
assert(UnguardedBlock && "Could not create the unguarded block?");
- DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
- << GuardedBlock->getName() << "\n");
-
+ LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+ // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between
+ // PredBB and BB. We need to perform two inserts and one delete for each of
+ // the above calls to update Dominators.
+ DDT->applyUpdates(
+ {// Guarded block split.
+ {DominatorTree::Delete, PredGuardedBlock, BB},
+ {DominatorTree::Insert, PredGuardedBlock, GuardedBlock},
+ {DominatorTree::Insert, GuardedBlock, BB},
+ // Unguarded block split.
+ {DominatorTree::Delete, PredUnguardedBlock, BB},
+ {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock},
+ {DominatorTree::Insert, UnguardedBlock, BB}});
// Some instructions before the guard may still have uses. For them, we need
// to create Phi nodes merging their copies in both guarded and unguarded
// branches. Those instructions that have no uses can be just removed.
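
Throughout the JumpThreading changes above, CFG edits are decoupled from dominator tree maintenance via DeferredDominance: updates are queued with applyUpdates()/deleteEdge() and materialized later by flush(), and LazyValueInfo is kept off the stale tree while updates are pending. A small sketch of that protocol, using only calls that appear in the patch; removeEdgeDeferred is a hypothetical helper:

#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

static void removeEdgeDeferred(DeferredDominance &DDT, LazyValueInfo &LVI,
                               BasicBlock *From, BasicBlock *To) {
  // Edit the CFG immediately, but only queue the dominator tree update.
  To->removePredecessor(From, /*DontDeleteUselessPHIs=*/true);
  DDT.deleteEdge(From, To);

  // While updates are pending, the DominatorTree is stale and LVI must not
  // consult it; once flushed, DT-backed queries are safe again.
  if (DDT.pending())
    LVI.disableDT();
  else
    LVI.enableDT();
}

// A single DDT.flush() at the end of the pass (as in runImpl above) applies
// every queued insert/delete to the real DominatorTree in one batch.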
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 4ea935793b80..ff66632f0391 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -97,7 +97,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE);
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+ const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE, bool FreeInLoop);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
@@ -170,7 +170,8 @@ struct LegacyLICMPass : public LoopPass {
/// loop preheaders be inserted into the CFG...
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (EnableMSSALoopDependency)
AU.addRequired<MemorySSAWrapperPass>();
@@ -220,7 +221,10 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
+
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+
return PA;
}
@@ -392,7 +396,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// If the instruction is dead, we would try to sink it because it isn't
// used in the loop, instead, just delete it.
if (isInstructionTriviallyDead(&I, TLI)) {
- DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageDebugInfo(I);
++II;
CurAST->deleteValue(&I);
I.eraseFromParent();
@@ -445,101 +450,78 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
BasicBlock *BB = DTN->getBlock();
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (!inSubLoop(BB, CurLoop, LI))
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to
- // just fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
- CurAST->copyValue(&I, C);
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI)) {
- CurAST->deleteValue(&I);
- I.eraseFromParent();
- }
- Changed = true;
- continue;
- }
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
- // Attempt to remove floating point division out of the loop by
- // converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv &&
- CurLoop->isLoopInvariant(I.getOperand(1)) &&
- I.hasAllowReciprocal()) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- ReciprocalDivisor->insertBefore(&I);
-
- auto Product =
- BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
+ // Keep track of whether the prefix of instructions visited so far is such
+ // that the next instruction visited is guaranteed to execute if the loop
+ // is entered.
+ bool IsMustExecute = CurLoop->getHeader() == BB;
+
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
+ << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
I.eraseFromParent();
-
- hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
- Changed = true;
- continue;
}
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
+ // if it is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ (IsMustExecute ||
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))) {
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ continue;
+ }
- // Try hoisting the instruction out to the preheader. We can only do
- // this if all of the operands of the instruction are loop invariant and
- // if it is safe to hoist the instruction.
- //
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
}
+
+ if (IsMustExecute)
+ IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I);
+ }
}
return Changed;
}
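The hunk above also moves the FDiv-to-reciprocal rewrite after the generic hoisting attempt. In source terms, the rewrite is the following transformation; it is only legal under reciprocal fast-math (hasAllowReciprocal), since the product can differ from true division in the last bits. A hedged sketch with an illustrative function name:

    #include <cstdio>

    void scaleByReciprocal(float *A, int N, float D) {
      float Recip = 1.0f / D; // one hoisted division of the invariant divisor
      for (int i = 0; i < N; ++i)
        A[i] = A[i] * Recip;  // was: A[i] / D, a division per iteration
    }

    int main() {
      float A[4] = {2, 4, 6, 8};
      scaleByReciprocal(A, 4, 2.0f);
      printf("%g %g %g %g\n", A[0], A[1], A[2], A[3]); // 1 2 3 4
    }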
-/// Computes loop safety information, checks loop body & header
-/// for the possibility of may throw exception.
-///
-void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
- assert(CurLoop != nullptr && "CurLoop cant be null");
- BasicBlock *Header = CurLoop->getHeader();
- // Setting default safety values.
- SafetyInfo->MayThrow = false;
- SafetyInfo->HeaderMayThrow = false;
- // Iterate over header and compute safety info.
- for (BasicBlock::iterator I = Header->begin(), E = Header->end();
- (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
- SafetyInfo->HeaderMayThrow |=
- !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
- // Iterate over loop instructions and compute safety info.
- // Skip header as it has been computed and stored in HeaderMayThrow.
- // The first block in loopinfo.Blocks is guaranteed to be the header.
- assert(Header == *CurLoop->getBlocks().begin() &&
- "First block must be header");
- for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
- BBE = CurLoop->block_end();
- (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
- (I != E) && !SafetyInfo->MayThrow; ++I)
- SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- // Compute funclet colors if we might sink/hoist in a function with a funclet
- // personality routine.
- Function *Fn = CurLoop->getHeader()->getParent();
- if (Fn->hasPersonalityFn())
- if (Constant *PersonalityFn = Fn->getPersonalityFn())
- if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
- SafetyInfo->BlockColors = colorEHFunclets(*Fn);
-}
-
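The deleted computeLoopSafetyInfo precomputed whether the header or body may throw; the new hoistRegion loop instead tracks IsMustExecute on the fly, clearing it at the first instruction that may not transfer execution to its successor. A standalone sketch of that bookkeeping, with an illustrative Instr type standing in for llvm::Instruction:

    #include <cstdio>
    #include <vector>

    // Illustrative stand-in: MayNotTransfer models e.g. a call that may
    // throw or not return.
    struct Instr {
      bool MayNotTransfer;
    };

    // An instruction is guaranteed to execute on loop entry iff every
    // earlier instruction in the header always transfers execution to its
    // successor.
    int mustExecutePrefix(const std::vector<Instr> &Header) {
      int N = 0;
      bool IsMustExecute = true;
      for (const Instr &I : Header) {
        if (!IsMustExecute)
          break;
        ++N; // I is reached whenever the loop is entered
        if (I.MayNotTransfer)
          IsMustExecute = false; // everything after I may never run
      }
      return N;
    }

    int main() {
      std::vector<Instr> Header = {{false}, {false}, {true}, {false}};
      printf("%d\n", mustExecutePrefix(Header)); // prints 3
    }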
// Return true if LI is invariant within scope of the loop. LI is invariant if
// CurLoop is dominated by an invariant.start representing the same memory
// location and size as the memory location LI loads from, and also the
@@ -708,7 +690,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
/// This is true when all incoming values are that instruction.
/// This pattern occurs most often with LCSSA PHI nodes.
///
-static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
for (const Value *IncValue : PN.incoming_values())
if (IncValue != &I)
return false;
@@ -838,12 +820,12 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
return New;
}
-static Instruction *sinkThroughTriviallyReplacablePHI(
+static Instruction *sinkThroughTriviallyReplaceablePHI(
PHINode *TPN, Instruction *I, LoopInfo *LI,
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
- assert(isTriviallyReplacablePHI(*TPN, *I) &&
- "Expect only trivially replacalbe PHI");
+ assert(isTriviallyReplaceablePHI(*TPN, *I) &&
+ "Expect only trivially replaceable PHI");
BasicBlock *ExitBlock = TPN->getParent();
Instruction *New;
auto It = SunkCopies.find(ExitBlock);
@@ -855,10 +837,16 @@ static Instruction *sinkThroughTriviallyReplacablePHI(
return New;
}
-static bool canSplitPredecessors(PHINode *PN) {
+static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
BasicBlock *BB = PN->getParent();
if (!BB->canSplitPredecessors())
return false;
+ // It's not impossible to split EHPad blocks, but if BlockColors already
+ // exists, it requires updating BlockColors for all offspring blocks
+ // accordingly. By skipping this corner case, we keep the BlockColors update
+ // after splitting a predecessor fairly simple.
+ if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad())
+ return false;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *BBPred = *PI;
if (isa<IndirectBrInst>(BBPred->getTerminator()))
@@ -868,7 +856,8 @@ static bool canSplitPredecessors(PHINode *PN) {
}
static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
- LoopInfo *LI, const Loop *CurLoop) {
+ LoopInfo *LI, const Loop *CurLoop,
+ LoopSafetyInfo *SafetyInfo) {
#ifndef NDEBUG
SmallVector<BasicBlock *, 32> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
@@ -879,7 +868,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
// Split predecessors of the loop exit so that instructions in the loop are
- // exposed to exit blocks through trivially replacable PHIs while keeping the
+ // exposed to exit blocks through trivially replaceable PHIs while keeping the
// loop in the canonical form where each predecessor of each exit block should
// be contained within the loop. For example, this will convert the loop below
// from
@@ -891,7 +880,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE, %LB1
// LE:
- // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+ // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
//
// to
//
@@ -902,21 +891,35 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE.split2, %LB1
// LE.split:
- // %p1 = phi [%v1, %LB1] <-- trivially replacable
+ // %p1 = phi [%v1, %LB1] <-- trivially replaceable
// br %LE
// LE.split2:
- // %p2 = phi [%v2, %LB2] <-- trivially replacable
+ // %p2 = phi [%v2, %LB2] <-- trivially replaceable
// br %LE
// LE:
// %p = phi [%p1, %LE.split], [%p2, %LE.split2]
//
+ auto &BlockColors = SafetyInfo->BlockColors;
SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
while (!PredBBs.empty()) {
BasicBlock *PredBB = *PredBBs.begin();
assert(CurLoop->contains(PredBB) &&
"Expect all predecessors are in the loop");
- if (PN->getBasicBlockIndex(PredBB) >= 0)
- SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+ if (PN->getBasicBlockIndex(PredBB) >= 0) {
+ BasicBlock *NewPred = SplitBlockPredecessors(
+ ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+ // Since we do not allow splitting EH-block with BlockColors in
+ // canSplitPredecessors(), we can simply assign predecessor's color to
+ // the new block.
+ if (!BlockColors.empty()) {
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ ColorVector &ColorsForNewBlock = BlockColors[NewPred];
+ ColorVector &ColorsForOldBlock = BlockColors[PredBB];
+ ColorsForNewBlock = ColorsForOldBlock;
+ }
+ }
PredBBs.remove(PredBB);
}
}
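The comment above about grabbing ColorsForNewBlock first deserves emphasis: DenseMap insertion can grow the table and move elements, so a reference obtained before an insertion may dangle afterwards. The same hazard is easy to reproduce with std::vector:

    #include <vector>

    int main() {
      std::vector<int> V = {1, 2, 3};

      // BAD: take a reference, then insert. push_back may reallocate the
      // storage, leaving Old dangling; reading it would be undefined.
      //   int &Old = V[0];
      //   V.push_back(4);
      //   int Copy = Old;

      // GOOD (the order used above): insert first, then take references
      // into the settled container.
      V.push_back(4);
      int &Old = V[0];
      (void)Old;
    }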
@@ -927,9 +930,9 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
/// position, and may either delete it or move it to outside of the loop.
///
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+ const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
- DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
<< "sinking " << ore::NV("Inst", &I);
@@ -972,15 +975,15 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
}
VisitedUsers.insert(PN);
- if (isTriviallyReplacablePHI(*PN, I))
+ if (isTriviallyReplaceablePHI(*PN, I))
continue;
- if (!canSplitPredecessors(PN))
+ if (!canSplitPredecessors(PN, SafetyInfo))
return Changed;
// Split predecessors of the PHI so that we can make users trivially
- // replacable.
- splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+ // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo);
// Should rebuild the iterators, as they may be invalidated by
// splitPredecessorsOfLoopExit().
@@ -1014,9 +1017,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
- // The PHI must be trivially replacable.
- Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
- SafetyInfo, CurLoop);
+ // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
Changed = true;
@@ -1031,8 +1034,8 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
auto *Preheader = CurLoop->getLoopPreheader();
- DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
<< ore::NV("Inst", &I);
@@ -1221,7 +1224,7 @@ bool llvm::promoteLoopAccessesToScalars(
Value *SomePtr = *PointerMustAliases.begin();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- // It isn't safe to promote a load/store from the loop if the load/store is
+ // It is not safe to promote a load/store from the loop if the load/store is
// conditional. For example, turning:
//
// for () { if (c) *P += 1; }
@@ -1350,7 +1353,7 @@ bool llvm::promoteLoopAccessesToScalars(
// If a store dominates all exit blocks, it is safe to sink.
// As explained above, if an exit block was executed, a dominating
- // store must have been been executed at least once, so we are not
+ // store must have been executed at least once, so we are not
// introducing stores on paths that did not have them.
// Note that this only looks at explicit exit blocks. If we ever
// start sinking stores into unwind edges (see above), this will break.
@@ -1412,8 +1415,8 @@ bool llvm::promoteLoopAccessesToScalars(
return false;
// Otherwise, this is safe to promote, lets do it!
- DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
+ LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
LoopUses[0])
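The conditional-store caveat quoted in this hunk ("for () { if (c) *P += 1; }") is worth spelling out. A standalone sketch of why naive promotion is unsound; original and badPromotion are illustrative names:

    #include <cstdio>

    int G = 7;

    // Faithful to the source: *P is touched only when c is true.
    void original(int *P, bool c, int n) {
      for (int i = 0; i < n; ++i)
        if (c)
          *P += 1;
    }

    // Unsound promotion: the load and final store are unconditional, so
    // this version writes *P even on the c == false path. The values happen
    // to match here, but the introduced store is wrong for read-only memory
    // or in the presence of other threads.
    void badPromotion(int *P, bool c, int n) {
      int t = *P;
      for (int i = 0; i < n; ++i)
        if (c)
          t += 1;
      *P = t;
    }

    int main() {
      original(&G, false, 3);
      printf("%d\n", G); // 7
    }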
diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index 8a99df86b84a..ffe35f041b35 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
name = Scalar
parent = Transforms
library_name = ScalarOpts
-required_libraries = Analysis Core InstCombine Support TransformUtils
+required_libraries = AggressiveInstCombine Analysis Core InstCombine Support TransformUtils
diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 7f7c6de76450..3b41b5d96c86 100644
--- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -71,7 +71,7 @@ public:
private:
bool runOnLoop(Loop *L);
- /// \brief Check if the the stride of the accesses is large enough to
+ /// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
@@ -244,9 +244,9 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
for (const auto BB : L->blocks()) {
@@ -275,7 +275,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (!LSCEVAddRec)
continue;
- // Check if the the stride of the accesses is large enough to warrant a
+ // Check if the stride of the accesses is large enough to warrant a
// prefetch.
if (!isStrideLargeEnough(LSCEVAddRec))
continue;
@@ -320,8 +320,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
++NumPrefetches;
- DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
<< "prefetched memory access";
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 82604a8842bf..d412025d7e94 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -49,11 +49,10 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
// must pass through a PHI in the exit block, meaning that this check is
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
- BasicBlock::iterator BI = ExitBlock->begin();
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+ for (PHINode &P : ExitBlock->phis()) {
+ Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
@@ -61,7 +60,7 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
// be used.
AllOutgoingValuesSame =
all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
- return incoming == P->getIncomingValueForBlock(BB);
+ return incoming == P.getIncomingValueForBlock(BB);
});
if (!AllOutgoingValuesSame)
@@ -72,8 +71,6 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
AllEntriesInvariant = false;
break;
}
-
- ++BI;
}
if (Changed)
@@ -145,14 +142,15 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// of trouble.
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader || !L->hasDedicatedExits()) {
- DEBUG(dbgs()
- << "Deletion requires Loop with preheader and dedicated exits.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
return LoopDeletionResult::Unmodified;
}
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end()) {
- DEBUG(dbgs() << "Loop contains subloops.\n");
+ LLVM_DEBUG(dbgs() << "Loop contains subloops.\n");
return LoopDeletionResult::Unmodified;
}
@@ -160,13 +158,11 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
BasicBlock *ExitBlock = L->getUniqueExitBlock();
if (ExitBlock && isLoopNeverExecuted(L)) {
- DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
// Set incoming value to undef for phi nodes in the exit block.
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- for (unsigned i = 0; i < P->getNumIncomingValues(); i++)
- P->setIncomingValue(i, UndefValue::get(P->getType()));
- BI++;
+ for (PHINode &P : ExitBlock->phis()) {
+ std::fill(P.incoming_values().begin(), P.incoming_values().end(),
+ UndefValue::get(P.getType()));
}
deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
@@ -183,13 +179,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
if (!ExitBlock) {
- DEBUG(dbgs() << "Deletion requires single exit block\n");
+ LLVM_DEBUG(dbgs() << "Deletion requires single exit block\n");
return LoopDeletionResult::Unmodified;
}
// Finally, we have to check that the loop really is dead.
bool Changed = false;
if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
- DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
@@ -198,12 +194,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// They could be infinite, in which case we'd be changing program behavior.
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S)) {
- DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
- DEBUG(dbgs() << "Loop is invariant, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
@@ -214,8 +210,8 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &Updater) {
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L.dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L.dump());
std::string LoopName = L.getName();
auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
if (Result == LoopDeletionResult::Unmodified)
@@ -260,8 +256,8 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L->dump());
LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
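As a quick illustration of what deleteLoopIfDead accepts: the loop must have a preheader and dedicated exits, contain no subloops, compute nothing used afterwards, and have a computable maximum backedge-taken count. In source terms (a sketch; f is an illustrative name):

    #include <cstdio>

    int f(int n) {
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += i; // s is never read after the loop: the whole loop is dead
      (void)s;
      return 42;
    }

    int main() { printf("%d\n", f(10)); } // 42, with or without the loop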
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 0d7e3db901cb..06083a4f5086 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -111,7 +111,7 @@ STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
-/// \brief Maintains the set of instructions of the loop for a partition before
+/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
using InstructionSet = SmallPtrSet<Instruction *, 8>;
@@ -122,20 +122,20 @@ public:
Set.insert(I);
}
- /// \brief Returns whether this partition contains a dependence cycle.
+ /// Returns whether this partition contains a dependence cycle.
bool hasDepCycle() const { return DepCycle; }
- /// \brief Adds an instruction to this partition.
+ /// Adds an instruction to this partition.
void add(Instruction *I) { Set.insert(I); }
- /// \brief Collection accessors.
+ /// Collection accessors.
InstructionSet::iterator begin() { return Set.begin(); }
InstructionSet::iterator end() { return Set.end(); }
InstructionSet::const_iterator begin() const { return Set.begin(); }
InstructionSet::const_iterator end() const { return Set.end(); }
bool empty() const { return Set.empty(); }
- /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// Moves this partition into \p Other. This partition becomes empty
/// after this.
void moveTo(InstPartition &Other) {
Other.Set.insert(Set.begin(), Set.end());
@@ -143,7 +143,7 @@ public:
Other.DepCycle |= DepCycle;
}
- /// \brief Populates the partition with a transitive closure of all the
+ /// Populates the partition with a transitive closure of all the
/// instructions that the seeded instructions depend on.
void populateUsedSet() {
// FIXME: We currently don't use control-dependence but simply include all
@@ -166,7 +166,7 @@ public:
}
}
- /// \brief Clones the original loop.
+ /// Clones the original loop.
///
/// Updates LoopInfo and DominatorTree using the information that block \p
/// LoopDomBB dominates the loop.
@@ -179,27 +179,27 @@ public:
return ClonedLoop;
}
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
const Loop *getClonedLoop() const { return ClonedLoop; }
- /// \brief Returns the loop where this partition ends up after distribution.
+ /// Returns the loop where this partition ends up after distribution.
/// If this partition is mapped to the original loop then use the block from
/// the loop.
const Loop *getDistributedLoop() const {
return ClonedLoop ? ClonedLoop : OrigLoop;
}
- /// \brief The VMap that is populated by cloning and then used in
+ /// The VMap that is populated by cloning and then used in
/// remapInstructions to remap the cloned instructions.
ValueToValueMapTy &getVMap() { return VMap; }
- /// \brief Remaps the cloned instructions using VMap.
+ /// Remaps the cloned instructions using VMap.
void remapInstructions() {
remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
}
- /// \brief Based on the set of instructions selected for this partition,
+ /// Based on the set of instructions selected for this partition,
/// removes the unnecessary ones.
void removeUnusedInsts() {
SmallVector<Instruction *, 8> Unused;
@@ -239,30 +239,30 @@ public:
}
private:
- /// \brief Instructions from OrigLoop selected for this partition.
+ /// Instructions from OrigLoop selected for this partition.
InstructionSet Set;
- /// \brief Whether this partition contains a dependence cycle.
+ /// Whether this partition contains a dependence cycle.
bool DepCycle;
- /// \brief The original loop.
+ /// The original loop.
Loop *OrigLoop;
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
Loop *ClonedLoop = nullptr;
- /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// The blocks of ClonedLoop including the preheader. If this
/// partition is mapped to the original loop, this is empty.
SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
- /// \brief These gets populated once the set of instructions have been
+ /// These get populated once the set of instructions has been
/// finalized. If this partition is mapped to the original loop, these are not
/// set.
ValueToValueMapTy VMap;
};
-/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
using InstToPartitionIdT = DenseMap<Instruction *, int>;
@@ -271,10 +271,10 @@ public:
InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
: L(L), LI(LI), DT(DT) {}
- /// \brief Returns the number of partitions.
+ /// Returns the number of partitions.
unsigned getSize() const { return PartitionContainer.size(); }
- /// \brief Adds \p Inst into the current partition if that is marked to
+ /// Adds \p Inst into the current partition if that is marked to
/// contain cycles. Otherwise start a new partition for it.
void addToCyclicPartition(Instruction *Inst) {
// If the current partition is non-cyclic. Start a new one.
@@ -284,7 +284,7 @@ public:
PartitionContainer.back().add(Inst);
}
- /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// Adds \p Inst into a partition that is not marked to contain
/// dependence cycles.
///
// Initially we isolate memory instructions into as many partitions as
@@ -293,7 +293,7 @@ public:
PartitionContainer.emplace_back(Inst, L);
}
- /// \brief Merges adjacent non-cyclic partitions.
+ /// Merges adjacent non-cyclic partitions.
///
/// The idea is that we currently only want to isolate the non-vectorizable
/// partition. We could later allow more distribution among these partitions
@@ -303,7 +303,7 @@ public:
[](const InstPartition *P) { return !P->hasDepCycle(); });
}
- /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// If a partition contains only conditional stores, we won't vectorize
/// it. Try to merge it with a previous cyclic partition.
void mergeNonIfConvertible() {
mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
@@ -323,14 +323,14 @@ public:
});
}
- /// \brief Merges the partitions according to various heuristics.
+ /// Merges the partitions according to various heuristics.
void mergeBeforePopulating() {
mergeAdjacentNonCyclic();
if (!DistributeNonIfConvertible)
mergeNonIfConvertible();
}
- /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ /// Merges partitions in order to ensure that no loads are duplicated.
///
/// We can't duplicate loads because that could potentially reorder them.
/// LoopAccessAnalysis provides dependency information with the context that
@@ -362,9 +362,11 @@ public:
std::tie(LoadToPart, NewElt) =
LoadToPartition.insert(std::make_pair(Inst, PartI));
if (!NewElt) {
- DEBUG(dbgs() << "Merging partitions due to this load in multiple "
- << "partitions: " << PartI << ", "
- << LoadToPart->second << "\n" << *Inst << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", " << LoadToPart->second
+ << "\n"
+ << *Inst << "\n");
auto PartJ = I;
do {
@@ -398,7 +400,7 @@ public:
return true;
}
- /// \brief Sets up the mapping between instructions to partitions. If the
+ /// Sets up the mapping between instructions to partitions. If the
/// instruction is duplicated across multiple partitions, set the entry to -1.
void setupPartitionIdOnInstructions() {
int PartitionID = 0;
@@ -416,14 +418,14 @@ public:
}
}
- /// \brief Populates the partition with everything that the seeding
+ /// Populates the partition with everything that the seeding
/// instructions require.
void populateUsedSet() {
for (auto &P : PartitionContainer)
P.populateUsedSet();
}
- /// \brief This performs the main chunk of the work of cloning the loops for
+ /// This performs the main chunk of the work of cloning the loops for
/// the partitions.
void cloneLoops() {
BasicBlock *OrigPH = L->getLoopPreheader();
@@ -470,13 +472,13 @@ public:
Curr->getDistributedLoop()->getExitingBlock());
}
- /// \brief Removes the dead instructions from the cloned loops.
+ /// Removes the dead instructions from the cloned loops.
void removeUnusedInsts() {
for (auto &Partition : PartitionContainer)
Partition.removeUnusedInsts();
}
- /// \brief For each memory pointer, it computes the partitionId the pointer is
+ /// For each memory pointer, it computes the partitionId the pointer is
/// used in.
///
/// This returns an array of int where the I-th entry corresponds to I-th
@@ -543,10 +545,10 @@ public:
private:
using PartitionContainerT = std::list<InstPartition>;
- /// \brief List of partitions.
+ /// List of partitions.
PartitionContainerT PartitionContainer;
- /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// Mapping from Instruction to partition Id. If the instruction
/// belongs to multiple partitions the entry contains -1.
InstToPartitionIdT InstToPartitionId;
@@ -554,7 +556,7 @@ private:
LoopInfo *LI;
DominatorTree *DT;
- /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// The control structure to merge adjacent partitions if both satisfy
/// the \p Predicate.
template <class UnaryPredicate>
void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
@@ -575,7 +577,7 @@ private:
}
};
-/// \brief For each memory instruction, this class maintains difference of the
+/// For each memory instruction, this class maintains the difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
@@ -602,7 +604,7 @@ public:
const SmallVectorImpl<Dependence> &Dependences) {
Accesses.append(Instructions.begin(), Instructions.end());
- DEBUG(dbgs() << "Backward dependences:\n");
+ LLVM_DEBUG(dbgs() << "Backward dependences:\n");
for (auto &Dep : Dependences)
if (Dep.isPossiblyBackward()) {
// Note that the designations source and destination follow the program
@@ -611,7 +613,7 @@ public:
++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
--Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
- DEBUG(Dep.print(dbgs(), 2, Instructions));
+ LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
}
}
@@ -619,7 +621,7 @@ private:
AccessesType Accesses;
};
-/// \brief The actual class performing the per-loop work.
+/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
@@ -628,12 +630,13 @@ public:
setForced();
}
- /// \brief Try to distribute an inner-most loop.
+ /// Try to distribute an inner-most loop.
bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->empty() && "Only process inner loops.");
- DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nLDist: In \""
+ << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
if (!L->getExitBlock())
return fail("MultipleExitBlocks", "multiple exit blocks");
@@ -705,7 +708,7 @@ public:
for (auto *Inst : DefsUsedOutside)
Partitions.addToNewNonCyclicPartition(Inst);
- DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -713,20 +716,20 @@ public:
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
- DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
- DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
// In order to preserve original lexical order for loads, keep them in the
// partition that we set up in the MemoryInstructionDependences loop.
if (Partitions.mergeToAvoidDuplicatedLoads()) {
- DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
- << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -740,7 +743,7 @@ public:
return fail("TooManySCEVRuntimeChecks",
"too many SCEV run-time checks needed.\n");
- DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions; set up the reverse mapping from
// instructions to partitions.
Partitions.setupPartitionIdOnInstructions();
@@ -759,8 +762,8 @@ public:
RtPtrChecking);
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
- DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointers:\n");
+ LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
LVer.setAliasChecks(std::move(Checks));
LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
@@ -775,12 +778,12 @@ public:
// Now, we remove from each loop the instructions that don't belong to that
// partition.
Partitions.removeUnusedInsts();
- DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- DEBUG(Partitions.printBlocks());
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
if (LDistVerify) {
LI->verify(*DT);
- DT->verifyDomTree();
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
++NumLoopsDistributed;
@@ -793,12 +796,12 @@ public:
return true;
}
- /// \brief Provide diagnostics then \return with false.
+ /// Provide diagnostics then \return with false.
bool fail(StringRef RemarkName, StringRef Message) {
LLVMContext &Ctx = F->getContext();
bool Forced = isForced().getValueOr(false);
- DEBUG(dbgs() << "Skipping; " << Message << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
// With Rpass-missed report that distribution failed.
ORE->emit([&]() {
@@ -826,7 +829,7 @@ public:
return false;
}
- /// \brief Return if distribution forced to be enabled/disabled for the loop.
+ /// Return if distribution forced to be enabled/disabled for the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
/// to be enabled (true) or disabled (false). If the optional has no value
@@ -834,7 +837,7 @@ public:
const Optional<bool> &isForced() const { return IsForced; }
private:
- /// \brief Filter out checks between pointers from the same partition.
+ /// Filter out checks between pointers from the same partition.
///
/// \p PtrToPartition contains the partition number for pointers. Partition
/// number -1 means that the pointer is used in multiple partitions. In this
@@ -873,7 +876,7 @@ private:
return Checks;
}
- /// \brief Check whether the loop metadata is forcing distribution to be
+ /// Check whether the loop metadata is forcing distribution to be
/// enabled/disabled.
void setForced() {
Optional<const MDOperand *> Value =
@@ -896,7 +899,7 @@ private:
ScalarEvolution *SE;
OptimizationRemarkEmitter *ORE;
- /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// Indicates whether distribution is forced to be enabled/disabled for
/// the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
@@ -939,7 +942,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
namespace {
-/// \brief The pass class.
+/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
static char ID;
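Before the LoopIdiomRecognize hunks below, a source-level picture of what LoopDistribute does: split one loop so the partition carrying a dependence cycle is isolated and the independent partition can be vectorized. A standalone sketch (fused and distributed are illustrative names):

    #include <cstdio>

    void fused(int *A, int *C, const int *D, int N) {
      for (int i = 0; i < N - 1; ++i) {
        A[i + 1] = A[i] + 1; // dependence cycle: not vectorizable
        C[i] = D[i] * 2;     // independent: vectorizable on its own
      }
    }

    void distributed(int *A, int *C, const int *D, int N) {
      for (int i = 0; i < N - 1; ++i) // cyclic partition
        A[i + 1] = A[i] + 1;
      for (int i = 0; i < N - 1; ++i) // vectorizable partition
        C[i] = D[i] * 2;
    }

    int main() {
      int A1[4] = {1, 0, 0, 0}, A2[4] = {1, 0, 0, 0};
      int C1[4] = {0}, C2[4] = {0};
      const int D[4] = {1, 2, 3, 4};
      fused(A1, C1, D, 4);
      distributed(A2, C2, D, 4);
      printf("%d %d\n", A1[3] == A2[3], C1[2] == C2[2]); // 1 1
    }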
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 21551f0a0825..d8692198f7a3 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -37,7 +37,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,6 +56,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -87,8 +87,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -188,8 +188,9 @@ private:
PHINode *CntPhi, Value *Var);
bool recognizeAndInsertCTLZ();
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
- PHINode *CntPhi, Value *Var, const DebugLoc DL,
- bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
+ PHINode *CntPhi, Value *Var, Instruction *DefX,
+ const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -310,9 +311,9 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
SmallVector<BasicBlock *, 8> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- DEBUG(dbgs() << "loop-idiom Scanning: F["
- << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
- << CurLoop->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
bool MadeChange = false;
@@ -756,8 +757,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
MSIs.insert(MSI);
bool NegStride = SizeInBytes == -Stride;
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
- MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
- BECount, NegStride, /*IsLoopMemset=*/true);
+ MSI->getDestAlignment(), SplatValue, MSI, MSIs,
+ Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -936,8 +937,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}
- DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
- << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore
+ << "\n");
NewCall->setDebugLoc(TheStore->getDebugLoc());
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1037,16 +1039,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
- unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
- // If the load or store are atomic, then they must neccessarily be unordered
+ // If the load or store are atomic, then they must necessarily be unordered
// by previous checks.
if (!SI->isAtomic() && !LI->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(), NumBytes);
else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
if (Align < StoreSize)
return false;
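The CreateMemCpy change above reflects an IRBuilder API update in this release: the copy now carries separate destination and source alignments instead of a single shared value. A small standalone illustration of the information the old min() convention discarded:

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned DstAlign = 16, SrcAlign = 4;
      // Old convention: one alignment for the whole copy, so callers passed
      // the minimum and the 16-byte destination fact was lost to the backend.
      unsigned Shared = std::min(DstAlign, SrcAlign); // 4
      printf("shared=%u separate=%u/%u\n", Shared, DstAlign, SrcAlign);
    }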
@@ -1066,9 +1069,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
}
NewCall->setDebugLoc(SI->getDebugLoc());
- DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
@@ -1084,9 +1088,9 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
- DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
- << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
- << " avoided: multi-block top-level loop\n");
+ LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
return true;
}
}
@@ -1195,14 +1199,13 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
VarX1 = DefX2->getOperand(0);
SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
}
- if (!SubOneOp)
+ if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
return false;
- Instruction *SubInst = cast<Instruction>(SubOneOp);
- ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
if (!Dec ||
- !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
- (SubInst->getOpcode() == Instruction::Add &&
+ !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubOneOp->getOpcode() == Instruction::Add &&
Dec->isMinusOne()))) {
return false;
}
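The idiom this function detects is the classic Kernighan population count: x &= x - 1 clears the lowest set bit, so the trip count equals the number of set bits and the loop can be replaced by a ctpop intrinsic. A standalone demonstration, using the GCC/Clang __builtin_popcount for comparison:

    #include <cstdio>

    unsigned popcountLoop(unsigned x) {
      unsigned cnt = 0;
      while (x) {
        x &= x - 1; // clears the lowest set bit
        ++cnt;
      }
      return cnt;
    }

    int main() {
      for (unsigned x : {0u, 1u, 6u, 0xF0u, 0xFFFFFFFFu})
        printf("%u -> %u (builtin %d)\n", x, popcountLoop(x),
               __builtin_popcount(x));
    }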
@@ -1314,7 +1317,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
return false;
// step 2: detect instructions corresponding to "x.next = x >> 1"
- if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
+ DefX->getOpcode() != Instruction::LShr))
return false;
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
if (!Shft || !Shft->isOne())
@@ -1372,13 +1376,13 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntPhiUsedOutsideLoop = true;
break;
}
bool IsCntInstUsedOutsideLoop = false;
for (User *U : CntInst->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntInstUsedOutsideLoop = true;
break;
}
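The dyn_cast-to-cast change above encodes an invariant: every user of a PHI is an Instruction, so a null-tolerant cast would only hide bugs (and contains(nullptr) would silently misclassify the use). A standalone analogue of the two casting styles using dynamic_cast; the type names are illustrative:

    #include <cassert>

    struct Value { virtual ~Value() = default; };
    struct Instruction : Value {};

    // dyn_cast style: tolerate a mismatch by returning null.
    Instruction *maybeInstr(Value *V) {
      return dynamic_cast<Instruction *>(V);
    }

    // cast style: assert the invariant instead of testing it, so a
    // violation fails loudly rather than flowing a null onward.
    Instruction *mustInstr(Value *V) {
      Instruction *I = dynamic_cast<Instruction *>(V);
      assert(I && "users of a PHI are always instructions");
      return I;
    }

    int main() {
      Instruction I;
      Value *V = &I;
      assert(maybeInstr(V) == mustInstr(V));
    }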
@@ -1395,16 +1399,27 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// parent function RunOnLoop.
BasicBlock *PH = CurLoop->getLoopPreheader();
Value *InitX = PhiX->getIncomingValueForBlock(PH);
- // If we check X != 0 before entering the loop we don't need a zero
- // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
- // loop (if it is used we count CTLZ(X >> 1)).
- if (!IsCntPhiUsedOutsideLoop)
- if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
- if (BranchInst *PreCondBr =
- dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
- if (matchCondition(PreCondBr, PH) == InitX)
- ZeroCheck = true;
- }
+
+ // Make sure the initial value can't be negative; otherwise the ashr in the
+ // loop might never reach zero, which would make the loop infinite.
+ if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
+ return false;
+
+ // If the count phi is not used outside the loop, we require a zero check
+ // as a precondition. Without the check, the loop would run one iteration
+ // before any test of the input value, so 0 and 1 would behave identically
+ // in the original loop, while the countable replacement would distinguish
+ // them; the precondition rules the zero case out.
+ if (!IsCntPhiUsedOutsideLoop) {
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
+ return false;
+ if (matchCondition(PreCondBI, PH) != InitX)
+ return false;
+ ZeroCheck = true;
+ }
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
// if we delete the loop (the loop has only 6 instructions):
@@ -1415,17 +1430,16 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// %inc = add nsw %i.0, 1
// br i1 %tobool
- IRBuilder<> Builder(PH->getTerminator());
- SmallVector<const Value *, 2> Ops =
- {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
- ArrayRef<const Value *> Args(Ops);
+ const Value *Args[] =
+ {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
if (CurLoop->getHeader()->size() != 6 &&
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
TargetTransformInfo::TCC_Basic)
return false;
- const DebugLoc DL = DefX->getDebugLoc();
- transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
+ DefX->getDebugLoc(), ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}
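The shape of the idiom handled here, in source form: a loop that shifts its input right until it reaches zero counts bits, and can be replaced by a closed form built on count-leading-zeros. This also shows why the isKnownNonNegative guard above matters: an arithmetic shift replicates the sign bit, so a negative input never reaches zero. A standalone sketch using the GCC/Clang __builtin_clz (countLoop and countClosed are illustrative names):

    #include <cstdio>

    // Count how many right shifts until x becomes zero.
    unsigned countLoop(unsigned x) {
      unsigned cnt = 0;
      while (x) {
        x >>= 1; // logical shift: guaranteed to reach zero for unsigned x
        ++cnt;
      }
      return cnt;
    }

    // Countable replacement: bit width minus leading zeros, with the zero
    // case handled explicitly (__builtin_clz(0) is undefined).
    unsigned countClosed(unsigned x) {
      return x ? 32u - (unsigned)__builtin_clz(x) : 0u;
    }

    int main() {
      for (unsigned x : {0u, 1u, 2u, 1024u, 0xFFFFFFFFu})
        printf("%u -> loop=%u closed=%u\n", x, countLoop(x), countClosed(x));
      // With a signed type, x >>= 1 on a negative value stays negative
      // forever, which is why the pass rejects AShr on possibly-negative
      // inputs.
    }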
@@ -1461,7 +1475,7 @@ bool LoopIdiomRecognize::recognizePopcount() {
if (!EntryBI || EntryBI->isConditional())
return false;
- // It should have a precondition block where the generated popcount instrinsic
+ // It should have a precondition block where the generated popcount intrinsic
// function can be inserted.
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
@@ -1539,8 +1553,9 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
- const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
- BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+ Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
// Step 1: Insert the CTLZ instruction at the end of the preheader block
// Count = BitWidth - CTLZ(InitX);
@@ -1550,10 +1565,16 @@ void LoopIdiomRecognize::transformLoopToCountable(
Builder.SetCurrentDebugLocation(DL);
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
- if (IsCntPhiUsedOutsideLoop)
- InitXNext = Builder.CreateAShr(InitX,
- ConstantInt::get(InitX->getType(), 1));
- else
+ if (IsCntPhiUsedOutsideLoop) {
+ if (DefX->getOpcode() == Instruction::AShr)
+ InitXNext =
+ Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::LShr)
+ InitXNext =
+ Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else
+ llvm_unreachable("Unexpected opcode!");
+ } else
InitXNext = InitX;
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
Count = Builder.CreateSub(
@@ -1588,7 +1609,7 @@ void LoopIdiomRecognize::transformLoopToCountable(
// ...
// Br: loop if (Dec != 0)
BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = Count->getType();
@@ -1625,8 +1646,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- const DebugLoc DL = CntInst->getDebugLoc();
+ auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc &DL = CntInst->getDebugLoc();
// Assuming before transformation, the loop is following:
// if (x) // the precondition
@@ -1675,7 +1696,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
}
// Step 3: Note that the population count is exactly the trip count of the
- // loop in question, which enable us to to convert the loop from noncountable
+ // loop in question, which enables us to convert the loop from noncountable
// loop into a countable one. The benefit is twofold:
//
// - If the loop only counts population, the entire loop becomes dead after
@@ -1696,7 +1717,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// do { cnt++; x &= x-1; t--) } while (t > 0);
BasicBlock *Body = *(CurLoop->block_begin());
{
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 40d468a084d4..71859efbf4bd 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,8 +20,10 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
@@ -34,7 +36,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <utility>
@@ -45,118 +46,116 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
- AssumptionCache *AC,
- const TargetLibraryInfo *TLI) {
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
-
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC,
+ const TargetLibraryInfo &TLI) {
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+ // On the first pass over the loop body we try to simplify every instruction.
+ // On subsequent passes, we can restrict this to only simplifying instructions
+ // where the inputs have been updated. We end up needing two sets: one
+ // containing the instructions we are simplifying in *this* pass, and one for
+ // the instructions we will want to simplify in the *next* pass. We use
+ // pointers so we can swap between two stably allocated sets.
SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- // The bit we are stealing from the pointer represents whether this basic
- // block is the header of a subloop, in which case we only process its phis.
- using WorklistItem = PointerIntPair<BasicBlock *, 1>;
- SmallVector<WorklistItem, 16> VisitStack;
- SmallPtrSet<BasicBlock *, 32> Visited;
-
- bool Changed = false;
- bool LocalChanged;
- do {
- LocalChanged = false;
-
- VisitStack.clear();
- Visited.clear();
+ // Track the PHI nodes that have already been visited during each iteration so
+ // that we can identify when it is necessary to iterate.
+ SmallPtrSet<PHINode *, 4> VisitedPHIs;
- VisitStack.push_back(WorklistItem(L->getHeader(), false));
+ // While simplifying we may discover dead code or cause code to become dead.
+ // Keep track of all such instructions and we will delete them at the end.
+ SmallVector<Instruction *, 8> DeadInsts;
- while (!VisitStack.empty()) {
- WorklistItem Item = VisitStack.pop_back_val();
- BasicBlock *BB = Item.getPointer();
- bool IsSubloopHeader = Item.getInt();
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ // First we want to create an RPO traversal of the loop body. By processing in
+ // RPO we can ensure that definitions are processed prior to uses (for non-PHI
+ // uses) in all cases. This ensures we maximize the simplifications in each
+ // iteration over the loop and minimize the possible causes for continuing to
+ // iterate.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
- // Simplify instructions in the current basic block.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *I = &*BI++;
-
- // The first time through the loop ToSimplify is empty and we try to
- // simplify all instructions. On later iterations ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(I))
+ bool Changed = false;
+ for (;;) {
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ if (auto *PI = dyn_cast<PHINode>(&I))
+ VisitedPHIs.insert(PI);
+
+ if (I.use_empty()) {
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
continue;
-
- // Don't bother simplifying unused instructions.
- if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
- if (V && LI->replacementPreservesLCSSAForm(I, V)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I->users())
- Next->insert(cast<Instruction>(U));
-
- I->replaceAllUsesWith(V);
- LocalChanged = true;
- ++NumSimplified;
- }
- }
- if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
- // instruction, so simply incrementing the iterator does not work.
- // When instructions get deleted re-iterate instead.
- BI = BB->begin();
- BE = BB->end();
- LocalChanged = true;
}
- if (IsSubloopHeader && !isa<PHINode>(I))
- break;
- }
+        // We special-case the first iteration, which we can detect because
+        // the `ToSimplify` set is empty.
+ bool IsFirstIteration = ToSimplify->empty();
- // Add all successors to the worklist, except for loop exit blocks and the
- // bodies of subloops. We visit the headers of loops so that we can
- // process
- // their phis, but we contract the rest of the subloop body and only
- // follow
- // edges leading back to the original loop.
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
- ++SI) {
- BasicBlock *SuccBB = *SI;
- if (!Visited.insert(SuccBB).second)
+ if (!IsFirstIteration && !ToSimplify->count(&I))
continue;
- const Loop *SuccLoop = LI->getLoopFor(SuccBB);
- if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
- L->contains(SuccLoop)) {
- VisitStack.push_back(WorklistItem(SuccBB, true));
-
- SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
- SuccLoop->getExitBlocks(SubLoopExitBlocks);
-
- for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
- BasicBlock *ExitBB = SubLoopExitBlocks[i];
- if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
- VisitStack.push_back(WorklistItem(ExitBB, false));
- }
-
+ Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+ if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
continue;
- }
- bool IsExitBlock =
- std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
- if (IsExitBlock)
- continue;
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UserI = cast<Instruction>(U.getUser());
+ U.set(V);
+
+          // If the instruction is used by a PHI node we have already processed,
+ // we'll need to iterate on the loop body to converge, so add it to
+ // the next set.
+ if (auto *UserPI = dyn_cast<PHINode>(UserI))
+ if (VisitedPHIs.count(UserPI)) {
+ Next->insert(UserPI);
+ continue;
+ }
+
+ // If we are only simplifying targeted instructions and the user is an
+ // instruction in the loop body, add it to our set of targeted
+ // instructions. Because we process defs before uses (outside of PHIs)
+ // we won't have visited it yet.
+ //
+ // We also skip any uses outside of the loop being simplified. Those
+ // should always be PHI nodes due to LCSSA form, and we don't want to
+ // try to simplify those away.
+ assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+ "Uses outside the loop should be PHI nodes due to LCSSA!");
+ if (!IsFirstIteration && L.contains(UserI))
+ ToSimplify->insert(UserI);
+ }
- VisitStack.push_back(WorklistItem(SuccBB, false));
+ assert(I.use_empty() && "Should always have replaced all uses!");
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ ++NumSimplified;
+ Changed = true;
}
}
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
+ // Delete any dead instructions found thus far now that we've finished an
+ // iteration over all instructions in all the loop blocks.
+ if (!DeadInsts.empty()) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI);
+ }
+
+ // If we never found a PHI that needs to be simplified in the next
+ // iteration, we're done.
+ if (Next->empty())
+ break;
- Changed |= LocalChanged;
- } while (LocalChanged);
+ // Otherwise, put the next set in place for the next iteration and reset it
+ // and the visited PHIs for that iteration.
+ std::swap(Next, ToSimplify);
+ Next->clear();
+ VisitedPHIs.clear();
+ DeadInsts.clear();
+ }
return Changed;
}
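
The ToSimplify/Next scheme above is a general two-set fixpoint idiom: process
everything on the first round, then only re-process items whose inputs changed,
swapping the two sets between rounds so their allocations are reused. A minimal
standalone sketch of just that driver, using std::unordered_set in place of
SmallPtrSet (toy C++; names are illustrative, not part of the pass):

  #include <unordered_set>
  #include <utility>

  // Process() handles either every item (first round, ToSimplify empty) or
  // only the items in ToSimplify, and records follow-up work in Next.
  template <typename T, typename Fn> void runToFixpoint(Fn Process) {
    std::unordered_set<T> S1, S2;
    std::unordered_set<T> *ToSimplify = &S1, *Next = &S2;
    for (;;) {
      Process(/*FirstRound=*/ToSimplify->empty(), *ToSimplify, *Next);
      if (Next->empty())
        break;                     // nothing left to revisit: converged
      std::swap(ToSimplify, Next); // reuse the two allocations
      Next->clear();
    }
  }
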
@@ -174,21 +173,20 @@ public:
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (skipLoop(L))
return false;
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ return simplifyLoopInst(*L, DT, LI, AC, TLI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesCFG();
getLoopAnalysisUsage(AU);
@@ -200,7 +198,7 @@ public:
PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+ if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
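
For reference, the LoopBlocksRPO traversal the new implementation relies on can
be pictured on a toy CFG: in a reverse post-order of a reducible graph, every
block appears after all of its non-backedge predecessors, which is why non-PHI
definitions are visited before their uses. A sketch over plain adjacency lists
(integer block ids, not LLVM types):

  #include <vector>

  // Depth-first post-order over a toy CFG given as adjacency lists.
  static void postOrder(int BB, const std::vector<std::vector<int>> &Succ,
                        std::vector<bool> &Seen, std::vector<int> &Out) {
    Seen[BB] = true;
    for (int S : Succ[BB])
      if (!Seen[S])
        postOrder(S, Succ, Seen, Out);
    Out.push_back(BB);
  }

  // Reverse post-order: the DFS post-order from the entry block, reversed.
  std::vector<int> computeRPO(int Entry,
                              const std::vector<std::vector<int>> &Succ) {
    std::vector<bool> Seen(Succ.size(), false);
    std::vector<int> PO;
    postOrder(Entry, Succ, Seen, PO);
    return std::vector<int>(PO.rbegin(), PO.rend());
  }
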
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 4f8dafef230a..2978165ed8a9 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
@@ -50,6 +52,8 @@ using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
+STATISTIC(LoopsInterchanged, "Number of loops interchanged");
+
static cl::opt<int> LoopInterchangeCostThreshold(
"loop-interchange-threshold", cl::init(0), cl::Hidden,
cl::desc("Interchange if you gain more than this number"));
@@ -73,8 +77,8 @@ static const unsigned MaxLoopNestDepth = 10;
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
- DEBUG(dbgs() << D << " ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << D << " ");
+ LLVM_DEBUG(dbgs() << "\n");
}
}
#endif
@@ -103,8 +107,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
}
}
- DEBUG(dbgs() << "Found " << MemInstr.size()
- << " Loads and Stores to analyze\n");
+ LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
ValueVector::iterator I, IE, J, JE;
@@ -121,11 +125,11 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// Track Output, Flow, and Anti dependencies.
if (auto D = DI->depends(Src, Dst, true)) {
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
- DEBUG(StringRef DepType =
- D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
- dbgs() << "Found " << DepType
- << " dependency between Src and Dst\n"
- << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ LLVM_DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
unsigned Levels = D->getLevels();
char Direction;
for (unsigned II = 1; II <= Levels; ++II) {
@@ -165,17 +169,14 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
DepMatrix.push_back(Dep);
if (DepMatrix.size() > MaxMemInstrCount) {
- DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
return false;
}
}
}
}
- // We don't have a DepMatrix to check legality return false.
- if (DepMatrix.empty())
- return false;
return true;
}
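
The matrix built here records one row per dependence, with a direction
character per loop level. A simplified model of the classic legality rule that
the check below applies (not the pass's exact logic; '*' stands for an unknown
direction): interchanging levels A and B is rejected if, after swapping those
two entries in any row, the leftmost non-'=' entry is '>' or unknown.

  #include <string>
  #include <utility>
  #include <vector>

  // Hypothetical helper: direction-vector legality after swapping levels A, B.
  bool isLegalToSwapLevels(const std::vector<std::string> &DepMatrix,
                           unsigned A, unsigned B) {
    for (std::string Row : DepMatrix) { // copy so we can simulate the swap
      std::swap(Row[A], Row[B]);
      for (char Dir : Row) {
        if (Dir == '<')
          break;        // carried outward in the right direction: row is fine
        if (Dir == '>' || Dir == '*')
          return false; // dependence reversed (or unknown): reject
        // Dir == '=': keep scanning inner levels.
      }
    }
    return true;
  }
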
@@ -271,9 +272,9 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
}
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
- DEBUG(dbgs() << "Calling populateWorklist on Func: "
- << L.getHeader()->getParent()->getName() << " Loop: %"
- << L.getHeader()->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
LoopVector LoopList;
Loop *CurrentLoop = &L;
const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
@@ -404,7 +405,9 @@ public:
/// Interchange OuterLoop and InnerLoop.
bool transform();
- void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void restructureLoops(Loop *NewInner, Loop *NewOuter,
+ BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader);
void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
private:
@@ -453,6 +456,9 @@ struct LoopInterchange : public FunctionPass {
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
}
bool runOnFunction(Function &F) override {
@@ -462,8 +468,7 @@ struct LoopInterchange : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
@@ -473,7 +478,7 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : *LI)
populateWorklist(*L, Worklist);
- DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
bool Changed = true;
while (!Worklist.empty()) {
LoopVector LoopList = Worklist.pop_back_val();
@@ -486,15 +491,15 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (ExitCountOuter == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
return false;
}
if (L->getNumBackEdges() != 1) {
- DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
return false;
}
if (!L->getExitingBlock()) {
- DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
return false;
}
}
@@ -511,53 +516,38 @@ struct LoopInterchange : public FunctionPass {
bool Changed = false;
unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth < 2) {
- DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
return false;
}
if (LoopNestDepth > MaxLoopNestDepth) {
- DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
return false;
}
if (!isComputableLoopNest(LoopList)) {
- DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
return false;
}
- DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
+ << "\n");
CharMatrix DependencyMatrix;
Loop *OuterMostLoop = *(LoopList.begin());
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
OuterMostLoop, DI)) {
- DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
return false;
}
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence before interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
printDepMatrix(DependencyMatrix);
#endif
- BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
- BranchInst *OuterMostLoopLatchBI =
- dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
- if (!OuterMostLoopLatchBI)
- return false;
-
- // Since we currently do not handle LCSSA PHI's any failure in loop
- // condition will now branch to LoopNestExit.
- // TODO: This should be removed once we handle LCSSA PHI nodes.
-
// Get the Outermost loop exit.
- BasicBlock *LoopNestExit;
- if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
- else
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
-
- if (isa<PHINode>(LoopNestExit->begin())) {
- DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
- "since on failure all loops branch to loop nest exit.\n");
+ BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
+ if (!LoopNestExit) {
+    LLVM_DEBUG(dbgs() << "OuterMostLoop needs a unique exit block\n");
return false;
}
@@ -573,9 +563,8 @@ struct LoopInterchange : public FunctionPass {
// Update the DependencyMatrix
interChangeDependencies(DependencyMatrix, i, i - 1);
- DT->recalculate(F);
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence after interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
printDepMatrix(DependencyMatrix);
#endif
Changed |= Interchanged;
@@ -586,21 +575,21 @@ struct LoopInterchange : public FunctionPass {
bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
unsigned OuterLoopId, BasicBlock *LoopNestExit,
std::vector<std::vector<char>> &DependencyMatrix) {
- DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId << "\n");
+ LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
Loop *InnerLoop = LoopList[InnerLoopId];
Loop *OuterLoop = LoopList[OuterLoopId];
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
PreserveLCSSA, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
return false;
}
- DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Interchanging loops not profitable\n");
+ LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
@@ -614,7 +603,8 @@ struct LoopInterchange : public FunctionPass {
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
LIT.transform();
- DEBUG(dbgs() << "Loops interchanged\n");
+ LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
+ LoopsInterchanged++;
return true;
}
};
@@ -631,13 +621,13 @@ bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
    // Loads corresponding to reduction PHI's are safe while concluding if
// tightly nested.
- if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
if (!areAllUsesReductions(L, InnerLoop))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -645,13 +635,13 @@ bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
// Stores corresponding to reductions are safe while concluding if tightly
// nested.
- if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+ if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
if (!isa<PHINode>(L->getOperand(0)))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -662,7 +652,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
// A perfectly nested loop will not have any branch in between the outer and
// inner block i.e. outer header will branch to either inner preheader and
@@ -676,14 +666,14 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
return false;
- DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
  // We do not have any basic block in between; now make sure the outer header
  // and outer loop latch don't contain any unsafe instructions.
if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
containsUnsafeInstructionsInLatch(OuterLoopLatch))
return false;
- DEBUG(dbgs() << "Loops are perfectly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
// We have a perfect loop nest.
return true;
}
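
As an illustration of what this check accepts, two hypothetical loop nests (toy
C++, not from the pass's tests): the first is tightly nested and a candidate
for interchange, the second has extra work between the loops and is rejected.

  // Tightly nested: the outer header branches straight to the inner loop and
  // the outer latch does no extra work.
  void tight(int A[64][64]) {
    for (int i = 0; i < 64; ++i)
      for (int j = 0; j < 64; ++j)
        A[j][i] += 1;
  }

  // Not tightly nested: the extra store executes once per outer iteration, so
  // interchanging would change how often it runs.
  void notTight(int A[64][64], int *Count) {
    for (int i = 0; i < 64; ++i) {
      ++Count[i];
      for (int j = 0; j < 64; ++j)
        A[j][i] += 1;
    }
  }
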
@@ -717,16 +707,15 @@ bool LoopInterchangeLegality::findInductionAndReductions(
SmallVector<PHINode *, 8> &Reductions) {
if (!L->getLoopLatch() || !L->getLoopPredecessor())
return false;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ for (PHINode &PHI : L->getHeader()->phis()) {
RecurrenceDescriptor RD;
InductionDescriptor ID;
- PHINode *PHI = cast<PHINode>(I);
- if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
- Inductions.push_back(PHI);
- else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
- Reductions.push_back(PHI);
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
+ else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
+ Reductions.push_back(&PHI);
else {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
return false;
}
@@ -735,12 +724,11 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
- for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
+ for (PHINode &PHI : Block->phis()) {
    // A reduction LCSSA PHI will have only 1 incoming block, from the loop latch.
- if (PHI->getNumIncomingValues() > 1)
+ if (PHI.getNumIncomingValues() > 1)
return false;
- Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+ Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0));
if (!Ins)
return false;
// Incoming value for lcssa phi's in outer loop exit can only be inner loop
@@ -751,35 +739,38 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
return true;
}
-static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
- BasicBlock *LoopHeader) {
- if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
- assert(BI->getNumSuccessors() == 2 &&
- "Branch leaving loop latch must have 2 successors");
- for (BasicBlock *Succ : BI->successors()) {
- if (Succ == LoopHeader)
- continue;
- return Succ;
- }
- }
- return nullptr;
-}
-
// This function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+  // The transform currently expects the loop latches to also be the exiting
+  // blocks.
+ if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
+ OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
+ !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "Loops where the latch is not the exiting block are not"
+ << " supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Loops where the latch is not the exiting block cannot be"
+                " interchanged currently.";
+ });
+ return true;
+ }
PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
SmallVector<PHINode *, 8> Reductions;
if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
InnerLoop->getStartLoc(),
@@ -792,8 +783,9 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
+ LLVM_DEBUG(
+ dbgs() << "We currently only support loops with 1 induction variable."
+               << " Failed to interchange due to current limitation.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
InnerLoop->getStartLoc(),
@@ -809,8 +801,9 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerInductionVar = Inductions.pop_back_val();
Reductions.clear();
if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
OuterLoop->getStartLoc(),
@@ -824,8 +817,8 @@ bool LoopInterchangeLegality::currentLimitations() {
  // The outer loop cannot have a reduction because then the loops will not be
  // tightly nested.
if (!Reductions.empty()) {
- DEBUG(dbgs() << "Outer loops with reductions are not supported "
- << "currently.\n");
+ LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
OuterLoop->getStartLoc(),
@@ -837,8 +830,8 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
+    LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variable are not "
+ << "supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
OuterLoop->getStartLoc(),
@@ -851,7 +844,7 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
- DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
InnerLoop->getStartLoc(),
@@ -862,23 +855,10 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
- BasicBlock *LoopExitBlock =
- getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with LCSSA PHIs can be interchange "
- "currently.";
- });
- return true;
- }
-
- LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ BasicBlock *InnerExit = InnerLoop->getExitBlock();
+ if (!containsSafePHI(InnerExit, false)) {
+ LLVM_DEBUG(
+ dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
InnerLoop->getStartLoc(),
@@ -908,8 +888,9 @@ bool LoopInterchangeLegality::currentLimitations() {
dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
if (!InnerIndexVarInc) {
- DEBUG(dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
+ LLVM_DEBUG(
+ dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
InnerLoop->getStartLoc(),
@@ -924,7 +905,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// instruction.
bool FoundInduction = false;
- for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
+ for (const Instruction &I :
+ llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
isa<ZExtInst>(I))
continue;
@@ -932,8 +914,8 @@ bool LoopInterchangeLegality::currentLimitations() {
    // We found an instruction. If this is not the induction variable, then it
    // is not safe to split this loop latch.
if (!I.isIdenticalTo(InnerIndexVarInc)) {
- DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
+ LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(
DEBUG_TYPE, "UnsupportedInsBetweenInduction",
@@ -950,7 +932,7 @@ bool LoopInterchangeLegality::currentLimitations() {
  // The loop latch ended and we didn't find the induction variable, so return
  // as a current limitation.
if (!FoundInduction) {
- DEBUG(dbgs() << "Did not find the induction variable.\n");
+ LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
InnerLoop->getStartLoc(),
@@ -962,13 +944,50 @@ bool LoopInterchangeLegality::currentLimitations() {
return false;
}
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessors,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// even if the inner loop is not executed.
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
+
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
- DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId
- << " due to dependence\n");
+ LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
InnerLoop->getStartLoc(),
@@ -977,16 +996,23 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
});
return false;
}
-
// Check if outer and inner loop contain legal instructions only.
for (auto *BB : OuterLoop->blocks())
- for (Instruction &I : *BB)
+ for (Instruction &I : BB->instructionsWithoutDebug())
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// readnone functions do not prevent interchanging.
if (CI->doesNotReadMemory())
continue;
- DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
- << "safely.");
+ LLVM_DEBUG(
+ dbgs() << "Loops with call instructions cannot be interchanged "
+ << "safely.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
+ CI->getDebugLoc(),
+ CI->getParent())
+ << "Cannot interchange loops due to call instruction.";
+ });
+
return false;
}
@@ -1015,13 +1041,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// TODO: The loops could not be interchanged due to current limitations in the
// transform module.
if (currentLimitations()) {
- DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
return false;
}
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
- DEBUG(dbgs() << "Loops not tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
InnerLoop->getStartLoc(),
@@ -1032,6 +1058,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
+ if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
return true;
}
@@ -1100,7 +1137,8 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
}
// If outer loop has dependence and inner loop is loop independent then it is
// profitable to interchange to enable parallelism.
- return true;
+ // If there are no dependences, interchanging will not improve anything.
+ return !DepMatrix.empty();
}
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
@@ -1115,7 +1153,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
  // of induction variables in the instruction and allows reordering if the
  // number of bad orders is greater than the number of good ones.
int Cost = getInstrOrderCost();
- DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
if (Cost < -LoopInterchangeCostThreshold)
return true;
@@ -1138,33 +1176,88 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Loop *InnerLoop) {
- for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
- ++I) {
- if (*I == InnerLoop) {
- OuterLoop->removeChildLoop(I);
+ for (Loop *L : *OuterLoop)
+ if (L == InnerLoop) {
+ OuterLoop->removeChildLoop(L);
return;
}
- }
llvm_unreachable("Couldn't find loop");
}
-void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
- Loop *OuterLoop) {
+/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
+/// new inner and outer loop after interchanging: NewInner is the original
+/// outer loop and NewOuter is the original inner loop.
+///
+/// Before interchanging, we have the following structure:
+/// Outer preheader
+//  Outer header
+//    Inner preheader
+//    Inner header
+//      Inner body
+//      Inner latch
+//    outer bbs
+//  Outer latch
+//
+// After interchanging:
+// Inner preheader
+// Inner header
+//   Outer preheader
+//   Outer header
+//     Inner body
+//     outer bbs
+//   Outer latch
+// Inner latch
+void LoopInterchangeTransform::restructureLoops(
+ Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader) {
Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ // The original inner loop preheader moves from the new inner loop to
+ // the parent loop, if there is one.
+ NewInner->removeBlockFromLoop(OrigInnerPreHeader);
+ LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
+
+ // Switch the loop levels.
if (OuterLoopParent) {
// Remove the loop from its parent loop.
- removeChildLoop(OuterLoopParent, OuterLoop);
- removeChildLoop(OuterLoop, InnerLoop);
- OuterLoopParent->addChildLoop(InnerLoop);
+ removeChildLoop(OuterLoopParent, NewInner);
+ removeChildLoop(NewInner, NewOuter);
+ OuterLoopParent->addChildLoop(NewOuter);
} else {
- removeChildLoop(OuterLoop, InnerLoop);
- LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ removeChildLoop(NewInner, NewOuter);
+ LI->changeTopLevelLoop(NewInner, NewOuter);
+ }
+ while (!NewOuter->empty())
+ NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
+ NewOuter->addChildLoop(NewInner);
+
+ // BBs from the original inner loop.
+ SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
+
+ // Add BBs from the original outer loop to the original inner loop (excluding
+ // BBs already in inner loop)
+ for (BasicBlock *BB : NewInner->blocks())
+ if (LI->getLoopFor(BB) == NewInner)
+ NewOuter->addBlockEntry(BB);
+
+  // Now remove the new outer loop's header and latch (the original inner
+  // loop's header and latch) from the new inner loop and move the other BBs
+  // (the loop body) to the new inner loop.
+ BasicBlock *OuterHeader = NewOuter->getHeader();
+ BasicBlock *OuterLatch = NewOuter->getLoopLatch();
+ for (BasicBlock *BB : OrigInnerBBs) {
+ // Nothing will change for BBs in child loops.
+ if (LI->getLoopFor(BB) != NewOuter)
+ continue;
+ // Remove the new outer loop header and latch from the new inner loop.
+ if (BB == OuterHeader || BB == OuterLatch)
+ NewInner->removeBlockFromLoop(BB);
+ else
+ LI->changeLoopFor(BB, NewInner);
}
- while (!InnerLoop->empty())
- OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
-
- InnerLoop->addChildLoop(OuterLoop);
+ // The preheader of the original outer loop becomes part of the new
+ // outer loop.
+ NewOuter->addBlockEntry(OrigOuterPreHeader);
+ LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
}
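
Besides re-assigning basic blocks, the function re-parents the loops
themselves. A stripped-down model of just that tree surgery, with toy nodes
standing in for llvm::Loop (hypothetical helper; assumes C is a direct child
of P):

  #include <algorithm>
  #include <vector>

  struct LoopNode {
    LoopNode *Parent = nullptr;
    std::vector<LoopNode *> Children;
  };

  // Swap parent loop P with its child C, as restructureLoops does at the
  // LoopInfo level: C takes P's place under P's parent, C's subloops move to
  // P, and P becomes the sole child of C.
  void swapParentChild(LoopNode *P, LoopNode *C) {
    LoopNode *GP = P->Parent;
    P->Children.erase(std::find(P->Children.begin(), P->Children.end(), C));
    C->Parent = GP;
    if (GP)
      *std::find(GP->Children.begin(), GP->Children.end(), P) = C;
    for (LoopNode *Sub : C->Children) { // former inner subloops now sit under P
      Sub->Parent = P;
      P->Children.push_back(Sub);
    }
    C->Children.clear();
    C->Children.push_back(P);
    P->Parent = C;
  }
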
bool LoopInterchangeTransform::transform() {
@@ -1173,10 +1266,10 @@ bool LoopInterchangeTransform::transform() {
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
if (!InductionPHI) {
- DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
return false;
}
@@ -1185,8 +1278,7 @@ bool LoopInterchangeTransform::transform() {
else
InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
- // Ensure that InductionPHI is the first Phi node as required by
- // splitInnerLoopHeader
+ // Ensure that InductionPHI is the first Phi node.
if (&InductionPHI->getParent()->front() != InductionPHI)
InductionPHI->moveBefore(&InductionPHI->getParent()->front());
@@ -1194,20 +1286,20 @@ bool LoopInterchangeTransform::transform() {
// incremented/decremented.
    // TODO: This splitting logic may not always work. Fix this.
splitInnerLoopLatch(InnerIndexVar);
- DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+ LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
// Splits the inner loops phi nodes out into a separate basic block.
- splitInnerLoopHeader();
- DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
}
Transformed |= adjustLoopLinks();
if (!Transformed) {
- DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
return false;
}
- restructureLoops(InnerLoop, OuterLoop);
return true;
}
@@ -1217,38 +1309,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
}
-void LoopInterchangeTransform::splitInnerLoopHeader() {
- // Split the inner loop header out. Here make sure that the reduction PHI's
- // stay in the innerloop body.
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- if (InnerLoopHasReduction) {
- // Note: The induction PHI must be the first PHI for this to work
- BasicBlock *New = InnerLoopHeader->splitBasicBlock(
- ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
- if (LI)
- if (Loop *L = LI->getLoopFor(InnerLoopHeader))
- L->addBasicBlockToLoop(New, *LI);
-
- // Adjust Reduction PHI's in the block.
- SmallVector<PHINode *, 8> PHIVec;
- for (auto I = New->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = dyn_cast<PHINode>(I);
- Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
- PHI->replaceAllUsesWith(V);
- PHIVec.push_back((PHI));
- }
- for (PHINode *P : PHIVec) {
- P->eraseFromParent();
- }
- } else {
- SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
- }
-
- DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
- "InnerLoopHeader\n");
-}
-
/// \brief Move all instructions except the terminator from FromBB right before
/// InsertBefore
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
@@ -1262,18 +1322,40 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
BasicBlock *OldPred,
BasicBlock *NewPred) {
- for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
- unsigned Num = PHI->getNumIncomingValues();
+ for (PHINode &PHI : CurrBlock->phis()) {
+ unsigned Num = PHI.getNumIncomingValues();
for (unsigned i = 0; i < Num; ++i) {
- if (PHI->getIncomingBlock(i) == OldPred)
- PHI->setIncomingBlock(i, NewPred);
+ if (PHI.getIncomingBlock(i) == OldPred)
+ PHI.setIncomingBlock(i, NewPred);
+ }
+ }
+}
+
+/// Update BI to jump to NewBB instead of OldBB. Records updates to
+/// the dominator tree in DTUpdates, if DT should be preserved.
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ std::vector<DominatorTree::UpdateType> &DTUpdates) {
+ assert(llvm::count_if(BI->successors(),
+ [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 &&
+ "BI must jump to OldBB at most once.");
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) {
+ if (BI->getSuccessor(i) == OldBB) {
+ BI->setSuccessor(i, NewBB);
+
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ break;
}
}
}
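
The point of threading DTUpdates through is to batch CFG edge changes and hand
them to DominatorTree::applyUpdates once, rather than recalculating the tree
after every edit (the DT->recalculate(F) call this change removes). A toy
model of the same pattern over an edge-list CFG (illustrative types, not the
LLVM API):

  #include <utility>
  #include <vector>

  enum class UpdateKind { Insert, Delete };
  struct CFGUpdate {
    UpdateKind Kind;
    int From, To; // toy block ids
  };

  // Retarget the edge From->Old to From->New and record the matching pair of
  // dominator tree updates for one batched application later.
  void retargetEdge(std::vector<std::pair<int, int>> &Edges, int From, int Old,
                    int New, std::vector<CFGUpdate> &Updates) {
    for (auto &E : Edges)
      if (E.first == From && E.second == Old) {
        E.second = New;
        Updates.push_back({UpdateKind::Insert, From, New});
        Updates.push_back({UpdateKind::Delete, From, Old});
        break;
      }
  }
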
bool LoopInterchangeTransform::adjustLoopBranches() {
- DEBUG(dbgs() << "adjustLoopBranches called\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
+ std::vector<DominatorTree::UpdateType> DTUpdates;
+
// Adjust the loop preheader
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
@@ -1313,27 +1395,18 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
return false;
// Adjust Loop Preheader and headers
-
- unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
- OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
- }
-
- NumSucc = OuterLoopHeaderBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
- OuterLoopHeaderBI->setSuccessor(i, LoopExit);
- else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
- OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
- }
+ updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
+ InnerLoopPreHeader, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
+ InnerLoopHeaderSuccessor, DTUpdates);
// Adjust reduction PHI's now that the incoming block has changed.
updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
OuterLoopHeader);
- BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
- InnerLoopHeaderBI->eraseFromParent();
+ updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
+ OuterLoopPreHeader, DTUpdates);
// -------------Adjust loop latches-----------
if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
@@ -1341,19 +1414,15 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
- NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
- InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
- }
+ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
+ InnerLoopLatchSuccessor, DTUpdates);
// Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
// the value and remove this PHI node from inner loop.
SmallVector<PHINode *, 8> LcssaVec;
- for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
- PHINode *LcssaPhi = cast<PHINode>(I);
- LcssaVec.push_back(LcssaPhi);
- }
+ for (PHINode &P : InnerLoopLatchSuccessor->phis())
+ LcssaVec.push_back(&P);
+
for (PHINode *P : LcssaVec) {
Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
P->replaceAllUsesWith(Incoming);
@@ -1365,19 +1434,52 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
- if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
- InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
- else
- InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+ updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
+ OuterLoopLatchSuccessor, DTUpdates);
+ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
+ DTUpdates);
updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
- if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
- OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
- } else {
- OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ DT->applyUpdates(DTUpdates);
+ restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
+ OuterLoopPreHeader);
+
+ // Now update the reduction PHIs in the inner and outer loop headers.
+ SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+ for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+ for (PHINode *PHI : OuterLoopPHIs)
+ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+
+ // Move the PHI nodes from the inner loop header to the outer loop header.
+  // We have to deal with one kind of PHI node:
+ // 1) PHI nodes that are part of inner loop-only reductions.
+ // We only have to move the PHI node and update the incoming blocks.
+ for (PHINode *PHI : InnerLoopPHIs) {
+ PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+ for (BasicBlock *InBB : PHI->blocks()) {
+ if (InnerLoop->contains(InBB))
+ continue;
+
+ assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
+ "Unexpected incoming PHI node, reductions in outer loop are not "
+ "supported yet");
+ PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
+ PHI->eraseFromParent();
+ break;
+ }
}
+ // Update the incoming blocks for moved PHI nodes.
+ updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
+ updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch);
+
return true;
}
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index dfa5ec1f354d..19bd9ebcc15b 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -25,7 +25,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -52,6 +52,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
#include <cassert>
@@ -79,7 +80,7 @@ STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
namespace {
-/// \brief Represent a store-to-forwarding candidate.
+/// Represent a store-to-forwarding candidate.
struct StoreToLoadForwardingCandidate {
LoadInst *Load;
StoreInst *Store;
@@ -87,7 +88,7 @@ struct StoreToLoadForwardingCandidate {
StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
: Load(Load), Store(Store) {}
- /// \brief Return true if the dependence from the store to the load has a
+ /// Return true if the dependence from the store to the load has a
/// distance of one. E.g. A[i+1] = A[i]
bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
Loop *L) const {
@@ -136,7 +137,7 @@ struct StoreToLoadForwardingCandidate {
} // end anonymous namespace
-/// \brief Check if the store dominates all latches, so as long as there is no
+/// Check if the store dominates all latches, so as long as there is no
/// intervening store this value will be loaded in the next iteration.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
DominatorTree *DT) {
@@ -147,21 +148,21 @@ static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
});
}
-/// \brief Return true if the load is not executed on all paths in the loop.
+/// Return true if the load is not executed on all paths in the loop.
static bool isLoadConditional(LoadInst *Load, Loop *L) {
return Load->getParent() != L->getHeader();
}
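
To make the distance-one pattern concrete, a hypothetical source loop and the
scalarized shape the elimination effectively produces (illustrative C++,
assuming the initial load is safe to execute in the preheader; not compiler
output):

  // Before: the load A[i] reads the value stored one iteration earlier.
  void before(int *A, int N, int X) {
    for (int i = 0; i < N; ++i)
      A[i + 1] = A[i] + X;
  }

  // After: the stored value is carried across the backedge in a scalar (the
  // PHI the pass builds), and the in-loop load disappears.
  void after(int *A, int N, int X) {
    int Carried = A[0]; // initial load, hoisted to the preheader
    for (int i = 0; i < N; ++i) {
      int V = Carried + X;
      A[i + 1] = V;
      Carried = V; // forwarded to the next iteration
    }
  }
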
namespace {
-/// \brief The per-loop class that does most of the work.
+/// The per-loop class that does most of the work.
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
: L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
- /// \brief Look through the loop-carried and loop-independent dependences in
+ /// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
///
/// Note that no candidate is returned if LAA has failed to analyze the loop
@@ -178,7 +179,7 @@ public:
// forward and backward dependences qualify. Disqualify loads that have
// other unknown dependences.
- SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+ SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
for (const auto &Dep : *Deps) {
Instruction *Source = Dep.getSource(LAI);
@@ -222,14 +223,14 @@ public:
return Candidates;
}
- /// \brief Return the index of the instruction according to program order.
+ /// Return the index of the instruction according to program order.
unsigned getInstrIndex(Instruction *Inst) {
auto I = InstOrder.find(Inst);
assert(I != InstOrder.end() && "No index for instruction");
return I->second;
}
- /// \brief If a load has multiple candidates associated (i.e. different
+ /// If a load has multiple candidates associated (i.e. different
/// stores), it means that it could be forwarding from multiple stores
/// depending on control flow. Remove these candidates.
///
@@ -284,22 +285,24 @@ public:
Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
if (LoadToSingleCand[Cand.Load] != &Cand) {
- DEBUG(dbgs() << "Removing from candidates: \n" << Cand
- << " The load may have multiple stores forwarding to "
- << "it\n");
+ LLVM_DEBUG(
+ dbgs() << "Removing from candidates: \n"
+ << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
return true;
}
return false;
});
}
- /// \brief Given two pointers operations by their RuntimePointerChecking
+  /// Given two pointer operations by their RuntimePointerChecking
/// indices, return true if they require an alias check.
///
/// We need a check if one is a pointer for a candidate load and the other is
/// a pointer for a possibly intervening store.
bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
- const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const SmallPtrSet<Value *, 4> &PtrsWrittenOnFwdingPath,
const std::set<Value *> &CandLoadPtrs) {
Value *Ptr1 =
LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
@@ -309,11 +312,11 @@ public:
(PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
}
- /// \brief Return pointers that are possibly written to on the path from a
+ /// Return pointers that are possibly written to on the path from a
/// forwarding store to a load.
///
/// These pointers need to be alias-checked against the forwarding candidates.
- SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
// From FirstStore to LastLoad neither of the elimination candidate loads
// should overlap with any of the stores.
@@ -351,7 +354,7 @@ public:
// We're looking for stores after the first forwarding store until the end
// of the loop, then from the beginning of the loop until the last
// forwarded-to load. Collect the pointer for the stores.
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
auto InsertStorePtr = [&](Instruction *I) {
if (auto *S = dyn_cast<StoreInst>(I))
@@ -366,16 +369,16 @@ public:
return PtrsWrittenOnFwdingPath;
}
- /// \brief Determine the pointer alias checks to prove that there are no
+ /// Determine the pointer alias checks to prove that there are no
/// intervening stores.
SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
findPointersWrittenOnForwardingPath(Candidates);
// Collect the pointers of the candidate loads.
- // FIXME: SmallSet does not work with std::inserter.
+ // FIXME: SmallPtrSet does not work with std::inserter.
std::set<Value *> CandLoadPtrs;
transform(Candidates,
std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
@@ -394,13 +397,14 @@ public:
return false;
});
- DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
- DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
+ << "):\n");
+ LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
return Checks;
}
- /// \brief Perform the transformation for a candidate.
+ /// Perform the transformation for a candidate.
void
propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
SCEVExpander &SEE) {
@@ -436,11 +440,11 @@ public:
Cand.Load->replaceAllUsesWith(PHI);
}
- /// \brief Top-level driver for each loop: find store->load forwarding
+ /// Top-level driver for each loop: find store->load forwarding
/// candidates, add run-time checks and perform transformation.
bool processLoop() {
- DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
// Look for store-to-load forwarding cases across the
// backedge. E.g.:
@@ -479,7 +483,7 @@ public:
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
unsigned NumForwarding = 0;
for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
- DEBUG(dbgs() << "Candidate " << Cand);
+ LLVM_DEBUG(dbgs() << "Candidate " << Cand);
    // Make sure that the stored value is available everywhere in the loop in
    // the next iteration.
@@ -498,9 +502,10 @@ public:
continue;
++NumForwarding;
- DEBUG(dbgs()
- << NumForwarding
- << ". Valid store-to-load forwarding across the loop backedge\n");
+ LLVM_DEBUG(
+ dbgs()
+ << NumForwarding
+ << ". Valid store-to-load forwarding across the loop backedge\n");
Candidates.push_back(Cand);
}
if (Candidates.empty())
@@ -513,25 +518,26 @@ public:
// Too many checks are likely to outweigh the benefits of forwarding.
if (Checks.size() > Candidates.size() * CheckPerElim) {
- DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
return false;
}
if (LAI.getPSE().getUnionPredicate().getComplexity() >
LoadElimSCEVCheckThreshold) {
- DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
return false;
}
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (L->getHeader()->getParent()->optForSize()) {
- DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
- "for size.\n");
+ LLVM_DEBUG(
+ dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
return false;
}
if (!L->isLoopSimplifyForm()) {
- DEBUG(dbgs() << "Loop is not is loop-simplify form");
+      LLVM_DEBUG(dbgs() << "Loop is not in loop-simplify form\n");
return false;
}
@@ -558,7 +564,7 @@ public:
private:
Loop *L;
- /// \brief Maps the load/store instructions to their index according to
+ /// Maps the load/store instructions to their index according to
/// program order.
DenseMap<Instruction *, unsigned> InstOrder;
@@ -599,7 +605,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
namespace {
-/// \brief The pass. Most of the work is delegated to the per-loop
+/// The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
public:
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 2e4c7b19e476..561ceea1d880 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -155,7 +155,7 @@
// When S = -1 (i.e. reverse iterating loop), the transformation is supported
// when:
// * The loop has a single latch with the condition of the form:
-// B(X) = X <pred> latchLimit, where <pred> is u> or s>.
+// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
// * The guard condition is of the form
// G(X) = X - 1 u< guardLimit
//
@@ -171,9 +171,14 @@
// guardStart u< guardLimit && latchLimit u>= 1.
// Similarly for sgt condition the widened condition is:
// guardStart u< guardLimit && latchLimit s>= 1.
+// For uge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit u> 1.
+// For sge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s> 1.
//===----------------------------------------------------------------------===//
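To make the reverse-iterating (S = -1) case above concrete, here is a hypothetical C++ sketch of a loop with the sgt latch shape; the names and the deoptimize() stub are illustrative, not from this patch:

extern void deoptimize(); // stand-in for the guard's deopt side exit

void sketch(unsigned guardStart, unsigned guardLimit, unsigned *A) {
  const int latchLimit = 1;
  for (unsigned X = guardStart; (int)X > latchLimit; --X) { // B(X) = X s> latchLimit
    if (!(X - 1 < guardLimit)) // G(X) = X - 1 u< guardLimit (unsigned compare)
      deoptimize();
    A[X - 1] = 0;
  }
}
// For the sgt latch the widened check hoisted to the preheader would be
//   guardStart u< guardLimit && latchLimit s>= 1,
// and once it holds, G(X) holds for every X the loop visits, so the
// in-loop guard can be folded away.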
#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -198,6 +203,20 @@ static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+// This is the scale factor for the latch probability. We use this during
+// profitability analysis to find other exiting blocks that have a much higher
+// probability of exiting the loop than the latch does.
+// This value should be greater than 1 for a sane profitability check.
+static cl::opt<float> LatchExitProbabilityScale(
+ "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
+ cl::desc("scale factor for the latch probability. Value should be greater "
+ "than 1. Lower values are ignored"));
+
namespace {
class LoopPredication {
/// Represents an induction variable check:
@@ -217,6 +236,7 @@ class LoopPredication {
};
ScalarEvolution *SE;
+ BranchProbabilityInfo *BPI;
Loop *L;
const DataLayout *DL;
@@ -250,6 +270,12 @@ class LoopPredication {
IRBuilder<> &Builder);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ // If the loop always exits through another block in the loop, we should not
+ // predicate based on the latch check. For example, the latch check can be a
+ // very coarse grained check and there can be more fine grained exit checks
+ // within the loop. We identify such unprofitable loops through BPI.
+ bool isLoopProfitableToPredicate();
+
// When the IV type is wider than the range operand type, we can still do loop
// predication, by generating SCEVs for the range and latch that are of the
// same type. We achieve this by generating a SCEV truncate expression for the
@@ -266,8 +292,10 @@ class LoopPredication {
// Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
// so.
Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
+
public:
- LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ LoopPredication(ScalarEvolution *SE, BranchProbabilityInfo *BPI)
+ : SE(SE), BPI(BPI){};
bool runOnLoop(Loop *L);
};
@@ -279,6 +307,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
@@ -286,7 +315,9 @@ public:
if (skipLoop(L))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopPredication LP(SE);
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ LoopPredication LP(SE, &BPI);
return LP.runOnLoop(L);
}
};
@@ -296,6 +327,7 @@ char LoopPredicationLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
@@ -307,7 +339,11 @@ Pass *llvm::createLoopPredicationPass() {
PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- LoopPredication LP(&AR.SE);
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ LoopPredication LP(&AR.SE, BPI);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -375,11 +411,11 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
if (!NewLatchCheck.IV)
return None;
NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
- DEBUG(dbgs() << "IV of type: " << *LatchType
- << "can be represented as range check type:" << *RangeCheckType
- << "\n");
- DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
- DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:"
+ << *RangeCheckType << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
return NewLatchCheck;
}
@@ -412,30 +448,15 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit) || !CanExpand(RHS)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
- ICmpInst::Predicate LimitCheckPred;
- switch (LatchCheck.Pred) {
- case ICmpInst::ICMP_ULT:
- LimitCheckPred = ICmpInst::ICMP_ULE;
- break;
- case ICmpInst::ICMP_ULE:
- LimitCheckPred = ICmpInst::ICMP_ULT;
- break;
- case ICmpInst::ICMP_SLT:
- LimitCheckPred = ICmpInst::ICMP_SLE;
- break;
- case ICmpInst::ICMP_SLE:
- LimitCheckPred = ICmpInst::ICMP_SLT;
- break;
- default:
- llvm_unreachable("Unsupported loop latch!");
- }
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
- DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
- DEBUG(dbgs() << "RHS: " << *RHS << "\n");
- DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+ LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
Instruction *InsertAt = Preheader->getTerminator();
auto *LimitCheck =
@@ -454,16 +475,16 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
const SCEV *LatchLimit = LatchCheck.Limit;
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
// The decrement of the latch check IV should be the same as the
// rangeCheckIV.
auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
if (RangeCheck.IV != PostDecLatchCheckIV) {
- DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
- << *PostDecLatchCheckIV
- << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
return None;
}
@@ -472,9 +493,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
// latchLimit <pred> 1.
// See the header comment for reasoning of the checks.
Instruction *InsertAt = Preheader->getTerminator();
- auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
- ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_UGE;
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT,
GuardStart, GuardLimit, InsertAt);
auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit,
@@ -488,8 +508,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
SCEVExpander &Expander,
IRBuilder<> &Builder) {
- DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- DEBUG(ICI->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ LLVM_DEBUG(ICI->dump());
// parseLoopStructure guarantees that the latch condition is:
// ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
@@ -497,34 +517,34 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
// i u< guardLimit
auto RangeCheck = parseLoopICmp(ICI);
if (!RangeCheck) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
- DEBUG(dbgs() << "Guard check:\n");
- DEBUG(RangeCheck->dump());
+ LLVM_DEBUG(dbgs() << "Guard check:\n");
+ LLVM_DEBUG(RangeCheck->dump());
if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
+ << RangeCheck->Pred << ")!\n");
return None;
}
auto *RangeCheckIV = RangeCheck->IV;
if (!RangeCheckIV->isAffine()) {
- DEBUG(dbgs() << "Range check IV is not affine!\n");
+ LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
return None;
}
auto *Step = RangeCheckIV->getStepRecurrence(*SE);
// We cannot just compare with latch IV step because the latch and range IVs
// may have different types.
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
return None;
}
auto *Ty = RangeCheckIV->getType();
auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
if (!CurrLatchCheckOpt) {
- DEBUG(dbgs() << "Failed to generate a loop latch check "
- "corresponding to range type: "
- << *Ty << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
return None;
}
@@ -535,7 +555,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
"Range and latch steps should be of same type!");
if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
- DEBUG(dbgs() << "Range and latch have different step values!\n");
+ LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
return None;
}
@@ -551,14 +571,14 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
SCEVExpander &Expander) {
- DEBUG(dbgs() << "Processing guard:\n");
- DEBUG(Guard->dump());
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(Guard->dump());
IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
// The guard condition is expected to be in form of:
// cond1 && cond2 && cond3 ...
- // Iterate over subconditions looking for for icmp conditions which can be
+ // Iterate over subconditions looking for icmp conditions which can be
// widened across loop iterations. While widening these conditions, remember
// the resulting list of subconditions in the Checks vector.
SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
@@ -605,7 +625,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
LastCheck = Builder.CreateAnd(LastCheck, Check);
Guard->setOperand(0, LastCheck);
- DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
return true;
}
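The worklist traversal described above can be pictured with a small self-contained sketch (assumed, simplified types; the real code walks llvm::Value operands via pattern matching):

#include <vector>

// Toy stand-in for an expression node: either an And of two operands or a
// leaf (a candidate icmp subcondition).
struct Node {
  Node *LHS = nullptr, *RHS = nullptr;
  bool IsAnd = false;
};

// Split cond1 && cond2 && ... into its leaves, the way the worklist loop in
// widenGuardConditions does before trying to widen each icmp leaf.
static void collectLeaves(Node *Root, std::vector<Node *> &Leaves) {
  std::vector<Node *> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (N->IsAnd) { // descend into both sides of the And
      Worklist.push_back(N->LHS);
      Worklist.push_back(N->RHS);
    } else {
      Leaves.push_back(N); // candidate subcondition
    }
  }
}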
@@ -614,7 +634,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
BasicBlock *LoopLatch = L->getLoopLatch();
if (!LoopLatch) {
- DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
return None;
}
@@ -625,7 +645,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
if (!match(LoopLatch->getTerminator(),
m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
FalseDest))) {
- DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
return None;
}
assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
@@ -635,20 +655,20 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
auto Result = parseLoopICmp(Pred, LHS, RHS);
if (!Result) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
// Check affine first, so if it's not we don't try to compute the step
// recurrence.
if (!Result->IV->isAffine()) {
- DEBUG(dbgs() << "The induction variable is not affine!\n");
+ LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
return None;
}
auto *Step = Result->IV->getStepRecurrence(*SE);
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
return None;
}
@@ -658,13 +678,14 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
} else {
assert(Step->isAllOnesValue() && "Step should be -1!");
- return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT;
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
}
};
if (IsUnsupportedPredicate(Step, Result->Pred)) {
- DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
return None;
}
return Result;
@@ -700,11 +721,65 @@ bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
}
+bool LoopPredication::isLoopProfitableToPredicate() {
+ if (SkipProfitabilityChecks || !BPI)
+ return true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 8> ExitEdges;
+ L->getExitEdges(ExitEdges);
+ // If there is only one exiting edge in the loop, it is always profitable to
+ // predicate the loop.
+ if (ExitEdges.size() == 1)
+ return true;
+
+ // Calculate the exiting probabilities of all exiting edges from the loop,
+ // starting with the LatchExitProbability.
+ // Heuristic for profitability: If any exiting block's probability of exiting
+ // the loop is larger than the probability of exiting through the latch
+ // block, it is not profitable to predicate the loop.
+ auto *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Should have a single latch at this point!");
+ auto *LatchTerm = LatchBlock->getTerminator();
+ assert(LatchTerm->getNumSuccessors() == 2 &&
+ "expected to be an exiting block with 2 succs!");
+ unsigned LatchBrExitIdx =
+ LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ BranchProbability LatchExitProbability =
+ BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+
+ // Protect against degenerate inputs provided by the user. Providing a value
+ // less than one can invert the definition of profitable loop predication.
+ float ScaleFactor = LatchExitProbabilityScale;
+ if (ScaleFactor < 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Ignored user setting for loop-predication-latch-probability-scale: "
+ << LatchExitProbabilityScale << "\n");
+ LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
+ ScaleFactor = 1.0;
+ }
+ const auto LatchProbabilityThreshold =
+ LatchExitProbability * ScaleFactor;
+
+ for (const auto &ExitEdge : ExitEdges) {
+ BranchProbability ExitingBlockProbability =
+ BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ // Some exiting edge has higher probability than the latch exiting edge.
+ // No longer profitable to predicate.
+ if (ExitingBlockProbability > LatchProbabilityThreshold)
+ return false;
+ }
+ // Using BPI, we have concluded that the most probable way to exit from the
+ // loop is through the latch (or there's no profile information and all
+ // exits are equally likely).
+ return true;
+}
+
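A small worked example of this heuristic, with assumed probabilities (not values from the patch): with the default scale of 2.0, a latch exit probability of 0.10 gives a threshold of 0.20, so a second exit taken 30% of the time makes the loop unprofitable to predicate.

#include <cassert>

// Sketch of the comparison above using plain floats instead of
// BranchProbability; names are illustrative.
static bool profitableSketch(float LatchExitProb, float OtherExitProb,
                             float Scale /* clamped to >= 1 */) {
  return !(OtherExitProb > LatchExitProb * Scale);
}

int main() {
  assert(!profitableSketch(0.10f, 0.30f, 2.0f)); // other exit dominates: reject
  assert(profitableSketch(0.10f, 0.15f, 2.0f));  // latch still dominates: accept
}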
bool LoopPredication::runOnLoop(Loop *Loop) {
L = Loop;
- DEBUG(dbgs() << "Analyzing ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ");
+ LLVM_DEBUG(L->dump());
Module *M = L->getHeader()->getModule();
@@ -725,9 +800,13 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
return false;
LatchCheck = *LatchCheckOpt;
- DEBUG(dbgs() << "Latch check:\n");
- DEBUG(LatchCheck.dump());
+ LLVM_DEBUG(dbgs() << "Latch check:\n");
+ LLVM_DEBUG(LatchCheck.dump());
+ if (!isLoopProfitableToPredicate()) {
+ LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
+ return false;
+ }
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index d1a54b877950..9a99e5925572 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -17,7 +17,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -28,6 +28,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -51,8 +52,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <cstddef>
@@ -69,10 +70,6 @@ using namespace llvm;
STATISTIC(NumRerolledLoops, "Number of rerolled loops");
static cl::opt<unsigned>
-MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
- cl::desc("The maximum increment for loop rerolling"));
-
-static cl::opt<unsigned>
NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
cl::Hidden,
cl::desc("The maximum number of failures to tolerate"
@@ -188,7 +185,7 @@ namespace {
bool PreserveLCSSA;
using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallSet<Instruction *, 16>;
+ using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> IVToIncMap;
@@ -397,8 +394,8 @@ namespace {
/// Stage 3: Assuming validate() returned true, perform the
/// replacement.
- /// @param IterCount The maximum iteration count of L.
- void replace(const SCEV *IterCount);
+ /// @param BackedgeTakenCount The backedge-taken count of L.
+ void replace(const SCEV *BackedgeTakenCount);
protected:
using UsesTy = MapVector<Instruction *, BitVector>;
@@ -428,8 +425,7 @@ namespace {
bool instrDependsOn(Instruction *I,
UsesTy::iterator Start,
UsesTy::iterator End);
- void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
- void updateNonLoopCtrlIncr();
+ void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
LoopReroll *Parent;
@@ -482,8 +478,8 @@ namespace {
void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
void collectPossibleReductions(Loop *L,
ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
- ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
};
} // end anonymous namespace
@@ -510,48 +506,6 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
return false;
}
-static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
- const SCEV *SCEVExpr,
- Instruction &IV) {
- const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
-
- // If StepRecurrence of a SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)),
- // Return c1.
- if (!MulSCEV && IV.getType()->isPointerTy())
- if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
- const PointerType *PTy = cast<PointerType>(IV.getType());
- Type *ElTy = PTy->getElementType();
- const SCEV *SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
- if (IncSCEV->getValue()->getValue().isNegative()) {
- const SCEV *NewSCEV =
- SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
- return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
- } else {
- return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
- }
- }
-
- if (!MulSCEV)
- return nullptr;
-
- // If StepRecurrence of a SCEVExpr is a c * sizeof(x), where c is constant,
- // Return c.
- const SCEVConstant *CIncSCEV = nullptr;
- for (const SCEV *Operand : MulSCEV->operands()) {
- if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
- CIncSCEV = Constant;
- } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
- Type *AllocTy;
- if (!Unknown->isSizeOf(AllocTy))
- break;
- } else {
- return nullptr;
- }
- }
- return CIncSCEV;
-}
-
// Check if an IV is only used to control the loop. There are two cases:
// 1. It only has one use which is loop increment, and the increment is only
// used by the comparison and the PHI (could have sext with nsw in between), and the
@@ -632,25 +586,17 @@ void LoopReroll::collectPossibleIVs(Loop *L,
continue;
if (!PHISCEV->isAffine())
continue;
- const SCEVConstant *IncSCEV = nullptr;
- if (I->getType()->isPointerTy())
- IncSCEV =
- getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
- else
- IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
if (IncSCEV) {
- const APInt &AInt = IncSCEV->getValue()->getValue().abs();
- if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
- continue;
IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
- DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
if (isLoopControlIV(L, &*I)) {
assert(!LoopControlIV && "Found two loop control only IV");
LoopControlIV = &(*I);
- DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
- << *PHISCEV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
+ << " = " << *PHISCEV << "\n");
} else
PossibleIVs.push_back(&*I);
}
@@ -717,8 +663,8 @@ void LoopReroll::collectPossibleReductions(Loop *L,
if (!SLR.valid())
continue;
- DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
- SLR.size() << " chained instructions)\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
+ << SLR.size() << " chained instructions)\n");
Reductions.addSLR(SLR);
}
}
@@ -856,7 +802,8 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
BaseUsers.push_back(II);
continue;
} else {
- DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
+ << "\n");
return false;
}
}
@@ -878,7 +825,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
// away.
if (BaseUsers.size()) {
if (Roots.find(0) != Roots.end()) {
- DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
return false;
}
Roots[0] = Base;
@@ -894,9 +841,9 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
if (KV.first == 0)
continue;
if (!KV.second->hasNUses(NumBaseUses)) {
- DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses << ", #Root=" <<
- KV.second->getNumUses() << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses
+ << ", #Root=" << KV.second->getNumUses() << "\n");
return false;
}
}
@@ -1024,13 +971,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
// Ensure all sets have the same size.
if (RootSets.empty()) {
- DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
return false;
}
for (auto &V : RootSets) {
if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- DEBUG(dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
return false;
}
}
@@ -1038,13 +986,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
Scale = RootSets[0].Roots.size() + 1;
if (Scale > IL_MaxRerollIterations) {
- DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale
+ << ", #Max=" << IL_MaxRerollIterations << "\n");
return false;
}
- DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
+ << "\n");
return true;
}
@@ -1078,7 +1027,7 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po
// While we're here, check the use sets are the same size.
if (V.size() != VBase.size()) {
- DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
return false;
}
@@ -1235,17 +1184,17 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// set.
for (auto &KV : Uses) {
if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ LLVM_DEBUG(
+ dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
return false;
}
}
- DEBUG(
- for (auto &KV : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- }
- );
+ LLVM_DEBUG(for (auto &KV
+ : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ });
for (unsigned Iter = 1; Iter < Scale; ++Iter) {
// In addition to regular aliasing information, we need to look for
@@ -1304,8 +1253,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (TryIt == Uses.end() || TryIt == RootIt ||
instrDependsOn(TryIt->first, RootIt, TryIt)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst << "\n");
return false;
}
@@ -1341,8 +1290,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// root instruction, does not also belong to the base set or the set of
// some other root instruction.
if (RootIt->second.count() > 1) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (prev. case overlap)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (prev. case overlap)\n");
return false;
}
@@ -1352,8 +1301,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (RootInst->mayReadFromMemory())
for (auto &K : AST) {
if (K.aliasesUnknownInst(RootInst, *AA)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (depends on future store)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst
+ << " (depends on future store)\n");
return false;
}
}
@@ -1366,9 +1316,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
!isSafeToSpeculativelyExecute(BaseInst)) ||
(!isUnorderedLoadStore(RootInst) &&
!isSafeToSpeculativelyExecute(RootInst)))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst <<
- " (side effects prevent reordering)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst
+ << " (side effects prevent reordering)\n");
return false;
}
@@ -1419,8 +1369,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
BaseInst->getOperand(!j) == Op2) {
Swapped = true;
} else {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
return false;
}
}
@@ -1433,8 +1384,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
hasUsesOutsideLoop(BaseInst, L)) ||
(!PossibleRedLastSet.count(RootInst) &&
hasUsesOutsideLoop(RootInst, L))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (uses outside loop)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (uses outside loop)\n");
return false;
}
@@ -1451,20 +1402,32 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
"Mismatched set sizes!");
}
- DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
+ << "\n");
return true;
}
-void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
BasicBlock *Header = L->getHeader();
+
+ // Compute the start and increment for each BaseInst before we start erasing
+ // instructions.
+ SmallVector<const SCEV *, 8> StartExprs;
+ SmallVector<const SCEV *, 8> IncrExprs;
+ for (auto &DRS : RootSets) {
+ const SCEVAddRecExpr *IVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ StartExprs.push_back(IVSCEV->getStart());
+ IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
+ }
+
// Remove instructions associated with non-base iterations.
for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
J != JE;) {
unsigned I = Uses[&*J].find_first();
if (I > 0 && I < IL_All) {
- DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
J++->eraseFromParent();
continue;
}
@@ -1472,74 +1435,47 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
++J;
}
- bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+ // Rewrite each BaseInst using SCEV.
+ for (size_t i = 0, e = RootSets.size(); i != e; ++i)
+ // Insert the new induction variable.
+ replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
- if (HasTwoIVs) {
- updateNonLoopCtrlIncr();
- replaceIV(LoopControlIV, LoopControlIV, IterCount);
- } else
- // We need to create a new induction variable for each different BaseInst.
- for (auto &DRS : RootSets)
- // Insert the new induction variable.
- replaceIV(DRS.BaseInst, IV, IterCount);
+ { // Limit the lifetime of SCEVExpander.
+ BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ auto Zero = SE->getZero(BackedgeTakenCount->getType());
+ auto One = SE->getOne(BackedgeTakenCount->getType());
+ auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ // FIXME: This arithmetic can overflow.
+ auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
+ auto ScaledTripCount = SE->getMulExpr(
+ TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
+ auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
+ Value *TakenCount =
+ Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
SimplifyInstructionsInBlock(Header, TLI);
DeleteDeadPHIs(Header, TLI);
}
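A quick numeric check of the new exit-condition arithmetic (assumed values): a loop with backedge-taken count 2, i.e. 3 source iterations, rerolled with Scale = 3 should run 9 times, exiting when the fresh {0,+,1} IV reaches (2 + 1) * 3 - 1 = 8.

#include <cassert>

int main() {
  const unsigned BTC = 2, Scale = 3;
  // Mirrors ScaledBECount = (BTC + 1) * Scale - 1 (see the overflow FIXME).
  const unsigned ScaledBECount = (BTC + 1) * Scale - 1;
  unsigned Trips = 0;
  for (unsigned NewIV = 0;; ++NewIV) { // NewIV is the {0,+,1} add-rec
    ++Trips;
    if (NewIV == ScaledBECount) // the rewritten latch's exitcond
      break;
  }
  assert(ScaledBECount == 8 && Trips == 9);
}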
-// For non-loop-control IVs, we only need to update the last increment
-// with the right amount, then we are done.
-void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
- const SCEV *NewInc = nullptr;
- for (auto *LoopInc : LoopIncs) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
- const SCEVConstant *COp = nullptr;
- if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- } else {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
- if (!COp)
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- }
-
- assert(COp && "Didn't find constant operand of LoopInc!\n");
-
- const APInt &AInt = COp->getValue()->getValue();
- const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
- if (AInt.isNegative()) {
- NewInc = SE->getNegativeSCEV(COp);
- NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
- NewInc = SE->getNegativeSCEV(NewInc);
- } else
- NewInc = SE->getUDivExpr(COp, ScaleSCEV);
-
- LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
- }
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
- Instruction *InstIV,
- const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
+ const SCEV *Start,
+ const SCEV *IncrExpr) {
BasicBlock *Header = L->getHeader();
- int64_t Inc = IVToIncMap[InstIV];
- bool NeedNewIV = InstIV == LoopControlIV;
- bool Negative = !NeedNewIV && Inc < 0;
-
- const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
- const SCEV *Start = RealIVSCEV->getStart();
-
- if (NeedNewIV)
- Start = SE->getConstant(Start->getType(), 0);
-
- const SCEV *SizeOfExpr = nullptr;
- const SCEV *IncrExpr =
- SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
- if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
- Type *ElTy = PTy->getElementType();
- SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
- IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
- }
+ Instruction *Inst = DRS.BaseInst;
+
const SCEV *NewIVSCEV =
SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
@@ -1552,54 +1488,6 @@ void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
for (auto &KV : Uses)
if (KV.second.find_first() == 0)
KV.first->replaceUsesOfWith(Inst, NewIV);
-
- if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
- // FIXME: Why do we need this check?
- if (Uses[BI].find_first() == IL_All) {
- const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
-
- if (NeedNewIV)
- ICSCEV = SE->getMulExpr(IterCount,
- SE->getConstant(IterCount->getType(), Scale));
-
- // Iteration count SCEV minus or plus 1
- const SCEV *MinusPlus1SCEV =
- SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
- if (Inst->getType()->isPointerTy()) {
- assert(SizeOfExpr && "SizeOfExpr is not initialized");
- MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
- }
-
- const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
- // Iteration count minus 1
- Instruction *InsertPtr = nullptr;
- if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
- InsertPtr = BI;
- } else {
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader)
- Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
- InsertPtr = Preheader->getTerminator();
- }
-
- if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
- (SE->getTypeSizeInBits(NewIV->getType()) <
- SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
- IRBuilder<> Builder(BI);
- Builder.SetCurrentDebugLocation(BI->getDebugLoc());
- NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
- }
- Value *ICMinusPlus1 = Expander.expandCodeFor(
- ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
-
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
- }
}
}
@@ -1617,17 +1505,17 @@ bool LoopReroll::ReductionTracker::validateSelected() {
int Iter = PossibleRedIter[J];
if (Iter != PrevIter && Iter != PrevIter + 1 &&
!PossibleReds[i].getReducedValue()->isAssociative()) {
- DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
- J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
+ << J << "\n");
return false;
}
if (Iter != PrevIter) {
if (Count != BaseCount) {
- DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
- " reduction use count " << Count <<
- " is not equal to the base use count " <<
- BaseCount << "\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: Iteration " << PrevIter << " reduction use count "
+ << Count << " is not equal to the base use count "
+ << BaseCount << "\n");
return false;
}
@@ -1716,15 +1604,15 @@ void LoopReroll::ReductionTracker::replaceSelected() {
// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
// have been validated), then we reroll the loop.
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *IterCount,
+ const SCEV *BackedgeTakenCount,
ReductionTracker &Reductions) {
DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
IVToIncMap, LoopControlIV);
if (!DAGRoots.findRoots())
return false;
- DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
+ << "\n");
if (!DAGRoots.validate(Reductions))
return false;
@@ -1734,7 +1622,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// making changes!
Reductions.replaceSelected();
- DAGRoots.replace(IterCount);
+ DAGRoots.replace(BackedgeTakenCount);
++NumRerolledLoops;
return true;
@@ -1752,9 +1640,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
BasicBlock *Header = L->getHeader();
- DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
- "] Loop %" << Header->getName() << " (" <<
- L->getNumBlocks() << " block(s))\n");
+ LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
+ << Header->getName() << " (" << L->getNumBlocks()
+ << " block(s))\n");
// For now, we'll handle only single BB loops.
if (L->getNumBlocks() > 1)
@@ -1763,10 +1651,10 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SE->hasLoopInvariantBackedgeTakenCount(L))
return false;
- const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
- const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
- DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
+ << "\n");
// First, we need to find the induction variable with respect to which we can
// reroll (there may be several possible options).
@@ -1776,7 +1664,7 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
collectPossibleIVs(L, PossibleIVs);
if (PossibleIVs.empty()) {
- DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
return false;
}
@@ -1787,11 +1675,11 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// For each possible IV, collect the associated possible set of 'root' nodes
// (i+1, i+2, etc.).
for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
+ if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
Changed = true;
break;
}
- DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
// Trip count of L has changed so SE must be re-evaluated.
if (Changed)
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index a91f53ba663f..eeaad39dc1d1 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,33 +13,15 @@
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
#define DEBUG_TYPE "loop-rotate"
@@ -48,595 +30,6 @@ static cl::opt<unsigned> DefaultRotationThreshold(
"rotation-max-header-size", cl::init(16), cl::Hidden,
cl::desc("The default maximum header size for automatic loop rotation"));
-STATISTIC(NumRotated, "Number of loops rotated");
-
-namespace {
-/// A simple loop rotation transformation.
-class LoopRotate {
- const unsigned MaxHeaderSize;
- LoopInfo *LI;
- const TargetTransformInfo *TTI;
- AssumptionCache *AC;
- DominatorTree *DT;
- ScalarEvolution *SE;
- const SimplifyQuery &SQ;
-
-public:
- LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
- SQ(SQ) {}
- bool processLoop(Loop *L);
-
-private:
- bool rotateLoop(Loop *L, bool SimplifiedLatch);
- bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader. If there were uses of the values produced by
-/// these instructions that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values. Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
- BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap,
- SmallVectorImpl<PHINode*> *InsertedPHIs) {
- // Remove PHI node entries that are no longer live.
- BasicBlock::iterator I, E = OrigHeader->end();
- for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
- // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
- // as necessary.
- SSAUpdater SSA(InsertedPHIs);
- for (I = OrigHeader->begin(); I != E; ++I) {
- Value *OrigHeaderVal = &*I;
-
- // If there are no uses of the value (e.g. because it returns void), there
- // is nothing to rewrite.
- if (OrigHeaderVal->use_empty())
- continue;
-
- Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
- // The value now exists in two versions: the initial value in the preheader
- // and the loop "next" value in the original header.
- SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
- SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
- SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
- // Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
- // SSAUpdater can't handle a non-PHI use in the same block as an
- // earlier def. We can easily handle those cases manually.
- Instruction *UserInst = cast<Instruction>(U.getUser());
- if (!isa<PHINode>(UserInst)) {
- BasicBlock *UserBB = UserInst->getParent();
-
- // The original users in the OrigHeader are already using the
- // original definitions.
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped.
- if (UserBB == OrigPreheader) {
- U = OrigPreHeaderVal;
- continue;
- }
- }
-
- // Anything else can be handled by SSAUpdater.
- SSA.RewriteUse(U);
- }
-
- // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
- // intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigHeaderVal);
- for (auto &DbgValue : DbgValues) {
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = DbgValue->getParent();
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- DbgValue->setOperand(0,
- MetadataAsValue::get(OrigHeaderVal->getContext(),
- ValueAsMetadata::get(NewVal)));
- }
- }
-}
-
-/// Propagate dbg.value intrinsics through the newly inserted Phis.
-static void insertDebugValues(BasicBlock *OrigHeader,
- SmallVectorImpl<PHINode*> &InsertedPHIs) {
- ValueToValueMapTy DbgValueMap;
-
- // Map existing PHI nodes to their dbg.values.
- for (auto &I : *OrigHeader) {
- if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
- if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
- DbgValueMap.insert({Loc, DbgII});
- }
- }
-
- // Then iterate through the new PHIs and look to see if they use one of the
- // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
- // propagate the info through the new PHI.
- LLVMContext &C = OrigHeader->getContext();
- for (auto PHI : InsertedPHIs) {
- for (auto VI : PHI->operand_values()) {
- auto V = DbgValueMap.find(VI);
- if (V != DbgValueMap.end()) {
- auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
- Instruction *NewDbgII = DbgII->clone();
- auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
- NewDbgII->setOperand(0, PhiMAV);
- BasicBlock *Parent = PHI->getParent();
- NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
- }
- }
- }
-}
-
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
- // If the loop has only one block then there is not much to rotate.
- if (L->getBlocks().size() == 1)
- return false;
-
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return false;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return false;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return false;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
- return false;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
- if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.convergent) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.NumInsts > MaxHeaderSize)
- return false;
- }
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader)
- return false;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated.
- if (SE)
- SE->forgetLoop(L);
-
- DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
-
- // Find the new loop header. NewHeader is the Header's one and only successor
- // that is inside the loop. The Header's other successor is outside the
- // loop. Otherwise the loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
-
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
- }
-
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
- }
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifiable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- ValueMap[Inst] = V;
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
- }
- } else {
- ValueMap[Inst] = C;
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- TerminatorInst *TI = OrigHeader->getTerminator();
- for (BasicBlock *SuccBB : TI->successors())
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
-
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValues(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the edge to the OrigHeader that got removed.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
- DT->applyUpdates(Updates);
- }
-
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit))
- continue;
- if (isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
- }
-
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- MergeBlockIntoPredecessor(OrigHeader, DT, LI);
-
- DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
- ++NumRotated;
- return true;
-}
-
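// [Editorial sketch, not part of this patch] The source-level effect of the
// rotation implemented above, assuming a simple top-tested loop; "body" is a
// hypothetical stand-in for the loop payload.
static void rotatedShape(int i, int n, void (*body)(int)) {
  if (i < n) {         // cloned header test, now sitting in OrigPreheader
    do {
      body(i);
      ++i;
    } while (i < n);   // the latch now performs the exit test
  }
}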
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End, Loop *L) {
- bool seenIncrement = false;
- bool MultiExitLoop = false;
-
- if (!L->getExitingBlock())
- MultiExitLoop = true;
-
- for (BasicBlock::iterator I = Begin; I != End; ++I) {
-
- if (!isSafeToSpeculativelyExecute(&*I))
- return false;
-
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::GetElementPtr:
- // GEPs are cheap if all indices are constant.
- if (!cast<GEPOperator>(I)->hasAllConstantIndices())
- return false;
- // fall-thru to increment case
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr: {
- Value *IVOpnd =
- !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
- if (!IVOpnd)
- return false;
-
-      // If the increment operand is used outside of the loop, this speculation
- // could cause extra live range interference.
- if (MultiExitLoop) {
- for (User *UseI : IVOpnd->users()) {
- auto *UserInst = cast<Instruction>(UseI);
- if (!L->contains(UserInst))
- return false;
- }
- }
-
- if (seenIncrement)
- return false;
- seenIncrement = true;
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // ignore type conversions
- break;
- }
- }
- return true;
-}
-
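// [Editorial sketch, not part of this patch] A latch body that the check
// above accepts: free casts around exactly one cheap arithmetic increment
// whose non-constant operand is not used outside the loop.
static long long speculableLatch(long long i) {
  int narrow = static_cast<int>(i);      // Trunc: type conversions are ignored
  narrow = narrow + 1;                   // the single permitted increment
  return static_cast<long long>(narrow); // SExt: also ignored
}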
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch || Latch->hasAddressTaken())
- return false;
-
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
- return false;
-
- BasicBlock *LastExit = Latch->getSinglePredecessor();
- if (!LastExit || !L->isLoopExiting(LastExit))
- return false;
-
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
- return false;
-
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
- return false;
-
- DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
-
- // Hoist the instructions from Latch into LastExit.
- LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
- Latch->begin(), Jmp->getIterator());
-
- unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
- BasicBlock *Header = Jmp->getSuccessor(0);
- assert(Header == L->getHeader() && "expected a backward branch");
-
- // Remove Latch from the CFG so that LastExit becomes the new Latch.
- BI->setSuccessor(FallThruPath, Header);
- Latch->replaceSuccessorsPhiUsesWith(LastExit);
- Jmp->eraseFromParent();
-
- // Nuke the Latch block.
- assert(Latch->empty() && "unable to evacuate Latch");
- LI->removeBlock(Latch);
- if (DT)
- DT->eraseNode(Latch);
- Latch->eraseFromParent();
- return true;
-}
-
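// [Editorial sketch, not part of this patch] The two-block shape this folds,
// written as textual IR:
//
//   exiting:                                ; LastExit, the loop's exit test
//     %c = icmp slt i32 %i, %n
//     br i1 %c, label %latch, label %exit
//   latch:                                  ; holds only the post-increment
//     %i.next = add i32 %i, 1
//     br label %header                      ; Jmp, the unconditional backedge
//
// After the splice the add is computed speculatively in %exiting, the
// conditional branch targets %header directly, and %latch is erased, leaving
// %exiting as the new latch.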
-/// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
- // Save the loop metadata.
- MDNode *LoopMD = L->getLoopID();
-
- // Simplify the loop latch before attempting to rotate the header
- // upward. Rotation may not be needed if the loop tail can be folded into the
- // loop exit.
- bool SimplifiedLatch = simplifyLoopLatch(L);
-
- bool MadeChange = rotateLoop(L, SimplifiedLatch);
- assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
- "Loop latch should be exiting after loop-rotate.");
-
- // Restore the loop metadata.
- // NB! We presume LoopRotation DOESN'T ADD its own metadata.
- if ((MadeChange || SimplifiedLatch) && LoopMD)
- L->setLoopID(LoopMD);
-
- return MadeChange || SimplifiedLatch;
-}
-
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
: EnableHeaderDuplication(EnableHeaderDuplication) {}
@@ -646,10 +39,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
- LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
- SQ);
- bool Changed = LR.processLoop(&L);
+ bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ,
+ false, Threshold, false);
+
if (!Changed)
return PreservedAnalyses::all();
@@ -691,8 +84,8 @@ public:
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
- LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
- return LR.processLoop(L);
+ return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize,
+ false);
}
};
}
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 35c05e84fd68..2b83d3dc5f1b 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -30,13 +30,16 @@
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-simplifycfg"
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE) {
bool Changed = false;
// Copy blocks into a temporary array to avoid iterator invalidation issues
// as we remove them.
@@ -53,11 +56,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
continue;
- // Pred is going to disappear, so we need to update the loop info.
- if (L.getHeader() == Pred)
- L.moveToHeader(Succ);
- LI.removeBlock(Pred);
- MergeBasicBlockIntoOnlyPred(Succ, &DT);
+ // Merge Succ into Pred and delete it.
+ MergeBlockIntoPredecessor(Succ, &DT, &LI);
+
+ SE.forgetLoop(&L);
Changed = true;
}
@@ -67,7 +69,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!simplifyLoopCFG(L, AR.DT, AR.LI))
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -87,7 +89,8 @@ public:
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- return simplifyLoopCFG(*L, DT, LI);
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return simplifyLoopCFG(*L, DT, LI, SE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index 430a7085d93f..760177c9c5e9 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -42,6 +42,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -200,17 +200,19 @@ static bool sinkInstruction(Loop &L, Instruction &I,
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
BBsToSinkInto.end());
- std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
- [&](BasicBlock *A, BasicBlock *B) {
- return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
- });
+ llvm::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second <
+ LoopBlockNumber.find(B)->second;
+ });
BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
// FIXME: Optimize the efficiency for cloned value replacement. The current
// implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
- for (BasicBlock *N : SortedBBsToSinkInto) {
- if (N == MoveBB)
- continue;
+ for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
+ assert(LoopBlockNumber.find(N)->second >
+ LoopBlockNumber.find(MoveBB)->second &&
+ "BBs not sorted!");
// Clone I and replace its uses.
Instruction *IC = I.clone();
IC->setName(I.getName());
@@ -224,11 +226,11 @@ static bool sinkInstruction(Loop &L, Instruction &I,
}
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);
- DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
- << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
NumLoopSunkCloned++;
}
- DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
NumLoopSunk++;
I.moveBefore(&*MoveBB->getFirstInsertionPt());
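// [Editorial note, not part of this patch] llvm::sort is call-compatible with
// std::sort, but in EXPENSIVE_CHECKS builds it shuffles the range before
// sorting so that comparators relying on host pointer order show up as test
// failures. That matters here: the old lambda compared the dereferenced
// DenseMap iterators, i.e. (BasicBlock*, number) pairs led by a pointer,
// while the new one compares only the mapped block numbers, a deterministic
// strict weak ordering.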
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 953854c8b7b7..fa83b48210bc 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -75,6 +75,8 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -105,8 +107,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -121,7 +123,7 @@ using namespace llvm;
#define DEBUG_TYPE "loop-reduce"
-/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
@@ -185,6 +187,8 @@ struct MemAccessTy {
unsigned AS = UnknownAddressSpace) {
return MemAccessTy(Type::getVoidTy(Ctx), AS);
}
+
+ Type *getType() { return MemTy; }
};
/// This class holds data which is used to order reuse candidates.
@@ -327,7 +331,7 @@ struct Formula {
/// #2 enforces that 1 * reg is reg.
/// #3 ensures invariant regs with respect to current loop can be combined
/// together in LSR codegen.
- /// This invariant can be temporarly broken while building a formula.
+ /// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
SmallVector<const SCEV *, 4> BaseRegs;
@@ -442,7 +446,7 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
canonicalize(*L);
}
-/// \brief Check whether or not this formula statisfies the canonical
+/// Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical(const Loop &L) const {
@@ -470,7 +474,7 @@ bool Formula::isCanonical(const Loop &L) const {
return I == BaseRegs.end();
}
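// [Editorial sketch, not part of this patch] A toy mirror of the invariant
// being checked; the type and names here are hypothetical, not LSR's real
// data structures.
#include <vector>
struct ToyFormula {
  std::vector<const char *> BaseRegs; // may hold several base registers
  const char *ScaledReg = nullptr;    // must be used once BaseRegs.size() > 1
  int Scale = 0;                      // 1 when no real scaling is present
};
static bool isCanonicalToy(const ToyFormula &F) {
  // With more than one base register, the scaled-register slot must be
  // occupied, per the documented canonical form.
  return F.BaseRegs.size() <= 1 || F.ScaledReg != nullptr;
}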
-/// \brief Helper method to morph a formula into its canonical representation.
+/// Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register, must use the ScaledReg
/// field. Otherwise, we would have to do special cases everywhere in LSR
@@ -505,7 +509,7 @@ void Formula::canonicalize(const Loop &L) {
}
}
-/// \brief Get rid of the scale in the formula.
+/// Get rid of the scale in the formula.
 /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
@@ -818,7 +822,7 @@ static bool isAddressUse(const TargetTransformInfo &TTI,
/// Return the type of the memory being accessed.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
- Instruction *Inst) {
+ Instruction *Inst, Value *OperandVal) {
MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -832,7 +836,14 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::prefetch:
+ case Intrinsic::memset:
AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
break;
default: {
MemIntrinsicInfo IntrInfo;
@@ -857,12 +868,11 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
/// Return true if this AddRec is already a phi in its loop.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- if (SE.isSCEVable(PN->getType()) &&
- (SE.getEffectiveSCEVType(PN->getType()) ==
+ for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
+ if (SE.isSCEVable(PN.getType()) &&
+ (SE.getEffectiveSCEVType(PN.getType()) ==
SE.getEffectiveSCEVType(AR->getType())) &&
- SE.getSCEV(PN) == AR)
+ SE.getSCEV(&PN) == AR)
return true;
}
return false;
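// [Editorial note, not part of this patch] This hunk, and several like it
// below, adopt BasicBlock::phis(), which yields exactly the leading PHI
// nodes of a block:
//
//   for (PHINode &PN : BB->phis())
//     ...
//
// replacing the old idiom of advancing an instruction iterator while
// dyn_cast<PHINode> keeps succeeding.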
@@ -938,7 +948,7 @@ static bool isHighCostExpansion(const SCEV *S,
return true;
}
-/// If any of the instructions is the specified set are trivially dead, delete
+/// If any of the instructions in the specified set are trivially dead, delete
/// them and see if this makes any of their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
@@ -971,7 +981,7 @@ class LSRUse;
} // end anonymous namespace
-/// \brief Check if the addressing mode defined by \p F is completely
+/// Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
@@ -1041,12 +1051,14 @@ private:
void RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT);
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI);
void RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI);
};
/// An operand value in an instruction which is to be replaced with some
@@ -1195,7 +1207,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
void Cost::RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT) {
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
@@ -1216,13 +1229,28 @@ void Cost::RateRegister(const SCEV *Reg,
++C.NumRegs;
return;
}
- C.AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+ unsigned LoopCost = 1;
+ if (TTI.shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (isa<SCEVConstant>(LoopStep)) {
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) &&
+ SE.isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
+ }
+ }
+ }
+ C.AddRecCost += LoopCost;
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
if (isLoser())
return;
}
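// [Editorial note, not part of this patch] The target feature probed above,
// in ARM assembly terms: a post-indexed access folds the IV step into the
// memory operation itself,
//
//   ldr r0, [r1], #4    @ load from [r1], then r1 += 4, one instruction
//
// so an AddRec with a constant step and a loop-invariant, non-constant start
// needs no separate add per iteration, and its recurrence cost is rated 0.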
@@ -1250,13 +1278,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(Reg, Regs, L, SE, DT);
+ RateRegister(Reg, Regs, L, SE, DT, TTI);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
@@ -1280,7 +1309,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1289,7 +1318,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1344,14 +1373,15 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// If ICmpZero formula ends with not 0, it could not be replaced by
// just add or sub. We'll need to compare final result of AddRec.
- // That means we'll need an additional instruction.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
// For -10 + {0, +, 1}:
// i = i + 1;
// cmp i, 10
//
// For {-10, +, 1}:
// i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
C.Insns++;
// Each new AddRec adds 1 instruction to calculation.
C.Insns += (C.AddRecCost - PrevAddRecCost);
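// [Editorial note, not part of this patch] The macro-fusion exemption above,
// in x86 terms: for -10 + {0,+,1} the rewritten exit sequence is
//
//   inc  rax            ; i = i + 1
//   cmp  rax, 10        ; the "extra" compare against the non-zero end value
//   jne  loop
//
// and cores that fuse cmp+jcc into a single macro-op execute the compare for
// free, so it is no longer counted against C.Insns.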
@@ -1457,7 +1487,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
return Uniquifier.count(Key);
}
@@ -1481,7 +1511,7 @@ bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
if (!Uniquifier.insert(Key).second)
return false;
@@ -2385,24 +2415,27 @@ LSRInstance::OptimizeLoopTermCond() {
C->getValue().isMinSignedValue())
goto decline_post_inc;
// Check for possible scaled-address reuse.
- MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
- int64_t Scale = C->getSExtValue();
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- Scale = -Scale;
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
+ if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
+ MemAccessTy AccessTy = getAccessType(
+ TTI, UI->getUser(), UI->getOperandValToReplace());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
}
}
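// [Editorial note, not part of this patch] The reuse being protected above,
// in x86 terms: when the IV stride (say 4) is itself a legal addressing-mode
// scale, as in
//
//   mov eax, dword ptr [rbx + 4*rcx]
//
// keeping the pre-incremented IV live for scaled addressing can beat
// rewriting the exit test to the post-incremented value, so post-increment
// is declined; both Scale and -Scale are checked for legality.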
- DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
- << *Cond << '\n');
+ LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
@@ -2643,7 +2676,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
if (Types.size() == 1)
Types.clear();
- DEBUG(print_factors_and_types(dbgs()));
+ LLVM_DEBUG(print_factors_and_types(dbgs()));
}
/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
@@ -2667,7 +2700,7 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE,
return OI;
}
-/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
/// a convenient helper.
static Value *getWideOperand(Value *Oper) {
if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
@@ -2774,10 +2807,9 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
return false;
if (!Users.empty()) {
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (Instruction *Inst : Users) {
- dbgs() << " " << *Inst << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst
+ : Users) { dbgs() << " " << *Inst << "\n"; });
return false;
}
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
@@ -2830,8 +2862,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
// the stride.
cost -= NumReusedIncrements;
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
- << "\n");
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
return cost < 0;
}
@@ -2884,7 +2916,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
if (isa<PHINode>(UserInst))
return;
if (NChains >= MaxChains && !StressIVChain) {
- DEBUG(dbgs() << "IV Chain Limit\n");
+ LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
return;
}
LastIncExpr = OperExpr;
@@ -2897,11 +2929,11 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
OperExprBase));
ChainUsersVec.resize(NChains);
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
- << ") IV=" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
} else {
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
- << ") IV+" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
// Add this IV user to the end of the chain.
IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
}
@@ -2971,7 +3003,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
- DEBUG(dbgs() << "Collecting IV Chains.\n");
+ LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
SmallVector<ChainUsers, 8> ChainUsersVec;
SmallVector<BasicBlock *,8> LatchPath;
@@ -3013,15 +3045,14 @@ void LSRInstance::CollectChains() {
} // Continue walking down the instructions.
} // Continue walking down the domtree.
// Visit phi backedges to determine if the chain can generate the IV postinc.
- for (BasicBlock::iterator I = L->getHeader()->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- if (!SE.isSCEVable(PN->getType()))
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
continue;
Instruction *IncV =
- dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
if (IncV)
- ChainInstruction(PN, IncV, ChainUsersVec);
+ ChainInstruction(&PN, IncV, ChainUsersVec);
}
// Remove any unprofitable chains.
unsigned ChainIdx = 0;
@@ -3041,10 +3072,10 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
- DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
- DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
IVIncSet.insert(UseI);
@@ -3061,7 +3092,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
if (IncConst->getAPInt().getMinSignedBits() > 64)
return false;
- MemAccessTy AccessTy = getAccessType(TTI, UserInst);
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
                        IncOffset, /*HasBaseReg=*/false))
@@ -3101,11 +3132,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
}
if (IVOpIter == IVOpEnd) {
// Gracefully give up on this chain.
- DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
return;
}
- DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
Type *IVTy = IVSrc->getType();
Type *IntTy = SE.getEffectiveSCEVType(IVTy);
const SCEV *LeftOverExpr = nullptr;
@@ -3152,12 +3183,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
// If LSR created a new, wider phi, we may also replace its postinc. We only
// do this if we also found a wide value for the head of the chain.
if (isa<PHINode>(Chain.tailUserInst())) {
- for (BasicBlock::iterator I = L->getHeader()->begin();
- PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
- if (!isCompatibleIVType(Phi, IVSrc))
+ for (PHINode &Phi : L->getHeader()->phis()) {
+ if (!isCompatibleIVType(&Phi, IVSrc))
continue;
Instruction *PostIncV = dyn_cast<Instruction>(
- Phi->getIncomingValueForBlock(L->getLoopLatch()));
+ Phi.getIncomingValueForBlock(L->getLoopLatch()));
if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
continue;
Value *IVOper = IVSrc;
@@ -3168,7 +3198,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
}
- Phi->replaceUsesOfWith(PostIncV, IVOper);
+ Phi.replaceUsesOfWith(PostIncV, IVOper);
DeadInsts.emplace_back(PostIncV);
}
}
@@ -3182,7 +3212,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI)) {
- DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
}
@@ -3190,7 +3220,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
MemAccessTy AccessTy;
if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
Kind = LSRUse::Address;
- AccessTy = getAccessType(TTI, UserInst);
+ AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
}
const SCEV *S = IU.getExpr(U);
@@ -3258,7 +3288,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
}
}
- DEBUG(print_fixups(dbgs()));
+ LLVM_DEBUG(print_fixups(dbgs()));
}
/// Insert a formula for the given expression into the given use, separating out
@@ -3467,12 +3497,45 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
return S;
}
-/// \brief Helper function for LSRInstance::GenerateReassociations.
+/// Return true if the SCEV represents a value that may end up as a
+/// post-increment operation.
+static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
+ LSRUse &LU, const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ if (LU.Kind != LSRUse::Address ||
+ !LU.AccessTy.getType()->isIntOrIntVectorTy())
+ return false;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR)
+ return false;
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(LoopStep))
+ return false;
+ if (LU.AccessTy.getType()->getScalarSizeInBits() !=
+ LoopStep->getType()->getScalarSizeInBits())
+ return false;
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
+ return true;
+ }
+ return false;
+}
+
+/// Helper function for LSRInstance::GenerateReassociations.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
unsigned Depth, size_t Idx,
bool IsScaledReg) {
const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ // Don't generate reassociations for the base register of a value that
+  // may generate a post-increment operation. The reason is that the
+  // reassociations cause extra base+register formulas to be created and
+  // possibly chosen, while the post-increment form is more efficient.
+ if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+ return;
SmallVector<const SCEV *, 8> AddOps;
const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
if (Remainder)
@@ -3545,7 +3608,12 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+      // Also charge Log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2,
+      // against the recursion depth, because Depth alone is not enough to
+      // bound compile time: every time AddOps.size() exceeds 16^x, x more
+      // is added to Depth.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+ Depth + 1 + (Log2_32(AddOps.size()) >> 2));
}
}
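// [Editorial sketch, not part of this patch] The extra depth charged per
// recursion step, computed standalone: Log2_32(n) >> 2 is floor(log2(n) / 4),
// i.e. roughly floor(log16(n)): 0 for n < 16, 1 for 16 <= n < 256, and so on.
static unsigned extraReassocDepth(unsigned NumAddOps) {
  unsigned Log2 = 0;
  while (NumAddOps >>= 1) // computes floor(log2(n)) for n >= 1
    ++Log2;
  return Log2 >> 2;       // same value as Log2_32(n) >> 2 from MathExtras.h
}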
@@ -3599,7 +3667,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
}
}
-/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, size_t Idx,
bool IsScaledReg) {
@@ -3631,7 +3699,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
/* IsScaledReg */ true);
}
-/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
@@ -3941,10 +4009,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (Imms.size() == 1)
continue;
- DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
- for (const auto &Entry : Imms)
- dbgs() << ' ' << Entry.first;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry
+ : Imms) dbgs()
+ << ' ' << Entry.first;
+ dbgs() << '\n');
// Examine each offset.
for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
@@ -3956,7 +4025,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (!isa<SCEVConstant>(OrigReg) &&
UsedByIndicesMap[Reg].count() == 1) {
- DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << '\n');
continue;
}
@@ -4041,6 +4111,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, NewF)) {
+ if (TTI.shouldFavorPostInc() &&
+ mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+ continue;
if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
continue;
NewF = F;
@@ -4102,9 +4175,9 @@ LSRInstance::GenerateAllReuseFormulae() {
GenerateCrossUseConstantOffsets();
- DEBUG(dbgs() << "\n"
- "After generating reuse formulae:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
}
/// If there are multiple formulae with the same set of registers used
@@ -4126,7 +4199,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
bool Any = false;
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
@@ -4150,8 +4224,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// as the basis of rediscovering the desired formula that uses an AddRec
// corresponding to the existing phi. Once all formulae have been
// generated, these initial losers may be pruned.
- DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
}
else {
SmallVector<const SCEV *, 4> Key;
@@ -4164,7 +4238,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for
// uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
std::pair<BestFormulaeTy::const_iterator, bool> P =
BestFormulae.insert(std::make_pair(Key, FIdx));
@@ -4178,10 +4252,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
if (CostF.isLess(CostBest, TTI))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula "; Best.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
}
#ifndef NDEBUG
ChangedFormulae = true;
@@ -4200,11 +4274,11 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
}
// This is a rough guess that seems to work fairly well.
@@ -4233,11 +4307,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
/// register pressure); remove it to simplify the system.
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
- "which use a superset of registers used by other "
- "formulae.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
@@ -4255,7 +4329,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4270,8 +4345,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4286,8 +4361,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4297,9 +4371,10 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by assuming that uses separated "
- "by a constant offset will use the same registers.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
// This is especially useful for unrolled loops.
@@ -4317,7 +4392,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LU.Kind, LU.AccessTy))
continue;
- DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
@@ -4325,7 +4400,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (LSRFixup &Fixup : LU.Fixups) {
Fixup.Offset += F.BaseOffset;
LUThatHas->pushFixup(Fixup);
- DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
// Delete formulae from the new use which are no longer legal.
@@ -4334,8 +4409,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
Formula &F = LUThatHas->Formulae[i];
if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
LUThatHas->Kind, LUThatHas->AccessTy, F)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LUThatHas->DeleteFormula(F);
--i;
--e;
@@ -4354,7 +4428,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
}
}
- DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
@@ -4362,15 +4436,14 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
/// eliminate.
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
- "undesirable dedicated registers.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
FilterOutUndesirableDedicatedRegisters();
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4381,15 +4454,16 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
-/// reg heurstic will often keep the formulae with the same Scale and
+/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the best Formula "
- "from the Formulae with the same Scale and ScaledReg.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
// Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
@@ -4403,7 +4477,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
// Return true if Formula FA is better than Formula FB.
auto IsBetterThan = [&](Formula &FA, Formula &FB) {
@@ -4447,10 +4522,10 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
Formula &Best = LU.Formulae[P.first->second];
if (IsBetterThan(F, Best))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
ChangedFormulae = true;
#endif
@@ -4466,7 +4541,7 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
+ LLVM_DEBUG(if (ChangedFormulae) {
dbgs() << "\n"
"After filtering out undesirable candidates:\n";
print_uses(dbgs());
@@ -4525,7 +4600,7 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
// Used in each formula of a solution (in example above this is reg(c)).
// We can skip them in calculations.
SmallPtrSet<const SCEV *, 4> UniqRegs;
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Map each register to probability of not selecting
DenseMap <const SCEV *, float> RegNumMap;
@@ -4545,7 +4620,8 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
RegNumMap.insert(std::make_pair(Reg, PNotSel));
}
- DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+ LLVM_DEBUG(
+ dbgs() << "Narrowing the search space by deleting costly formulas\n");
// Delete formulas where registers number expectation is high.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
@@ -4587,26 +4663,25 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
MinIdx = i;
}
}
- DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
- dbgs() << " with min reg num " << FMinRegNum << '\n');
+ LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
if (MinIdx != 0)
std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
while (LU.Formulae.size() != 1) {
- DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
LU.Formulae.pop_back();
}
LU.RecomputeRegs(LUIdx, RegUses);
assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
Formula &F = LU.Formulae[0];
- DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
// When we choose the formula, the regs become unique.
UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
if (F.ScaledReg)
UniqRegs.insert(F.ScaledReg);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Pick a register which seems likely to be profitable, and then in any use
@@ -4619,7 +4694,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
// Ok, we have too many of formulae on our hands to conveniently handle.
// Use a rough heuristic to thin out the list.
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Pick the register which is used by the most LSRUses, which is likely
// to be a good reuse register candidate.
@@ -4640,8 +4715,8 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
}
}
- DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
- << " will yield profitable reuse.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
Taken.insert(Best);
// In any use with formulae which references this register, delete formulae
@@ -4654,7 +4729,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
if (!F.referencesReg(Best)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LU.DeleteFormula(F);
--e;
--i;
@@ -4668,8 +4743,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4751,11 +4825,11 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
- DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\n Regs:";
- for (const SCEV *S : NewRegs)
- dbgs() << ' ' << *S;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:"; for (const SCEV *S
+ : NewRegs) dbgs()
+ << ' ' << *S;
+ dbgs() << '\n');
SolutionCost = NewCost;
Solution = Workspace;
@@ -4780,22 +4854,22 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
CurRegs, VisitedRegs);
if (Solution.empty()) {
- DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
return;
}
// Ok, we've now made all our decisions.
- DEBUG(dbgs() << "\n"
- "The chosen solution requires "; SolutionCost.print(dbgs());
- dbgs() << ":\n";
- for (size_t i = 0, e = Uses.size(); i != e; ++i) {
- dbgs() << " ";
- Uses[i].print(dbgs());
- dbgs() << "\n"
- " ";
- Solution[i]->print(dbgs());
- dbgs() << '\n';
- });
+ LLVM_DEBUG(dbgs() << "\n"
+ "The chosen solution requires ";
+ SolutionCost.print(dbgs()); dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
assert(Solution.size() == Uses.size() && "Malformed solution!");
}
@@ -4996,7 +5070,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
// Unless the addressing mode will not be folded.
if (!Ops.empty() && LU.Kind == LSRUse::Address &&
isAMCompletelyFolded(TTI, LU, F)) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
}
@@ -5269,7 +5343,8 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
for (const IVStrideUse &U : IU) {
if (++NumUsers > MaxIVUsers) {
(void)U;
- DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
+ << "\n");
return;
}
// Bail out if we have a PHI on an EHPad that gets a value from a
@@ -5302,9 +5377,9 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
}
#endif // DEBUG
- DEBUG(dbgs() << "\nLSR on loop ";
- L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
- dbgs() << ":\n");
+ LLVM_DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
// First, perform some low-level loop optimizations.
OptimizeShadowIV();
@@ -5315,7 +5390,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// Skip nested loops until we can model them better with formulae.
if (!L->empty()) {
- DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
return;
}
@@ -5325,9 +5400,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
CollectFixupsAndInitialFormulae();
CollectLoopInvariantFixupsAndFormulae();
- assert(!Uses.empty() && "IVUsers reported at least one use");
- DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
- print_uses(dbgs()));
+ if (Uses.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
// Now use the reuse data to generate a bunch of interesting ways
// to formulate the values needed for the uses.
diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
new file mode 100644
index 000000000000..86c99aed4417
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -0,0 +1,447 @@
+//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the unroll-and-jam pass. Most of the work is done by
+// Utils/UnrollLoopAndJam.cpp.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+static cl::opt<bool>
+ AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+ cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+ "unroll-and-jam-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+ "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+ cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+ "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+ "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has any metadata starting with Prefix. For
+// example, a Prefix of "llvm.loop.unroll." returns true if we have any unroll
+// metadata.
+static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+ if (MDNode *LoopID = L->getLoopID()) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (S->getString().startswith(Prefix))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool HasUnrollAndJamEnablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// Returns true if the loop has an unroll_and_jam(disable) pragma.
+static bool HasUnrollAndJamDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
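+// A count pragma such as "#pragma clang loop unroll_and_jam_count(4)" is
+// expected to show up as loop metadata roughly of the form:
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll_and_jam.count", i32 4}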
+static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
+// Returns a loop size estimate for the unrolled loop.
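+// For example, with LoopSize = 10, UP.BEInsns = 2 and UP.Count = 4, this
+// estimates (10 - 2) * 4 + 2 = 34 instructions: the backedge instructions
+// are only counted once, not once per unrolled copy of the body.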
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
+
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+ Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+ unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
+ // Check for explicit Count from the "unroll-and-jam-count" option.
+ bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollAndJamCount;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Check for unroll_and_jam pragmas
+ unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Use computeUnrollCount from the loop unroller to get a sensible count
+ // for unrolling the outer loop. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+ // If the user explicitly set the loop as unrolled, don't unroll and jam it.
+ // Leave it for the unroller instead.
+ UP.Count = 0;
+ return false;
+ }
+
+ bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+ ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits.
+ if (ExplicitUnroll && OuterTripCount != 0)
+ UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+ if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // If the inner loop count is known and small, leave the entire loop nest to
+ // the unroller.
+ if (!ExplicitUnroll && InnerTripCount &&
+ InnerLoopSize * InnerTripCount < UP.Threshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // We have a sensible limit for the outer loop; now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold.
+ while (UP.Count != 0 && UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
+
+ if (!ExplicitUnroll) {
+ // Check for situations where unroll and jam is likely to be unprofitable,
+ // such as subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
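+ // For example, in "for (i) for (j) sum += A[j] * B[i][j]" the load of A[j]
+ // is invariant in the outer i loop, so the jammed copies of the inner loop
+ // body can share it.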
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
+ }
+ }
+ }
+ if (NumInvariant == 0) {
+ UP.Count = 0;
+ return false;
+ }
+ }
+
+ return ExplicitUnroll;
+}
+
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE, int OptLevel) {
+ // Quick checks that the loop has the correct form
+ if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+ return LoopUnrollResult::Unmodified;
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!SubLoop->isLoopSimplifyForm())
+ return LoopUnrollResult::Unmodified;
+
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getExitingBlock();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+ if (Latch != Exit || SubLoopLatch != SubLoopExit)
+ return LoopUnrollResult::Unmodified;
+
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, OptLevel, None, None, None, None, None, None);
+ if (AllowUnrollAndJam.getNumOccurrences() > 0)
+ UP.UnrollAndJam = AllowUnrollAndJam;
+ if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+ UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+ // Exit early if unrolling is disabled.
+ if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+ return LoopUnrollResult::Unmodified;
+
+ LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+
+ // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+ // the unroller, so long as it does not explicitly have unroll_and_jam
+ // metadata. This means #pragma nounroll will disable unroll and jam as well
+ // as unrolling.
+ if (HasUnrollAndJamDisablePragma(L) ||
+ (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+ LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+ LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Approximate the loop size and collect useful info
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ unsigned InnerLoopSize =
+ ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+ Convergent, TTI, EphValues, UP.BEInsns);
+ unsigned OuterLoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
+ LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
+ "instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (Convergent) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop with convergent instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple
+ unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+ unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+ unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+ // Decide if, and by how much, to unroll
+ bool IsCountSetExplicitly = computeUnrollAndJamCount(
+ L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+ if (UP.Count <= 1)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less than or equal to TripCount.
+ if (OuterTripCount && UP.Count > OuterTripCount)
+ UP.Count = OuterTripCount;
+
+ LoopUnrollResult UnrollResult =
+ UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
+ UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+
+ // If the loop has an unroll count pragma or the count was set explicitly,
+ // mark the loop as unrolled to prevent unrolling beyond the requested count.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ unsigned OptLevel;
+
+ LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
+ initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+
+ LoopUnrollResult Result =
+ tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+ return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error(
+ "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
+ "a higher level");
+
+ DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+
+ LoopUnrollResult Result = tryToUnrollAndJamLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 15e7da5e1a7a..634215c9770f 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -164,7 +165,7 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
-static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
@@ -191,6 +192,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Force = false;
UP.UpperBound = false;
UP.AllowPeeling = true;
+ UP.UnrollAndJam = false;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
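+ // Unroll and jam is off by default; targets can enable it via
+ // getUnrollingPreferences below, and LoopUnrollAndJamPass can turn it on
+ // with the -allow-unroll-and-jam flag.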
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP);
@@ -285,17 +288,17 @@ struct UnrolledInstStateKeyInfo {
};
struct EstimatedUnrollCost {
- /// \brief The estimated cost after unrolling.
+ /// The estimated cost after unrolling.
unsigned UnrolledCost;
- /// \brief The estimated dynamic cost of executing the instructions in the
+ /// The estimated dynamic cost of executing the instructions in the
/// rolled form.
unsigned RolledDynamicCost;
};
} // end anonymous namespace
-/// \brief Figure out if the loop is worth full unrolling.
+/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
@@ -308,10 +311,10 @@ struct EstimatedUnrollCost {
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- unsigned MaxUnrolledLoopSize) {
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+ const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -405,9 +408,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// First accumulate the cost of this instruction.
if (!Cost.IsFree) {
UnrolledCost += TTI.getUserCost(I);
- DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
- << "): ");
- DEBUG(I->dump());
+ LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
+ << Iteration << "): ");
+ LLVM_DEBUG(I->dump());
}
// We must count the cost of every operand which is not free,
@@ -442,14 +445,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
assert(L->isLCSSAForm(DT) &&
"Must have loops in LCSSA form to track live-out values.");
- DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+ LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
// Simulate execution of each iteration of the loop counting instructions,
// which would be simplified.
// Since the same load will take different values on different iterations,
// we literally have to go through all loop's iterations.
for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
- DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
// Prepare for the iteration by collecting any simplified entry or backedge
// inputs.
@@ -490,7 +493,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// it. We don't change the actual IR, just count optimization
// opportunities.
for (Instruction &I : *BB) {
- if (isa<DbgInfoIntrinsic>(I))
+ // These won't get into the final code - don't even try calculating the
+ // cost for them.
+ if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
continue;
// Track this instruction's expected baseline cost when executing the
@@ -512,8 +517,13 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// Can't properly model a cost of a call.
// FIXME: With a proper cost model we should be able to do it.
- if(isa<CallInst>(&I))
- return None;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || TTI.isLoweredToCall(Callee)) {
+ LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+ return None;
+ }
+ }
// If the instruction might have a side-effect recursively account for
// the cost of it and all the instructions leading up to it.
@@ -522,10 +532,10 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If unrolled body turns out to be too big, bail out.
if (UnrolledCost > MaxUnrolledLoopSize) {
- DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost
- << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
- << "\n");
+ LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
return None;
}
}
@@ -578,8 +588,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If we found no optimization opportunities on the first iteration, we
// won't find them on later ones too.
if (UnrolledCost == RolledDynamicCost) {
- DEBUG(dbgs() << " No opportunities found.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost << "\n");
+ LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
return None;
}
}
@@ -600,20 +610,17 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
}
}
- DEBUG(dbgs() << "Analysis finished:\n"
- << "UnrolledCost: " << UnrolledCost << ", "
- << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ LLVM_DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
return {{UnrolledCost, RolledDynamicCost}};
}
/// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable, bool &Convergent,
- const TargetTransformInfo &TTI,
- AssumptionCache *AC, unsigned BEInsns) {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+unsigned llvm::ApproximateLoopSize(
+ const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
@@ -706,10 +713,11 @@ static uint64_t getUnrolledLoopSize(
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(
+bool llvm::computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
- unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+ unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
@@ -729,7 +737,7 @@ static bool computeUnrollCount(
UP.Runtime = true;
UP.AllowExpensiveTripCount = true;
UP.Force = true;
- if (UP.AllowRemainder &&
+ if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return true;
}
@@ -746,8 +754,8 @@ static bool computeUnrollCount(
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaThreshold value
- // which is larger than the default limits.
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
@@ -763,7 +771,7 @@ static bool computeUnrollCount(
// compute the former when the latter is zero.
unsigned ExactTripCount = TripCount;
assert((ExactTripCount == 0 || MaxTripCount == 0) &&
- "ExtractTripCound and MaxTripCount cannot both be non zero.");
+ "ExtractTripCount and MaxTripCount cannot both be non zero.");
unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
@@ -779,7 +787,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, TTI,
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -794,7 +802,7 @@ static bool computeUnrollCount(
}
// 4th priority is loop peeling
- computePeelCount(L, LoopSize, UP, TripCount);
+ computePeelCount(L, LoopSize, UP, TripCount, SE);
if (UP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -802,12 +810,12 @@ static bool computeUnrollCount(
}
// 5th priority is partial unrolling.
- // Try partial unroll only when TripCount could be staticaly calculated.
+ // Try partial unroll only when TripCount could be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
if (!UP.Partial) {
- DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
UP.Count = 0;
return false;
}
@@ -894,8 +902,9 @@ static bool computeUnrollCount(
// Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
- DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
+ LLVM_DEBUG(
+ dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
UP.Count = 0;
return false;
}
@@ -915,12 +924,13 @@ static bool computeUnrollCount(
if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
while (UP.Count != 0 && TripMultiple % UP.Count != 0)
UP.Count >>= 1;
- DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
- "specific or because the loop contains a convergent "
- "instruction), so unroll count must divide the trip "
- "multiple, "
- << TripMultiple << ". Reducing unroll count from "
- << OrigCount << " to " << UP.Count << ".\n");
+ LLVM_DEBUG(
+ dbgs() << "Remainder loop is restricted (that could architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from " << OrigCount
+ << " to " << UP.Count << ".\n");
using namespace ore;
@@ -942,7 +952,8 @@ static bool computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
if (UP.Count < 2)
UP.Count = 0;
return ExplicitUnroll;
@@ -955,12 +966,13 @@ static LoopUnrollResult tryToUnrollLoop(
Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
- DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
- << "] Loop %" << L->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Loop Unroll: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
if (HasUnrollDisablePragma(L))
return LoopUnrollResult::Unmodified;
if (!L->isLoopSimplifyForm()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
return LoopUnrollResult::Unmodified;
}
@@ -975,16 +987,21 @@ static LoopUnrollResult tryToUnrollLoop(
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return LoopUnrollResult::Unmodified;
- unsigned LoopSize = ApproximateLoopSize(
- L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
- DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NotDuplicatable) {
- DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
return LoopUnrollResult::Unmodified;
}
if (NumInlineCandidates != 0) {
- DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return LoopUnrollResult::Unmodified;
}
@@ -1030,7 +1047,7 @@ static LoopUnrollResult tryToUnrollLoop(
// loop tests remains the same compared to the non-unrolled version, whereas
// the generic upper bound unrolling keeps all but the last loop test so the
// number of loop tests goes up which may end up being worse on targets with
- // constriained branch predictor resources so is controlled by an option.)
+ // constrained branch predictor resources so is controlled by an option.)
// In addition we only unroll small upper bounds.
if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
MaxTripCount = 0;
@@ -1040,9 +1057,9 @@ static LoopUnrollResult tryToUnrollLoop(
// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.
bool UseUpperBound = false;
- bool IsCountSetExplicitly =
- computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
- TripMultiple, LoopSize, UP, UseUpperBound);
+ bool IsCountSetExplicitly = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
 // Unroll factor (Count) must be less than or equal to TripCount.
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index bd468338a1d0..b12586758925 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -28,7 +28,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -39,6 +39,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -66,7 +67,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -298,9 +298,9 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << L->getHeader()->getName() << ", contents cannot be "
- << "duplicated!\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
+ << ", contents cannot be "
+ << "duplicated!\n");
return false;
}
}
@@ -635,6 +635,12 @@ bool LoopUnswitch::processCurrentLoop() {
return true;
}
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::optForSize().
+ if (OptimizeForSize ||
+ loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+
// Run through the instructions in the loop, keeping track of three things:
//
// - That we do not unswitch loops containing convergent operations, as we
@@ -666,12 +672,6 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
- // Do not do non-trivial unswitch while optimizing for size.
- // FIXME: Use Function::optForSize().
- if (OptimizeForSize ||
- loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
- return false;
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
@@ -856,20 +856,20 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
TerminatorInst *TI) {
// Check to see if it would be profitable to unswitch current loop.
if (!BranchesInfo.CostAllowsUnswitching()) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Cost too high.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
return false;
}
if (hasBranchDivergence &&
getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Condition is divergent.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
return false;
}
@@ -910,6 +910,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BranchInst *OldBranch,
TerminatorInst *TI) {
assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ assert(TrueDest != FalseDest && "Branch targets should be different");
// Insert a conditional branch on LIC to the two preheaders. The original
// code is the true version and the new code is the false version.
Value *BranchVal = LIC;
@@ -942,9 +943,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
if (DT) {
// First, add both successors.
SmallVector<DominatorTree::UpdateType, 3> Updates;
- if (TrueDest != OldBranchParent)
+ if (TrueDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
- if (FalseDest != OldBranchParent)
+ if (FalseDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
// If both of the new successors are different from the old one, inform the
// DT that the edge was deleted.
@@ -970,11 +971,15 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock,
TerminatorInst *TI) {
- DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function "
- << L->getHeader()->getParent()->getName() << " on cond: " << *Val
- << " == " << *Cond << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n");
+ // We are going to make essential changes to the CFG. This may invalidate
+ // cached information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
// First step, split the preheader, so that we know that there is a safe place
// to insert the conditional branch. We will change loopPreheader to have a
@@ -1038,7 +1043,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// until it finds the trivial condition candidate (condition that is not a
// constant). Since unswitching generates branches with constant conditions,
// this scenario could be very common in practice.
- SmallSet<BasicBlock*, 8> Visited;
+ SmallPtrSet<BasicBlock*, 8> Visited;
while (true) {
// If we exit loop or reach a previous visited block, then
@@ -1196,13 +1201,15 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
Loop *L, TerminatorInst *TI) {
Function *F = loopHeader->getParent();
- DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function " << F->getName()
- << " when '" << *Val << "' == " << *LIC << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName() << " when '"
+ << *Val << "' == " << *LIC << "\n");
+ // We are going to make essential changes to the CFG. This may invalidate
+ // cached information for L or one of its parent loops in SCEV.
if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetLoop(L);
+ SEWP->getSE().forgetTopmostLoop(L);
LoopBlocks.clear();
NewBlocks.clear();
@@ -1274,12 +1281,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// If the successor of the exit block had PHI nodes, add an entry for
// NewExit.
- for (BasicBlock::iterator I = ExitSucc->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+ for (PHINode &PN : ExitSucc->phis()) {
+ Value *V = PN.getIncomingValueForBlock(ExitBlocks[i]);
ValueToValueMapTy::iterator It = VMap.find(V);
if (It != VMap.end()) V = It->second;
- PN->addIncoming(V, NewExit);
+ PN.addIncoming(V, NewExit);
}
if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
@@ -1356,7 +1362,7 @@ static void RemoveFromWorklist(Instruction *I,
static void ReplaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction*> &Worklist,
Loop *L, LPPassManager *LPM) {
- DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1496,10 +1502,9 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
BranchInst::Create(Abort, OldSISucc,
ConstantInt::getTrue(Context), NewSISucc);
// Release the PHI operands for this edge.
- for (BasicBlock::iterator II = NewSISucc->begin();
- PHINode *PN = dyn_cast<PHINode>(II); ++II)
- PN->setIncomingValue(PN->getBasicBlockIndex(Switch),
- UndefValue::get(PN->getType()));
+ for (PHINode &PN : NewSISucc->phis())
+ PN.setIncomingValue(PN.getBasicBlockIndex(Switch),
+ UndefValue::get(PN.getType()));
// Tell the domtree about the new block. We don't fully update the
// domtree here -- instead we force it to do a full recomputation
// after the pass is complete -- but we do need to inform it of
@@ -1526,7 +1531,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// Simple DCE.
if (isInstructionTriviallyDead(I)) {
- DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1559,8 +1564,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
if (!SinglePred) continue; // Nothing to do.
assert(SinglePred == Pred && "CFG broken");
- DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
- << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n");
// Resolve any single entry PHI nodes in Succ.
while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 53b25e688e82..06e86081e8a0 100644
--- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -68,6 +68,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <cassert>
@@ -111,7 +113,7 @@ static cl::opt<unsigned> LVLoopDepthThreshold(
"LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
cl::init(2), cl::Hidden);
-/// \brief Create MDNode for input string.
+/// Create MDNode for input string.
static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {
@@ -120,7 +122,7 @@ static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
return MDNode::get(Context, MDs);
}
-/// \brief Set input string into loop metadata by keeping other values intact.
+/// Set input string into loop metadata by keeping other values intact.
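+/// For instance, adding ("llvm.loop.licm_versioning.disable", 1) is expected
+/// to leave a loop ID operand roughly of the form
+/// !{!"llvm.loop.licm_versioning.disable", i32 1}.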
void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
unsigned V) {
SmallVector<Metadata *, 4> MDs(1);
@@ -166,6 +168,7 @@ struct LoopVersioningLICM : public LoopPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
@@ -178,6 +181,7 @@ struct LoopVersioningLICM : public LoopPass {
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ ORE = nullptr;
CurAST.reset();
}
@@ -207,7 +211,7 @@ private:
Loop *CurLoop = nullptr;
// AliasSet information for the current loop.
- std::unique_ptr<AliasSetTracker> CurAST;
+ std::unique_ptr<AliasSetTracker> CurAST;
// Maximum loop nest threshold
unsigned LoopDepthThreshold;
@@ -224,6 +228,9 @@ private:
// Read only loop marker.
bool IsReadOnlyLoop = true;
+ // OptimizationRemarkEmitter
+ OptimizationRemarkEmitter *ORE;
+
bool isLegalForVersioning();
bool legalLoopStructure();
bool legalLoopInstructions();
@@ -235,58 +242,57 @@ private:
} // end anonymous namespace
-/// \brief Check loop structure and confirms it's good for LoopVersioningLICM.
+/// Check the loop structure and confirm it's suitable for LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopStructure() {
// Loop must be in loop simplify form.
if (!CurLoop->isLoopSimplifyForm()) {
- DEBUG(
- dbgs() << " loop is not in loop-simplify form.\n");
+ LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
return false;
}
// Loop should be innermost loop, if not return false.
if (!CurLoop->getSubLoops().empty()) {
- DEBUG(dbgs() << " loop is not innermost\n");
+ LLVM_DEBUG(dbgs() << " loop is not innermost\n");
return false;
}
// Loop should have a single backedge, if not return false.
if (CurLoop->getNumBackEdges() != 1) {
- DEBUG(dbgs() << " loop has multiple backedges\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
return false;
}
// Loop must have a single exiting block, if not return false.
if (!CurLoop->getExitingBlock()) {
- DEBUG(dbgs() << " loop has multiple exiting block\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n");
return false;
}
 // We only handle bottom-tested loops, i.e. loops in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
- DEBUG(dbgs() << " loop is not bottom tested\n");
+ LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
return false;
}
// Parallel loops must not have aliasing loop-invariant memory accesses.
// Hence we don't need to version anything in this case.
if (CurLoop->isAnnotatedParallel()) {
- DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
return false;
}
 // A loop depth greater than LoopDepthThreshold is not allowed.
if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
- DEBUG(dbgs() << " loop depth is more then threshold\n");
+ LLVM_DEBUG(dbgs() << " loop depth is more then threshold\n");
return false;
}
// We need to be able to compute the loop trip count in order
// to generate the bound checks.
const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
if (ExitCount == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << " loop does not has trip count\n");
+ LLVM_DEBUG(dbgs() << " loop does not has trip count\n");
return false;
}
return true;
}
-/// \brief Check memory accesses in loop and confirms it's good for
+/// Check memory accesses in the loop and confirm they're suitable for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopMemoryAccesses() {
bool HasMayAlias = false;
@@ -328,24 +334,24 @@ bool LoopVersioningLICM::legalLoopMemoryAccesses() {
}
 // Ensure all tracked accesses are of the same type.
if (!TypeSafety) {
- DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
return false;
}
 // Ensure the loop body isn't read-only.
if (!HasMod) {
- DEBUG(dbgs() << " No memory modified in loop body\n");
+ LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
return false;
}
 // Make sure the alias set has a may-alias case.
 // If there is no aliasing ambiguity, return false.
if (!HasMayAlias) {
- DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
return false;
}
return true;
}
-/// \brief Check loop instructions safe for Loop versioning.
+/// Check whether a loop instruction is safe for loop versioning.
 /// It returns true if it's safe, else returns false.
 /// Consider the following:
/// 1) Check all load store in loop body are non atomic & non volatile.
@@ -355,12 +361,12 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
assert(I != nullptr && "Null instruction found!");
// Check function call safety
if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
- DEBUG(dbgs() << " Unsafe call site found.\n");
+ LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
return false;
}
 // Avoid loops with the possibility of a throw
if (I->mayThrow()) {
- DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
return false;
}
// If current instruction is load instructions
@@ -368,7 +374,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
if (I->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(I);
if (!Ld || !Ld->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple load.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
return false;
}
LoadAndStoreCounter++;
@@ -382,7 +388,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
else if (I->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(I);
if (!St || !St->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple store.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
return false;
}
LoadAndStoreCounter++;
@@ -396,59 +402,87 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
return true;
}
-/// \brief Check loop instructions and confirms it's good for
+/// Check the loop's instructions and confirm they're suitable for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopInstructions() {
// Resetting counters.
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ using namespace ore;
// Iterate over loop blocks and instructions of each block and check
// instruction safety.
for (auto *Block : CurLoop->getBlocks())
for (auto &Inst : *Block) {
// If instruction is unsafe just return false.
- if (!instructionSafeForVersioning(&Inst))
+ if (!instructionSafeForVersioning(&Inst)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
+ << " Unsafe Loop Instruction";
+ });
return false;
+ }
}
// Get LoopAccessInfo from current loop.
LAI = &LAA->getInfo(CurLoop);
// Check LoopAccessInfo for need of runtime check.
if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
- DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
return false;
}
 // Number of runtime checks should be less than RuntimeMemoryCheckThreshold
if (LAI->getNumRuntimePointerChecks() >
VectorizerParams::RuntimeMemoryCheckThreshold) {
- DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ LLVM_DEBUG(
+ dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Number of runtime checks "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
+ << " exceeds threshold "
+ << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
+ });
return false;
}
// Loop should have at least one invariant load or store instruction.
if (!InvariantCounter) {
- DEBUG(dbgs() << " Invariant not found !!\n");
+ LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
return false;
}
// Read only loop not allowed.
if (IsReadOnlyLoop) {
- DEBUG(dbgs() << " Found a read-only loop!\n");
+ LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
return false;
}
 // Profitability check:
// Check invariant threshold, should be in limit.
if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
- DEBUG(dbgs()
- << " Invariant load & store are less then defined threshold\n");
- DEBUG(dbgs() << " Invariant loads & stores: "
- << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
- DEBUG(dbgs() << " Invariant loads & store threshold: "
- << InvariantThreshold << "%\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Invariant load & store are less then defined threshold\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter)
+ << "%\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Invariant load & store "
+ << NV("LoadAndStoreCounter",
+ ((InvariantCounter * 100) / LoadAndStoreCounter))
+ << " are less then defined threshold "
+ << NV("Threshold", InvariantThreshold);
+ });
return false;
}
return true;
}
-/// \brief It checks loop is already visited or not.
+/// Checks whether the loop has already been visited.
 /// Check the loop metadata; if the loop was revisited, return true,
 /// else false.
bool LoopVersioningLICM::isLoopAlreadyVisited() {
@@ -459,42 +493,64 @@ bool LoopVersioningLICM::isLoopAlreadyVisited() {
return false;
}
-/// \brief Checks legality for LoopVersioningLICM by considering following:
+/// Checks legality for LoopVersioningLICM by considering following:
/// a) loop structure legality b) loop instruction legality
/// c) loop memory access legality.
/// Return true if legal else returns false.
bool LoopVersioningLICM::isLegalForVersioning() {
- DEBUG(dbgs() << "Loop: " << *CurLoop);
+ using namespace ore;
+ LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
 // Make sure we are not revisiting the same loop again.
if (isLoopAlreadyVisited()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
return false;
}
 // Check loop structure legality.
if (!legalLoopStructure()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop structure";
+ });
return false;
}
 // Check loop instruction legality.
if (!legalLoopInstructions()) {
- DEBUG(dbgs()
- << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
return false;
}
 // Check loop memory access legality.
if (!legalLoopMemoryAccesses()) {
- DEBUG(dbgs()
- << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop memory access";
+ });
return false;
}
// Loop versioning is feasible, return true.
- DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
+ CurLoop->getStartLoc(), CurLoop->getHeader())
+ << " Versioned loop for LICM."
+ << " Number of runtime checks we had to insert "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
+ });
return true;
}
-/// \brief Update loop with aggressive aliasing assumptions.
+/// Update loop with aggressive aliasing assumptions.
 /// It marks pairs of memory operations as no-alias, assuming the
 /// loop has no must-alias memory access pairs.
/// During LoopVersioningLICM legality we ignore loops having must
@@ -542,6 +598,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
LAI = nullptr;
// Set Current Loop
CurLoop = L;
@@ -592,6 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 46f8a3564265..68bfa0030395 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -357,7 +357,7 @@ PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
}
namespace {
-/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+/// Legacy pass for lowering expect intrinsics out of the IR.
///
/// When this pass is run over a function it uses expect intrinsics which feed
/// branches and switches to provide branch weight metadata for those
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9c870b42a747..3b74421a47a0 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -55,7 +56,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -263,7 +263,7 @@ public:
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
}
void addRange(int64_t Start, int64_t Size, Value *Ptr,
@@ -479,10 +479,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
- DEBUG(dbgs() << "Replace stores:\n";
- for (Instruction *SI : Range.TheStores)
- dbgs() << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
+ LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+ : Range.TheStores) dbgs()
+ << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
@@ -498,16 +498,25 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
return AMemSet;
}
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
- const LoadInst *LI) {
+static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) {
unsigned StoreAlign = SI->getAlignment();
if (!StoreAlign)
StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ return StoreAlign;
+}
+
+static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) {
unsigned LoadAlign = LI->getAlignment();
if (!LoadAlign)
LoadAlign = DL.getABITypeAlignment(LI->getType());
+ return LoadAlign;
+}
- return std::min(StoreAlign, LoadAlign);
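+// A note on MinAlign (used in findCommonAlignment below): for the
+// power-of-two alignments seen here it agrees with std::min, and it also
+// remains a valid common alignment for any non-power-of-two input, e.g.
+// MinAlign(12, 8) == 4.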
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = findStoreAlignment(DL, SI);
+ unsigned LoadAlign = findLoadAlignment(DL, LI);
+ return MinAlign(StoreAlign, LoadAlign);
}
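Note the switch from std::min to MinAlign in the merged helper. A minimal sketch of MinAlign's semantics, mirroring the definition in llvm/Support/MathExtras.h (shown here purely for illustration):

    #include <cstdint>

    // Returns the greatest power of two that divides both A and B; for
    // power-of-two alignments this is simply the smaller of the two.
    uint64_t minAlignSketch(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B)); // isolate the lowest set bit of A | B
    }
    // minAlignSketch(8, 4) == 4; minAlignSketch(16, 16) == 16.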
// This method tries to lift a store instruction before position P.
@@ -522,7 +531,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
return false;
// Keep track of the arguments of all instruction we plan to lift
- // so we can make sure to lift them as well if apropriate.
+ // so we can make sure to lift them as well if appropriate.
DenseSet<Instruction*> Args;
if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
if (Ptr->getParent() == SI->getParent())
@@ -594,7 +603,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
// We made it, we need to lift
for (auto *I : llvm::reverse(ToLift)) {
- DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -656,22 +665,23 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
UseMemMove = true;
- unsigned Align = findCommonAlignment(DL, SI, LI);
uint64_t Size = DL.getTypeStoreSize(T);
IRBuilder<> Builder(P);
Instruction *M;
if (UseMemMove)
- M = Builder.CreateMemMove(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemMove(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
else
- M = Builder.CreateMemCpy(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemCpy(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
- << " => " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
+ << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
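Illustrative source for the pattern processStore promotes here (hypothetical, not from the patch): an aggregate load feeding an aggregate store becomes a memcpy, or a memmove when the two pointers may alias.

    struct S { int a[8]; };

    void copyS(S *dst, S *src) {
      *dst = *src; // load + store of S; promoted to memcpy(dst, src, sizeof(S)),
                   // or memmove if dst and src may alias
    }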
@@ -760,7 +770,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
Size, Align, SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -1047,20 +1057,17 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// If all checks passed, then we can transform M.
- // Make sure to use the lesser of the alignment of the source and the dest
- // since we're changing where we're reading from, but don't want to increase
- // the alignment past what can be read from or written to.
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
- unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
-
IRBuilder<> Builder(M);
if (UseMemMove)
- Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
else
- Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
// Remove the instruction we're replacing.
MD->removeInstruction(M);
@@ -1106,7 +1113,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
// If Dest is aligned, and SrcSize is constant, use the minimum alignment
// of the sum.
const unsigned DestAlign =
- std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
if (DestAlign > 1)
if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
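A worked instance of the computation above, assuming Dest really is DestAlign-aligned (the stated precondition; values are illustrative):

    // With DestAlign == 16 and a constant SrcSize == 4, the trailing memset
    // writes starting at Dest + 4. The best guaranteed alignment of that
    // address is MinAlign(4, 16) == 4: the largest power of two dividing
    // both the offset and the base alignment.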
@@ -1166,7 +1173,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
IRBuilder<> Builder(MemCpy);
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MemCpy->getAlignment());
+ CopySize, MemCpy->getDestAlignment());
return true;
}
@@ -1192,7 +1199,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
IRBuilder<> Builder(M);
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- M->getAlignment(), false);
+ M->getDestAlignment(), false);
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
@@ -1221,8 +1228,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+      // FIXME: Can we pass in either of the dest/src alignments here instead
+ // of conservatively taking the minimum?
+ unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), M->getAlignment(),
+ CopySize->getZExtValue(), Align,
C)) {
MD->removeInstruction(M);
M->eraseFromParent();
@@ -1284,8 +1294,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
MemoryLocation::getForSource(M)))
return false;
- DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
// If not, then we know we can transform this.
Type *ArgTys[3] = { M->getRawDest()->getType(),
@@ -1337,7 +1347,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// source of the memcpy to the alignment we need. If we fail, we bail out.
AssumptionCache &AC = LookupAssumptionCache();
DominatorTree &DT = LookupDomTree();
- if (MDep->getAlignment() < ByValAlign &&
+ if (MDep->getSourceAlignment() < ByValAlign &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
@@ -1367,9 +1377,9 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
"tmpcast", CS.getInstruction());
- DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
// Otherwise we're good! Update the byval argument.
CS.setArgument(ArgNo, TmpCast);
@@ -1381,10 +1391,19 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool MadeChange = false;
+ DominatorTree &DT = LookupDomTree();
+
// Walk all instruction in the function.
for (BasicBlock &BB : F) {
+    // Skip unreachable blocks. For example, processStore assumes that an
+ // instruction in a BB can't be dominated by a later instruction in the
+ // same BB (which is a scenario that can happen for an unreachable BB that
+ // has itself as a predecessor).
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
+ // Avoid invalidating the iterator.
Instruction *I = &*BI++;
bool RepeatInstruction = false;
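A hedged sketch of the problematic shape the new guard rules out, written as LLVM IR inside a comment (illustrative only; the verifier relaxes dominance checks for unreachable code, so such blocks can legally exist):

    // unreach:                        ; preds = %unreach (never entered)
    //   %x = add i32 %y, 1            ; uses %y, which is defined below
    //   %y = add i32 %x, 1
    //   br label %unreach
    // Inside an unreachable self-looping block, a "later" instruction can
    // dominate an "earlier" one, which breaks processStore's intra-block
    // assumption; hence the DT.isReachableFromEntry check.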
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index 9869a3fb96fa..ff0183a8ea2d 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -71,30 +71,30 @@ struct BCEAtom {
};
// If this value is a load from a constant offset w.r.t. a base address, and
-// there are no othe rusers of the load or address, returns the base address and
+// there are no other users of the load or address, returns the base address and
// the offset.
BCEAtom visitICmpLoadOperand(Value *const Val) {
BCEAtom Result;
if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
- DEBUG(dbgs() << "load\n");
+ LLVM_DEBUG(dbgs() << "load\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
if (LoadI->isVolatile()) {
- DEBUG(dbgs() << "volatile\n");
+ LLVM_DEBUG(dbgs() << "volatile\n");
return {};
}
Value *const Addr = LoadI->getOperand(0);
if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
- DEBUG(dbgs() << "GEP\n");
+ LLVM_DEBUG(dbgs() << "GEP\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
const auto &DL = GEP->getModule()->getDataLayout();
if (!isDereferenceablePointer(GEP, DL)) {
- DEBUG(dbgs() << "not dereferenceable\n");
+ LLVM_DEBUG(dbgs() << "not dereferenceable\n");
// We need to make sure that we can do comparisons in any order, so we
// require memory to be unconditionally dereferenceable.
return {};
@@ -110,6 +110,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val) {
}
// A basic block with a comparison between two BCE atoms.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
// Note: the terminology is misleading: the comparison is symmetric, so there
// is no real {l/r}hs. What we want though is to have the same base on the
// left (resp. right), so that we can detect consecutive loads. To ensure this
@@ -127,7 +131,7 @@ class BCECmpBlock {
return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
}
- // Assert the the block is consistent: If valid, it should also have
+ // Assert the block is consistent: If valid, it should also have
// non-null members besides Lhs_ and Rhs_.
void AssertConsistent() const {
if (IsValid()) {
@@ -144,37 +148,95 @@ class BCECmpBlock {
// Returns true if the block does other work besides the comparison.
bool doesOtherWork() const;
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit() const;
+
+  // Return true if all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent) const;
+
// The basic block where this comparison happens.
BasicBlock *BB = nullptr;
// The ICMP for this comparison.
ICmpInst *CmpI = nullptr;
// The terminating branch.
BranchInst *BranchI = nullptr;
+  // Whether the block requires splitting.
+ bool RequireSplit = false;
- private:
+private:
BCEAtom Lhs_;
BCEAtom Rhs_;
int SizeBits_ = 0;
};
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts) const {
+  // If this instruction has side effects and it's in the middle of the BCE
+  // cmp block instructions, then bail for now.
+ // TODO: use alias analysis to tell whether there is real interference.
+ if (Inst->mayHaveSideEffects())
+ return false;
+ // Make sure this instruction does not use any of the BCE cmp block
+ // instructions as operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+    // This is a non-BCE-cmp-block instruction, and it can be separated
+    // from the BCE-cmp-block instructions.
+ OtherInsts.push_back(&Inst);
+ }
+
+  // Do the actual splitting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
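Why iterating OtherInsts in reverse preserves program order: each instruction is inserted at the front of the new parent, so the last one moved ends up first. A small self-contained demonstration with a std::list standing in for the instruction list (hypothetical data):

    #include <cassert>
    #include <list>

    int main() {
      std::list<int> newParent;            // stands in for NewParent's body
      const int otherInsts[3] = {1, 2, 3}; // program order of OtherInsts
      for (int i = 2; i >= 0; --i)         // reverse(OtherInsts)
        newParent.insert(newParent.begin(), otherInsts[i]);
      assert((newParent == std::list<int>{1, 2, 3})); // order preserved
    }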
+
+bool BCECmpBlock::canSplit() const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts))
+ return false;
+ }
+ }
+ return true;
+}
+
bool BCECmpBlock::doesOtherWork() const {
AssertConsistent();
+ // All the instructions we care about in the BCE cmp block.
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
// TODO(courbet): Can we allow some other things? This is very conservative.
-  // We might be able to get away with anything does does not have any side
+  // We might be able to get away with anything that does not have any side
// effects outside of the basic block.
// Note: The GEPs and/or loads are not necessarily in the same block.
for (const Instruction &Inst : *BB) {
- if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
- if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
- } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
- if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
- } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
- if (C != CmpI) return true;
- } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
- if (Br != BranchI) return true;
- } else {
+ if (!BlockInsts.count(&Inst))
return true;
- }
}
return false;
}
@@ -183,10 +245,19 @@ bool BCECmpBlock::doesOtherWork() const {
// BCE atoms, returns the comparison.
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
const ICmpInst::Predicate ExpectedPredicate) {
+ // The comparison can only be used once:
+ // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi.
+ // If there are any other uses of the comparison, we cannot merge it with
+ // other comparisons as we would create an orphan use of the value.
+ if (!CmpI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "cmp has several uses\n");
+ return {};
+ }
if (CmpI->getPredicate() == ExpectedPredicate) {
- DEBUG(dbgs() << "cmp "
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
- << "\n");
+ LLVM_DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
if (!Lhs.Base()) return {};
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
@@ -204,7 +275,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
if (Block->empty()) return {};
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
if (!BranchI) return {};
- DEBUG(dbgs() << "branch\n");
+ LLVM_DEBUG(dbgs() << "branch\n");
if (BranchI->isUnconditional()) {
// In this case, we expect an incoming value which is the result of the
// comparison. This is the last link in the chain of comparisons (note
@@ -212,7 +283,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// can be reordered).
auto *const CmpI = dyn_cast<ICmpInst>(Val);
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
Result.CmpI = CmpI;
Result.BranchI = BranchI;
@@ -221,12 +292,12 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// In this case, we expect a constant incoming value (the comparison is
// chained).
const auto *const Const = dyn_cast<ConstantInt>(Val);
- DEBUG(dbgs() << "const\n");
+ LLVM_DEBUG(dbgs() << "const\n");
if (!Const->isZero()) return {};
- DEBUG(dbgs() << "false\n");
+ LLVM_DEBUG(dbgs() << "false\n");
auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
auto Result = visitICmp(
@@ -238,6 +309,18 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
return {};
}
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &Comparison) {
+ LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
+ << "': Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().Base() << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().Base() << " + "
+ << Comparison.Rhs().Offset << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+}
+
// A chain of comparisons.
class BCECmpChain {
public:
@@ -263,9 +346,9 @@ class BCECmpChain {
// Merges the given comparison blocks into one memcmp block and update
// branches. Comparisons are assumed to be contiguous. If NextBBInChain is
// null, the merged block will link to the phi block.
- static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const NextBBInChain, PHINode &Phi,
- const TargetLibraryInfo *const TLI);
+ void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
PHINode &Phi_;
std::vector<BCECmpBlock> Comparisons_;
@@ -275,24 +358,47 @@ class BCECmpChain {
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
: Phi_(Phi) {
+ assert(!Blocks.empty() && "a chain should have at least one block");
// Now look inside blocks to check for BCE comparisons.
std::vector<BCECmpBlock> Comparisons;
- for (BasicBlock *Block : Blocks) {
+ for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
+ BasicBlock *const Block = Blocks[BlockIdx];
+ assert(Block && "invalid block");
BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
Block, Phi.getParent());
Comparison.BB = Block;
if (!Comparison.IsValid()) {
- DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+ LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
return;
}
if (Comparison.doesOtherWork()) {
- DEBUG(dbgs() << "block does extra work besides compare\n");
- if (Comparisons.empty()) { // First block.
- // TODO(courbet): The first block can do other things, and we should
- // split them apart in a separate block before the comparison chain.
- // Right now we just discard it and make the chain shorter.
- DEBUG(dbgs()
- << "ignoring first block that does extra work besides compare\n");
+ LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
+ << "' does extra work besides compare\n");
+ if (Comparisons.empty()) {
+        // This is the initial block in the chain. If this block does other
+        // work, we can try to split it and move the irrelevant instructions
+        // to the predecessor.
+        //
+        // If this is not the initial block in the chain, splitting it won't
+        // work: once split, there would still be instructions before the BCE
+        // cmp instructions that do other work in program order, i.e. within
+        // the chain before sorting, unless we could abort the chain at this
+        // point and start anew.
+        //
+        // NOTE: we only handle blocks with a single predecessor for now.
+ if (Comparison.canSplit()) {
+ LLVM_DEBUG(dbgs()
+ << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, Comparison);
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
continue;
}
// TODO(courbet): Right now we abort the whole chain. We could be
@@ -320,13 +426,13 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
// We could still merge bb1 and bb2 though.
return;
}
- DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
- << " bits between " << Comparison.Lhs().Base() << " + "
- << Comparison.Lhs().Offset << " and "
- << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
- << "\n");
- DEBUG(dbgs() << "\n");
- Comparisons.push_back(Comparison);
+ enqueueBlock(Comparisons, Comparison);
+ }
+
+ // It is possible we have no suitable comparison to merge.
+ if (Comparisons.empty()) {
+ LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+ return;
}
EntryBlock_ = Comparisons[0].BB;
Comparisons_ = std::move(Comparisons);
@@ -336,10 +442,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
#endif // MERGEICMPS_DOT_ON
// Reorder blocks by LHS. We can do that without changing the
// semantics because we are only accessing dereferenceable memory.
- std::sort(Comparisons_.begin(), Comparisons_.end(),
- [](const BCECmpBlock &a, const BCECmpBlock &b) {
- return a.Lhs() < b.Lhs();
- });
+ llvm::sort(Comparisons_.begin(), Comparisons_.end(),
+ [](const BCECmpBlock &a, const BCECmpBlock &b) {
+ return a.Lhs() < b.Lhs();
+ });
#ifdef MERGEICMPS_DOT_ON
errs() << "AFTER REORDERING:\n\n";
dump();
@@ -389,10 +495,24 @@ bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
Phi_.removeIncomingValue(Comparison.BB, false);
}
+  // If the entry block is part of the chain, we need to make the first block
+ // of the chain the new entry block of the function.
+ BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock();
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (Entry == Comparisons_[I].BB) {
+ BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "",
+ Entry->getParent(), Entry);
+ BranchInst::Create(Entry, NEntryBB);
+ break;
+ }
+ }
+
// Point the predecessors of the chain to the first comparison block (which is
- // the new entry point).
- if (EntryBlock_ != Comparisons_[0].BB)
+ // the new entry point) and update the entry block of the chain.
+ if (EntryBlock_ != Comparisons_[0].BB) {
EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+ EntryBlock_ = Comparisons_[0].BB;
+ }
// Effectively merge blocks.
int NumMerged = 1;
@@ -424,7 +544,15 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
LLVMContext &Context = BB->getContext();
if (Comparisons.size() >= 2) {
- DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+    // If one of the blocks requires splitting, we do it now, i.e. just
+    // before we know we will collapse the chain. The split-off instructions
+    // can safely be executed before any of the instructions in the chain.
+ auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
+ [](const BCECmpBlock &B) { return B.RequireSplit; });
+ if (C != Comparisons.end())
+ C->split(EntryBlock_);
+
+ LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
const auto TotalSize =
std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
[](int Size, const BCECmpBlock &C) {
@@ -445,7 +573,8 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
IRBuilder<> Builder(BB);
const auto &DL = Phi.getModule()->getDataLayout();
Value *const MemCmpCall = emitMemCmp(
- FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+ FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
Builder, DL, TLI);
Value *const MemCmpIsZero = Builder.CreateICmpEQ(
MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
@@ -468,17 +597,17 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
} else {
assert(Comparisons.size() == 1);
// There are no blocks to merge, but we still need to update the branches.
- DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
if (NextBBInChain) {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> conditional\n");
// Just update the "true" target, the "false" target should already be
// the phi block.
assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
} else {
- DEBUG(dbgs() << "unconditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> conditional\n");
// Replace the unconditional branch by a conditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
@@ -488,14 +617,14 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
}
} else {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> unconditional\n");
// Replace the conditional branch by an unconditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
Builder.CreateBr(Phi.getParent());
Phi.addIncoming(FirstComparison.CmpI, BB);
} else {
- DEBUG(dbgs() << "unconditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n");
Phi.addIncoming(FirstComparison.CmpI, BB);
}
}
@@ -507,27 +636,28 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
int NumBlocks) {
// Walk up from the last block to find other blocks.
std::vector<BasicBlock *> Blocks(NumBlocks);
+ assert(LastBlock && "invalid last block");
BasicBlock *CurBlock = LastBlock;
for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
if (CurBlock->hasAddressTaken()) {
// Somebody is jumping to the block through an address, all bets are
// off.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has its address taken\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
return {};
}
Blocks[BlockIndex] = CurBlock;
auto *SinglePredecessor = CurBlock->getSinglePredecessor();
if (!SinglePredecessor) {
// The block has two or more predecessors.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has two or more predecessors\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
return {};
}
if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
// The block does not link back to the phi.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " does not link back to the phi\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
return {};
}
CurBlock = SinglePredecessor;
@@ -537,9 +667,9 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
}
bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
- DEBUG(dbgs() << "processPhi()\n");
+ LLVM_DEBUG(dbgs() << "processPhi()\n");
if (Phi.getNumIncomingValues() <= 1) {
- DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
return false;
}
// We are looking for something that has the following structure:
@@ -552,7 +682,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
// - The last basic block (bb4 here) must branch unconditionally to bb_phi.
// It's the only block that contributes a non-constant value to the Phi.
// - All other blocks (b1, b2, b3) must have exactly two successors, one of
- // them being the the phi block.
+ // them being the phi block.
// - All intermediate blocks (bb2, bb3) must have only one predecessor.
// - Blocks cannot do other work besides the comparison, see doesOtherWork()
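As an illustration (hypothetical source, not taken from the pass), this is the kind of code that lowers to such a chain of comparison blocks feeding a single phi, which MergeICmps can then collapse into one memcmp when the compared fields are contiguous:

    struct Point {
      int x, y, z;
    };

    bool equal(const Point &a, const Point &b) {
      // Short-circuit '&&' produces one basic block per comparison, each
      // branching either to the next comparison or to the common phi block.
      return a.x == b.x && a.y == b.y && a.z == b.z;
    }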
@@ -563,18 +693,31 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
if (LastBlock) {
// There are several non-constant values.
- DEBUG(dbgs() << "skip: several non-constant values\n");
+ LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
+ cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
+ Phi.getIncomingBlock(I)) {
+        // The non-constant incoming value is not from a cmp instruction or
+        // is not produced by the last block. We could end up processing the
+        // value-producing block more than once.
+ //
+ // This is an uncommon case, so we bail.
+ LLVM_DEBUG(
+ dbgs()
+ << "skip: non-constant value not from cmp or not from last block.\n");
return false;
}
LastBlock = Phi.getIncomingBlock(I);
}
if (!LastBlock) {
// There is no non-constant block.
- DEBUG(dbgs() << "skip: no non-constant block\n");
+ LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
return false;
}
if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
- DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
return false;
}
@@ -584,7 +727,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
BCECmpChain CmpChain(Blocks, Phi);
if (CmpChain.size() < 2) {
- DEBUG(dbgs() << "skip: only one compare block\n");
+ LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
return false;
}
@@ -619,12 +762,16 @@ class MergeICmps : public FunctionPass {
PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI) {
- DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
// We only try merging comparisons if the target wants to expand memcmp later.
// The rationale is to avoid turning small chains into memcmp calls.
if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
+  // If we don't have memcmp available we can't emit calls to it.
+ if (!TLI->has(LibFunc_memcmp))
+ return PreservedAnalyses::all();
+
bool MadeChange = false;
for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index f2f615cb9b0f..3464b759280f 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
//! \file
-//! \brief This pass performs merges of loads and stores on both sides of a
+//! This pass performs merges of loads and stores on both sides of a
// diamond (hammock). It hoists the loads and sinks the stores.
//
// The algorithm iteratively hoists two loads to the same address out of a
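Illustrative only (hypothetical source, not from the patch): the diamond (hammock) shape this pass targets, where two stores to the same address can be sunk into the tail block behind a phi:

    int g;

    void diamond(bool c, int a, int b) {
      if (c)
        g = a;   // store in the 'then' block
      else
        g = b;   // store in the 'else' block
      // After the transform: a single store of phi(a, b) in the tail block.
    }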
@@ -80,7 +80,6 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Debug.h"
@@ -97,7 +96,6 @@ namespace {
// MergedLoadStoreMotion Pass
//===----------------------------------------------------------------------===//
class MergedLoadStoreMotion {
- MemoryDependenceResults *MD = nullptr;
AliasAnalysis *AA = nullptr;
// The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -107,14 +105,9 @@ class MergedLoadStoreMotion {
const int MagicCompileTimeControl = 250;
public:
- bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
+ bool run(Function &F, AliasAnalysis &AA);
private:
- ///
- /// \brief Remove instruction from parent and update memory dependence
- /// analysis.
- ///
- void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
// Routines for sinking stores
@@ -128,23 +121,7 @@ private:
} // end anonymous namespace
///
-/// \brief Remove instruction from parent and update memory dependence analysis.
-///
-void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
- // Notify the memory dependence analysis.
- if (MD) {
- MD->removeInstruction(Inst);
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- MD->invalidateCachedPointerInfo(LI->getPointerOperand());
- if (Inst->getType()->isPtrOrPtrVectorTy()) {
- MD->invalidateCachedPointerInfo(Inst);
- }
- }
- Inst->eraseFromParent();
-}
-
-///
-/// \brief Return tail block of a diamond.
+/// Return tail block of a diamond.
///
BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
@@ -152,7 +129,7 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
}
///
-/// \brief True when BB is the head of a diamond (hammock)
+/// True when BB is the head of a diamond (hammock)
///
bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
if (!BB)
@@ -179,7 +156,7 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
///
-/// \brief True when instruction is a sink barrier for a store
+/// True when instruction is a sink barrier for a store
/// located in Loc
///
/// Whenever an instruction could possibly read or modify the
@@ -197,13 +174,13 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
}
///
-/// \brief Check if \p BB contains a store to the same address as \p SI
+/// Check if \p BB1 contains a store to the same address as \p Store0
///
/// \return The store in \p BB1 when it is safe to sink. Otherwise return null.
///
StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
StoreInst *Store0) {
- DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
BasicBlock *BB0 = Store0->getParent();
for (Instruction &Inst : reverse(*BB1)) {
auto *Store1 = dyn_cast<StoreInst>(&Inst);
@@ -222,7 +199,7 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
}
///
-/// \brief Create a PHI node in BB for the operands of S0 and S1
+/// Create a PHI node in BB for the operands of S0 and S1
///
PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
StoreInst *S1) {
@@ -236,13 +213,11 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
&BB->front());
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
///
-/// \brief Merge two stores to same address and sink into \p BB
+/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
@@ -254,9 +229,9 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
(A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
(A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
// Hoist the instruction.
BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
// Intersect optional metadata.
@@ -275,19 +250,19 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// New PHI operand? Use it.
if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
SNew->setOperand(0, NewPN);
- removeInstruction(S0);
- removeInstruction(S1);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
A0->replaceAllUsesWith(ANew);
- removeInstruction(A0);
+ A0->eraseFromParent();
A1->replaceAllUsesWith(ANew);
- removeInstruction(A1);
+ A1->eraseFromParent();
return true;
}
return false;
}
///
-/// \brief True when two stores are equivalent and can sink into the footer
+/// True when two stores are equivalent and can sink into the footer
///
/// Starting from a diamond tail block, iterate over the instructions in one
/// predecessor block and try to match a store in the second predecessor.
@@ -310,7 +285,8 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
return false; // No. More than 2 predecessors.
// #Instructions in Succ1 for Compile Time Control
- int Size1 = Pred1->size();
+ auto InstsNoDbg = Pred1->instructionsWithoutDebug();
+ int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
int NStores = 0;
for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
@@ -338,19 +314,17 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
break;
RBI = Pred0->rbegin();
RBE = Pred0->rend();
- DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
}
}
return MergedStores;
}
-bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
- AliasAnalysis &AA) {
- this->MD = MD;
+bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
this->AA = &AA;
bool Changed = false;
- DEBUG(dbgs() << "Instruction Merger\n");
+ LLVM_DEBUG(dbgs() << "Instruction Merger\n");
// Merge unconditional branches, allowing PRE to catch more
// optimization opportunities.
@@ -376,15 +350,13 @@ public:
}
///
- /// \brief Run the transformation for each function
+ /// Run the transformation for each function
///
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
MergedLoadStoreMotion Impl;
- auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
- return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
- getAnalysis<AAResultsWrapperPass>().getAAResults());
+ return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
}
private:
@@ -392,7 +364,6 @@ private:
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
}
};
@@ -400,7 +371,7 @@ char MergedLoadStoreMotionLegacyPass::ID = 0;
} // anonymous namespace
///
-/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+/// createMergedLoadStoreMotionPass - The public interface to this file.
///
FunctionPass *llvm::createMergedLoadStoreMotionPass() {
return new MergedLoadStoreMotionLegacyPass();
@@ -408,7 +379,6 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
@@ -416,14 +386,12 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
PreservedAnalyses
MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
MergedLoadStoreMotion Impl;
- auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
- if (!Impl.run(F, MD, AA))
+ if (!Impl.run(F, AA))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
- PA.preserve<MemoryDependenceAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp
index b026c8d692c3..7106ea216ad6 100644
--- a/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -83,6 +83,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -240,10 +240,17 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
Changed = true;
SE->forgetValue(&*I);
I->replaceAllUsesWith(NewI);
- // If SeenExprs constains I's WeakTrackingVH, that entry will be
- // replaced with
- // nullptr.
+ WeakVH NewIExist = NewI;
+      // If the following call deletes a value tracked by SeenExprs'
+      // WeakTrackingVH entries or by the WeakVH NewIExist, that handle is
+      // replaced with nullptr.
RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ if (!NewIExist) {
+        // Rare occasion where the new instruction (NewI) has been removed,
+        // probably because parts of the input code were dead from the
+        // beginning. Reset the iterator and start over from the beginning.
+ I = BB->begin();
+ continue;
+ }
I = NewI->getIterator();
}
// Add the rewritten instruction to SeenExprs; the original instruction
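A minimal sketch of the value-handle property this fix relies on (assuming llvm/IR/ValueHandle.h): a WeakVH observes deletion of the value it tracks and nulls itself, which is what makes the !NewIExist test meaningful:

    // WeakVH Handle = SomeInstruction;
    // ... RecursivelyDeleteTriviallyDeadInstructions may erase it ...
    // if (!Handle) {
    //   // The instruction was deleted; an iterator formed from it would be
    //   // dangling, so restart iteration instead of calling getIterator().
    // }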
@@ -429,6 +436,9 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ // There is no need to reassociate 0.
+ if (SE->getSCEV(I)->isZero())
+ return nullptr;
if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
return NewI;
if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 9ebf2d769356..2eb887c986be 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -77,6 +77,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -221,13 +221,13 @@ private:
Components.resize(Components.size() + 1);
auto &Component = Components.back();
Component.insert(I);
- DEBUG(dbgs() << "Component root is " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
InComponent.insert(I);
ValueToComponent[I] = ComponentID;
// Pop a component off the stack and label it.
while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
auto *Member = Stack.back();
- DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
Component.insert(Member);
InComponent.insert(Member);
ValueToComponent[Member] = ComponentID;
@@ -366,9 +366,8 @@ public:
// True if this class has no memory members.
bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
- // Return true if two congruence classes are equivalent to each other. This
- // means
- // that every field but the ID number and the dead field are equivalent.
+ // Return true if two congruence classes are equivalent to each other. This
+  // means that every field but the ID number and the dead field is equivalent.
bool isEquivalentTo(const CongruenceClass *Other) const {
if (!Other)
return false;
@@ -383,10 +382,12 @@ public:
if (!DefiningExpr || !Other->DefiningExpr ||
*DefiningExpr != *Other->DefiningExpr)
return false;
- // We need some ordered set
- std::set<Value *> AMembers(Members.begin(), Members.end());
- std::set<Value *> BMembers(Members.begin(), Members.end());
- return AMembers == BMembers;
+
+ if (Members.size() != Other->Members.size())
+ return false;
+
+ return all_of(Members,
+ [&](const Value *V) { return Other->Members.count(V); });
}
private:
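Worth noting: the removed code built both ordered sets from the same container (Members twice), so the comparison was vacuously true; the replacement compares sizes and then membership. A self-contained sketch of that idiom over plain unordered_set<int> (illustrative, not the class's actual member type):

    #include <algorithm>
    #include <unordered_set>

    // Two sets are equivalent iff they have the same size and every element
    // of one is contained in the other.
    bool sameMembers(const std::unordered_set<int> &A,
                     const std::unordered_set<int> &B) {
      return A.size() == B.size() &&
             std::all_of(A.begin(), A.end(),
                         [&](int V) { return B.count(V) != 0; });
    }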
@@ -860,7 +861,7 @@ private:
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
- std::pair<int, int> StartingVNCounter;
+ int64_t StartingVNCounter;
};
} // end anonymous namespace
@@ -958,7 +959,8 @@ static bool isCopyOfAPHI(const Value *V) {
// order. The BlockInstRange numbers are generated in an RPO walk of the basic
// blocks.
void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
- std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) {
+ llvm::sort(Ops.begin(), Ops.end(),
+ [&](const ValPair &P1, const ValPair &P2) {
return BlockInstRange.lookup(P1.second).first <
BlockInstRange.lookup(P2.second).first;
});
@@ -1067,8 +1069,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " constant " << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
@@ -1076,8 +1078,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " variable " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
deleteExpression(E);
return createVariableExpression(V);
}
@@ -1100,8 +1102,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
}
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
deleteExpression(E);
return CC->getDefiningExpr();
@@ -1257,7 +1259,7 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
// This must be an instruction because we are only called from phi nodes
// in the case that the value it needs to check against is an instruction.
- // The most likely candiates for dominance are the leader and the next leader.
+ // The most likely candidates for dominance are the leader and the next leader.
// The leader or next leader will dominate in all cases where there is an
// equivalent that is higher up in the dom tree.
// We can't *only* check them, however, because the
@@ -1421,8 +1423,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *C = dyn_cast<Constant>(
lookupOperandLeader(DepSI->getValueOperand()))) {
- DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
- << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
+ << " to constant " << *C << "\n");
return createConstantExpression(
getConstantStoreValueForLoad(C, Offset, LoadType, DL));
}
@@ -1437,8 +1439,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
if (auto *PossibleConstant =
getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
- << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1447,8 +1449,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *PossibleConstant =
getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
- << " to constant " << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1529,7 +1531,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
if (!PI)
return nullptr;
- DEBUG(dbgs() << "Found predicate info from instruction !\n");
+ LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
auto *PWC = dyn_cast<PredicateWithCondition>(PI);
if (!PWC)
@@ -1569,7 +1571,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ LLVM_DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1584,11 +1586,11 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
if (isa<PredicateAssume>(PI)) {
- // If the comparison is true when the operands are equal, then we know the
- // operands are equal, because assumes must always be true.
- if (CmpInst::isTrueWhenEqual(Predicate)) {
+ // If we assume the operands are equal, then they are equal.
+ if (Predicate == CmpInst::ICMP_EQ) {
addPredicateUsers(PI, I);
- addAdditionalUsers(Cmp->getOperand(0), I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
return createVariableOrConstant(FirstOp);
}
}
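The narrowing from isTrueWhenEqual to ICMP_EQ is the substance of this change: predicates such as ule or sge are true when the operands are equal, yet their truth does not imply equality, so substituting one operand for the other was unsound. Illustrative values (not from the pass):

    // assume(a ule b) holds for a == 1, b == 2, so it cannot justify
    // replacing b with a; only assume(a eq b) implies the operands are
    // interchangeable.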
@@ -1622,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Instrinsics with the returned attribute are copies of arguments.
+ // Intrinsics with the returned attribute are copies of arguments.
if (auto *ReturnedValue = II->getReturnedArgOperand()) {
if (II->getIntrinsicID() == Intrinsic::ssa_copy)
if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
@@ -1652,10 +1654,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
CongruenceClass *NewClass) {
assert(NewClass &&
"Every MemoryAccess should be getting mapped to a non-null class");
- DEBUG(dbgs() << "Setting " << *From);
- DEBUG(dbgs() << " equivalent to congruence class ");
- DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+ LLVM_DEBUG(dbgs() << "Setting " << *From);
+ LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
+ LLVM_DEBUG(dbgs() << NewClass->getID()
+ << " with current MemoryAccess leader ");
+ LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1673,11 +1676,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of a memory member " << *From
- << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
markMemoryLeaderChangeTouched(OldClass);
}
}
@@ -1705,7 +1708,7 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
if (ICS == ICS_Unknown) {
SCCFinder.Start(I);
auto &SCC = SCCFinder.getComponentFor(I);
- // It's cycle free if it's size 1 or or the SCC is *only* phi nodes.
+    // It's cycle free if its size is 1 or the SCC is *only* phi nodes.
if (SCC.size() == 1)
InstCycleState.insert({I, ICS_CycleFree});
else {
@@ -1753,12 +1756,13 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// If it has undef at this point, it means there are no non-undef arguments,
// and thus the value of the phi node must be undef.
if (HasUndef) {
- DEBUG(dbgs() << "PHI Node " << *I
- << " has no non-undef arguments, valuing it as undef\n");
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
return createConstantExpression(UndefValue::get(I->getType()));
}
- DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
deleteExpression(E);
return createDeadExpression();
}
@@ -1797,8 +1801,8 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
return E;
NumGVNPhisAllSame++;
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
- << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
deleteExpression(E);
return createVariableOrConstant(AllSameValue);
}
@@ -2091,7 +2095,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
- DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -2207,13 +2211,13 @@ Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
//
// - I must be moving to NewClass from OldClass
// - The StoreCount of OldClass and NewClass is expected to have been updated
-// for I already if it is is a store.
+// for I already if it is a store.
// - The OldClass memory leader has not been updated yet if I was the leader.
void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
MemoryAccess *InstMA,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- // If the leader is I, and we had a represenative MemoryAccess, it should
+ // If the leader is I, and we had a representative MemoryAccess, it should
// be the MemoryAccess of OldClass.
assert((!InstMA || !OldClass->getMemoryLeader() ||
OldClass->getLeader() != I ||
@@ -2227,8 +2231,9 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
(isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
NewClass->setMemoryLeader(InstMA);
// Mark it touched if we didn't just create a singleton
- DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
- << " due to new memory instruction becoming leader\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
markMemoryLeaderChangeTouched(NewClass);
}
setMemoryClass(InstMA, NewClass);
@@ -2236,10 +2241,10 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
if (OldClass->getMemoryLeader() == InstMA) {
if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of old leader " << *InstMA << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
markMemoryLeaderChangeTouched(OldClass);
} else
OldClass->setMemoryLeader(nullptr);
@@ -2276,9 +2281,10 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
- DEBUG(dbgs() << "Changing leader of congruence class "
- << NewClass->getID() << " from " << *NewClass->getLeader()
- << " to " << *SI << " because store joined class\n");
+ LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from "
+ << *NewClass->getLeader() << " to " << *SI
+ << " because store joined class\n");
// If we changed the leader, we have to mark it changed because we don't
// know what it will do to symbolic evaluation.
NewClass->setLeader(SI);
@@ -2298,8 +2304,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
- << " from table\n");
+ LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
// We erase it as an exact expression to make sure we don't just erase an
// equivalent one.
auto Iter = ExpressionToClass.find_as(
@@ -2316,8 +2322,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Value class leader change for class "
+ << OldClass->getID() << "\n");
++NumGVNLeaderChanges;
// Destroy the stored value if there are no more stores to represent it.
// Note that this is basically clean up for the expression removal that
@@ -2380,12 +2386,14 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
"VariableExpression should have been handled already");
EClass = NewClass;
- DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->getID()
- << " and leader " << *(NewClass->getLeader()));
+ LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at "
+ << NewClass->getID() << " and leader "
+ << *(NewClass->getLeader()));
if (NewClass->getStoredValue())
- DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " and stored value "
+ << *(NewClass->getStoredValue()));
+ LLVM_DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
@@ -2403,8 +2411,8 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
- << "\n");
+ LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
+ << *E << "\n");
if (ClassChanged) {
moveValueToNewCongruenceClass(I, E, IClass, EClass);
markPhiOfOpsChanged(E);
@@ -2442,13 +2450,15 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (ReachableEdges.insert({From, To}).second) {
// If this block wasn't reachable before, all instructions are touched.
if (ReachableBlocks.insert(To).second) {
- DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " marked reachable\n");
const auto &InstRange = BlockInstRange.lookup(To);
TouchedInstructions.set(InstRange.first, InstRange.second);
} else {
- DEBUG(dbgs() << "Block " << getBlockName(To)
- << " was reachable, but new edge {" << getBlockName(From)
- << "," << getBlockName(To) << "} to it found\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {"
+ << getBlockName(From) << "," << getBlockName(To)
+ << "} to it found\n");
// We've made an edge reachable to an existing block, which may
// impact predicates. Otherwise, only mark the phi nodes as touched, as
@@ -2495,12 +2505,12 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BasicBlock *FalseSucc = BR->getSuccessor(1);
if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
if (CI->isOne()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to true\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
updateReachableEdge(B, TrueSucc);
} else if (CI->isZero()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to false\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
updateReachableEdge(B, FalseSucc);
}
} else {
@@ -2685,8 +2695,8 @@ Value *NewGVN::findLeaderForInst(Instruction *TransInst,
auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
if (!FoundVal) {
ExpressionToPhiOfOps[E].insert(OrigInst);
- DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
- << " in block " << getBlockName(PredBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
return nullptr;
}
if (auto *SI = dyn_cast<StoreInst>(FoundVal))
@@ -2723,116 +2733,143 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
MemAccess->getDefiningAccess()->getBlock() == I->getParent())
return nullptr;
- SmallPtrSet<const Value *, 10> VisitedOps;
// Convert op of phis to phi of ops
- for (auto *Op : I->operand_values()) {
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ SmallVector<Value *, 4> Ops(I->operand_values());
+ BasicBlock *SamePHIBlock = nullptr;
+ PHINode *OpPHI = nullptr;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ for (auto *Op : Ops) {
if (!isa<PHINode>(Op)) {
auto *ValuePHI = RealToTemp.lookup(Op);
if (!ValuePHI)
continue;
- DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
Op = ValuePHI;
}
- auto *OpPHI = cast<PHINode>(Op);
+ OpPHI = cast<PHINode>(Op);
+ if (!SamePHIBlock) {
+ SamePHIBlock = getBlockForValue(OpPHI);
+ } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "PHIs for operands are not all in the same block, aborting\n");
+ return nullptr;
+ }
// No point in doing this for one-operand phis.
- if (OpPHI->getNumOperands() == 1)
+ if (OpPHI->getNumOperands() == 1) {
+ OpPHI = nullptr;
continue;
- if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
- return nullptr;
- SmallVector<ValPair, 4> Ops;
- SmallPtrSet<Value *, 4> Deps;
- auto *PHIBlock = getBlockForValue(OpPHI);
- RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
- for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
- auto *PredBB = OpPHI->getIncomingBlock(PredNum);
- Value *FoundVal = nullptr;
- // We could just skip unreachable edges entirely but it's tricky to do
- // with rewriting existing phi nodes.
- if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it that is
- // translated back into the predecessor, and see if we have a leader.
- Instruction *ValueOp = I->clone();
- if (MemAccess)
- TempToMemory.insert({ValueOp, MemAccess});
- bool SafeForPHIOfOps = true;
- VisitedOps.clear();
- for (auto &Op : ValueOp->operands()) {
- auto *OrigOp = &*Op;
- // When these operand changes, it could change whether there is a
- // leader for us or not, so we have to add additional users.
- if (isa<PHINode>(Op)) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- if (Op != OrigOp && Op != I)
- Deps.insert(Op);
- } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
- if (getBlockForValue(ValuePHI) == PHIBlock)
- Op = ValuePHI->getIncomingValueForBlock(PredBB);
- }
- // If we phi-translated the op, it must be safe.
- SafeForPHIOfOps =
- SafeForPHIOfOps &&
- (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ }
+
+ if (!OpPHI)
+ return nullptr;
+
+ SmallVector<ValPair, 4> PHIOps;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ SmallPtrSet<Value *, 4> CurrentDeps;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+          // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ CurrentDeps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
}
- // FIXME: For those things that are not safe we could generate
- // expressions all the way down, and see if this comes out to a
- // constant. For anything where that is true, and unsafe, we should
- // have made a phi-of-ops (or value numbered it equivalent to something)
- // for the pieces already.
- FoundVal = !SafeForPHIOfOps ? nullptr
- : findLeaderForInst(ValueOp, Visited,
- MemAccess, I, PredBB);
- ValueOp->deleteValue();
- if (!FoundVal)
- return nullptr;
- } else {
- DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
- << getBlockName(PredBB)
- << " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
-
- Ops.push_back({FoundVal, PredBB});
- DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
- << getBlockName(PredBB) << "\n");
- }
- for (auto Dep : Deps)
- addAdditionalUsers(Dep, I);
- sortPHIOps(Ops);
- auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
- if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
- DEBUG(dbgs()
- << "Not creating real PHI of ops because it simplified to existing "
- "value or constant\n");
- return E;
- }
- auto *ValuePHI = RealToTemp.lookup(I);
- bool NewPHI = false;
- if (!ValuePHI) {
- ValuePHI =
- PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
- addPhiOfOps(ValuePHI, PHIBlock, I);
- NewPHI = true;
- NumGVNPHIOfOpsCreated++;
- }
- if (NewPHI) {
- for (auto PHIOp : Ops)
- ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
- } else {
- unsigned int i = 0;
- for (auto PHIOp : Ops) {
- ValuePHI->setIncomingValue(i, PHIOp.first);
- ValuePHI->setIncomingBlock(i, PHIOp.second);
- ++i;
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal) {
+ // We failed to find a leader for the current ValueOp, but this might
+      // change if the translated operands change.
+ if (SafeForPHIOfOps)
+ for (auto Dep : CurrentDeps)
+ addAdditionalUsers(Dep, I);
+
+ return nullptr;
}
+ Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
- << "\n");
+ PHIOps.push_back({FoundVal, PredBB});
+ LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(PHIOps);
+ auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
return E;
}
- return nullptr;
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : PHIOps)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ TempToBlock[ValuePHI] = PHIBlock;
+ unsigned int i = 0;
+ for (auto PHIOp : PHIOps) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
}
// The algorithm initially places the values of the routine in the TOP
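// A small sketch of the DebugCounter idiom that the restructured
// makePossiblePHIOfOps above now consults once per candidate instruction
// (before inspecting any operands) instead of once per PHI operand. The
// counter and function names below are illustrative, not part of the patch:
#include "llvm/Support/DebugCounter.h"
using namespace llvm;

DEBUG_COUNTER(ExampleCounter, "example-transform",
              "Controls which applications of the example transform run");

static bool maybeTransform() {
  // When bisecting with
  //   -debug-counter=example-transform-skip=N,example-transform-count=M
  // only applications N+1 through N+M actually execute.
  if (!DebugCounter::shouldExecute(ExampleCounter))
    return false;
  return true; // ... the real transform would go here.
}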
@@ -2902,8 +2939,9 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
- << " has " << CongruenceClasses[i]->size() << " members\n");
+ LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size()
+ << " members\n");
// Make sure we delete the congruence class (probably worth switching to
// a unique_ptr at some point).
delete CongruenceClasses[i];
@@ -2973,7 +3011,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
// we change its DFS number so that it doesn't get value numbered.
if (isInstructionTriviallyDead(&I, TLI)) {
InstrDFS[&I] = 0;
- DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
markInstructionForDeletion(&I);
continue;
}
@@ -3039,9 +3077,10 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
[&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
if (AllEqual)
- DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
+ << "\n");
else
- DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
// If it's equal to something, it's in that class. Otherwise, it has to be in
// a class where it is the leader (other things may be equivalent to it, but
// it needs to start off in its own class, which means it must have been the
@@ -3060,7 +3099,7 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// Value number a single instruction, symbolically evaluating, performing
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
- DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
if (!I->isTerminator()) {
const Expression *Symbolized = nullptr;
SmallPtrSet<Value *, 2> Visited;
@@ -3246,7 +3285,7 @@ void NewGVN::verifyMemoryCongruency() const {
// and redoing the iteration to see if anything changed.
void NewGVN::verifyIterationSettled(Function &F) {
#ifndef NDEBUG
- DEBUG(dbgs() << "Beginning iteration verification\n");
+ LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
if (DebugCounter::isCounterSet(VNCounter))
DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
@@ -3364,9 +3403,9 @@ void NewGVN::iterateTouchedInstructions() {
// If it's not reachable, erase any touched instructions and move on.
if (!BlockReachable) {
TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
+ LLVM_DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
continue;
}
updateProcessedCount(CurrBlock);
@@ -3376,7 +3415,7 @@ void NewGVN::iterateTouchedInstructions() {
TouchedInstructions.reset(InstrNum);
if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
valueNumberMemoryPhi(MP);
} else if (auto *I = dyn_cast<Instruction>(V)) {
valueNumberInstruction(I);
@@ -3422,10 +3461,10 @@ bool NewGVN::runGVN() {
for (auto &B : RPOT) {
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
- std::sort(Node->begin(), Node->end(),
- [&](const DomTreeNode *A, const DomTreeNode *B) {
- return RPOOrdering[A] < RPOOrdering[B];
- });
+ llvm::sort(Node->begin(), Node->end(),
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
@@ -3446,8 +3485,8 @@ bool NewGVN::runGVN() {
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
- DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
- << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
iterateTouchedInstructions();
@@ -3472,8 +3511,8 @@ bool NewGVN::runGVN() {
};
for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
- DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
- << " is unreachable\n");
+ LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
deleteInstructionsInBlock(&BB);
Changed = true;
}
@@ -3695,7 +3734,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
}
void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
// Delete the instructions backwards, as it has a reduced likelihood of having
@@ -3722,12 +3761,12 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
- DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
InstructionsToErase.insert(I);
}
void NewGVN::replaceInstruction(Instruction *I, Value *V) {
- DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
patchAndReplaceAllUsesWith(I, V);
// We save the actual erasing to avoid invalidating memory
// dependencies until we are done with everything.
@@ -3853,9 +3892,10 @@ bool NewGVN::eliminateInstructions(Function &F) {
auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
for (auto &Operand : PHI->incoming_values())
if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
- DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
- << getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
+ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
+ << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
Operand.set(UndefValue::get(PHI->getType()));
}
};
@@ -3887,7 +3927,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Map to store the use counts
DenseMap<const Value *, unsigned int> UseCounts;
for (auto *CC : reverse(CongruenceClasses)) {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
// Track the equivalent store info so we can decide whether to try
// dead store elimination.
SmallVector<ValueDFS, 8> PossibleDeadStores;
@@ -3925,8 +3966,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
MembersLeft.insert(Member);
continue;
}
- DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
+ << *Member << "\n");
auto *I = cast<Instruction>(Member);
assert(Leader != I && "About to accidentally remove our leader");
replaceInstruction(I, Leader);
@@ -3947,7 +3988,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
- std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+ llvm::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
@@ -3966,24 +4007,24 @@ bool NewGVN::eliminateInstructions(Function &F) {
// remove from temp instruction list.
AllTempInstructions.erase(PN);
auto *DefBlock = getBlockForValue(Def);
- DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
- << " into block "
- << getBlockName(getBlockForValue(Def)) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
PN->insertBefore(&DefBlock->front());
Def = PN;
NumGVNPHIOfOpsEliminations++;
}
if (EliminationStack.empty()) {
- DEBUG(dbgs() << "Elimination Stack is empty\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
} else {
- DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
- << EliminationStack.dfs_back().first << ","
- << EliminationStack.dfs_back().second << ")\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
}
- DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
- << MemberDFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
// First, we see if we are out of scope or empty. If so,
// and there are equivalences, we try to replace the top of
// stack with equivalences (if it's on the stack, it must
@@ -4058,14 +4099,16 @@ bool NewGVN::eliminateInstructions(Function &F) {
Value *DominatingLeader = EliminationStack.back();
auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
+ if (isSSACopy)
DominatingLeader = II->getOperand(0);
// Don't replace our existing users with ourselves.
if (U->get() == DominatingLeader)
continue;
- DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
- << *U->get() << " in " << *(U->getUser()) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
// metadata. Skip this if we are replacing predicateinfo with its
@@ -4081,7 +4124,9 @@ bool NewGVN::eliminateInstructions(Function &F) {
// It's about to be alive again.
if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
ProbablyDead.erase(cast<Instruction>(DominatingLeader));
- if (LeaderUseCount == 0 && II)
+ // Copy instructions, however, are still dead because we use their
+ // operand as the leader.
+ if (LeaderUseCount == 0 && isSSACopy)
ProbablyDead.insert(II);
++LeaderUseCount;
AnythingReplaced = true;
@@ -4106,7 +4151,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
// If we have possible dead stores to look at, try to eliminate them.
if (CC->getStoreCount() > 0) {
convertClassToLoadsAndStores(*CC, PossibleDeadStores);
- std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ llvm::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
ValueDFSStack EliminationStack;
for (auto &VD : PossibleDeadStores) {
int MemberDFSIn = VD.DFSIn;
@@ -4129,8 +4174,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
(void)Leader;
assert(DT->dominates(Leader->getParent(), Member->getParent()));
// Member is dominated by Leader, and thus dead
- DEBUG(dbgs() << "Marking dead store " << *Member
- << " that is dominated by " << *Leader << "\n");
+ LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
markInstructionForDeletion(Member);
CC->erase(Member);
++NumGVNDeadStores;
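// The elimination walk above avoids repeated dominator-tree queries by
// comparing depth-first interval numbers: a leader on the stack dominates a
// member exactly when the leader's (DFSIn, DFSOut) interval encloses the
// member's. A minimal sketch of that containment test (names illustrative):
struct DFSInterval {
  int DFSIn;
  int DFSOut;
};

// A dominates B in the DFS numbering iff A's interval encloses B's.
static bool encloses(const DFSInterval &A, const DFSInterval &B) {
  return A.DFSIn <= B.DFSIn && B.DFSOut <= A.DFSOut;
}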
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 2d0cb6fbf211..8f30bccf48f1 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -55,6 +55,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -65,7 +66,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "safepoint-placement"
@@ -323,7 +323,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// avoiding the runtime cost of the actual safepoint.
if (!AllBackedges) {
if (mustBeFiniteCountedLoop(L, SE, Pred)) {
- DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
FiniteExecution++;
continue;
}
@@ -332,7 +332,9 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// Note: This is only semantically legal since we won't do any further
// IPO or inlining before the actual call insertion. If we hadn't, we
// might later lose this call safepoint.
- DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "skipping safepoint placement due to unconditional call\n");
CallInLoop++;
continue;
}
@@ -348,7 +350,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// variables) and branches to the true header
TerminatorInst *Term = Pred->getTerminator();
- DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+ LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
PollLocations.push_back(Term);
}
@@ -522,7 +524,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
};
// We need the order of the list to be stable so that naming ends up stable
// when we split edges. This makes test cases much easier to write.
- std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+ llvm::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
// We can sometimes end up with duplicate poll locations. This happens if
// a single loop is visited more than once. The fact this happens seems
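// The std::sort -> llvm::sort switches throughout this patch are
// behavior-preserving in regular builds; llvm::sort only differs under
// EXPENSIVE_CHECKS, where it shuffles the range first so that any ordering
// the caller silently relies on, but the comparator does not enforce, varies
// from run to run and surfaces as test failures. A simplified sketch of the
// idea, assuming the EXPENSIVE_CHECKS macro convention:
#include <algorithm>
#include <random>

template <typename IteratorTy>
inline void example_sort(IteratorTy Start, IteratorTy End) {
#ifdef EXPENSIVE_CHECKS
  // Randomize first: elements that compare equal land in a different order
  // on every run, exposing hidden non-determinism instead of masking it.
  std::mt19937 Generator(std::random_device{}());
  std::shuffle(Start, End, Generator);
#endif
  std::sort(Start, End);
}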
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 88dcaf0f8a36..c81ac70d99e6 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -42,6 +43,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -55,7 +57,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <utility>
@@ -168,8 +169,8 @@ void ReassociatePass::BuildRankMap(Function &F,
// Assign distinct ranks to function arguments.
for (auto &Arg : F.args()) {
ValueRankMap[&Arg] = ++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
- << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
}
// Traverse basic blocks in ReversePostOrder
@@ -200,17 +201,17 @@ unsigned ReassociatePass::getRank(Value *V) {
// for PHI nodes, we cannot have infinite recursion here, because there
// cannot be loops in the value graph that do not go through PHI nodes.
unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands();
- i != e && Rank != MaxRank; ++i)
+ for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
Rank = std::max(Rank, getRank(I->getOperand(i)));
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
- if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
- !BinaryOperator::isFNeg(I))
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
+ << "\n");
return ValueRankMap[I] = Rank;
}
@@ -445,7 +446,7 @@ using RepeatedValue = std::pair<Value*, APInt>;
/// type and thus make the expression bigger.
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
- DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
@@ -494,14 +495,14 @@ static bool LinearizeExprTree(BinaryOperator *I,
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
APInt Weight = P.second; // Number of paths to this operand.
- DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
assert(Visited.insert(Op).second && "Not first visit!");
- DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
}
@@ -514,7 +515,8 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
- DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
continue;
@@ -540,7 +542,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// to the expression, then no longer consider it to be a leaf and add
// its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
Worklist.push_back(std::make_pair(BO, It->second));
Leaves.erase(It);
continue;
@@ -573,9 +575,10 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
(Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
- DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
BO = LowerNegateToMultiply(BO);
- DEBUG(dbgs() << *BO << '\n');
+ LLVM_DEBUG(dbgs() << *BO << '\n');
Worklist.push_back(std::make_pair(BO, Weight));
Changed = true;
continue;
@@ -583,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// Failed to morph into an expression of the right type. This really is
// a leaf.
- DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
@@ -675,9 +678,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (NewLHS == OldRHS && NewRHS == OldLHS) {
// The order of the operands was reversed. Swap them.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->swapOperands();
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
break;
@@ -685,7 +688,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// The new operation differs non-trivially from the original. Overwrite
// the old operands with the new ones.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
if (BO && !NotRewritable.count(BO))
@@ -698,7 +701,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
@@ -711,7 +714,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// while the right-hand side will be the current element of Ops.
Value *NewRHS = Ops[i].Op;
if (NewRHS != Op->getOperand(1)) {
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewRHS == Op->getOperand(0)) {
// The new right-hand side was already present as the left operand. If
// we are lucky then swapping the operands will sort out both of them.
@@ -724,7 +727,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
}
@@ -756,9 +759,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NewOp = NodesToRewrite.pop_back_val();
}
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->setOperand(0, NewOp);
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
@@ -781,6 +784,18 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (ExpressionChanged == I)
break;
+
+  // Discard any debug info related to the expressions that have changed (we
+  // can leave debug info related to the root, since the result of the
+ // expression tree should be the same even after reassociation).
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, ExpressionChanged);
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(ExpressionChanged->getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
} while (true);
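// A self-contained sketch of the "point debug uses at undef" idiom the new
// block above applies to each moved subexpression (mirroring the patch; the
// helper name is illustrative). After reassociation, the old intermediate
// value may no longer be computed where a dbg.value intrinsic refers to it:
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void invalidateDebugUses(Instruction *I) {
  SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
  findDbgUsers(DbgUsers, I); // Collect the dbg.value users of I.
  Value *Undef = UndefValue::get(I->getType());
  for (auto *DII : DbgUsers)
    // Rewriting operand 0 detaches each intrinsic from the stale value.
    DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
                                            ValueAsMetadata::get(Undef)));
}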
@@ -798,7 +813,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
/// pushing the negates through adds. These will be revisited to see if
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
- SetVector<AssertingVH<Instruction>> &ToRedo) {
+ ReassociatePass::OrderedSet &ToRedo) {
if (auto *C = dyn_cast<Constant>(V))
return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
ConstantExpr::getNeg(C);
@@ -912,8 +927,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
/// If we have (X-Y), and if either X is an add, or if this is only used by an
/// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *
-BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+static BinaryOperator *BreakUpSubtract(Instruction *Sub,
+ ReassociatePass::OrderedSet &ToRedo) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
@@ -929,7 +944,7 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
- DEBUG(dbgs() << "Negated: " << *New << '\n');
+ LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
}
@@ -1415,7 +1430,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
- DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
+ << '\n');
++NumFactor;
// Insert a new multiply.
@@ -1553,7 +1569,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
- DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
+ << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
@@ -1622,7 +1639,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
return nullptr;
}
-/// \brief Build up a vector of value/power pairs factoring a product.
+/// Build up a vector of value/power pairs factoring a product.
///
/// Given a series of multiplication operands, build a vector of factors and
/// the powers each is raised to when forming the final product. Sort them in
@@ -1687,7 +1704,7 @@ static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
return true;
}
-/// \brief Build a tree of multiplies, computing the product of Ops.
+/// Build a tree of multiplies, computing the product of Ops.
static Value *buildMultiplyTree(IRBuilder<> &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
@@ -1704,7 +1721,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
return LHS;
}
-/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
///
/// Given a vector of values raised to various powers, where no two values are
/// equal and the powers are sorted in decreasing order, compute the minimal
@@ -1859,8 +1876,8 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
// Remove dead instructions and if any operands are trivially dead add them to
// Insts so they will be removed as well.
-void ReassociatePass::RecursivelyEraseDeadInsts(
- Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
+ OrderedSet &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
ValueRankMap.erase(I);
@@ -1876,7 +1893,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+ LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
@@ -1893,7 +1910,14 @@ void ReassociatePass::EraseInst(Instruction *I) {
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op).second)
Op = Op->user_back();
- RedoInsts.insert(Op);
+
+ // The instruction we're going to push may be coming from a
+ // dead block, and Reassociate skips the processing of unreachable
+ // blocks because it's a waste of time and also because it can
+  // lead to an infinite loop due to LLVM's non-standard definition
+ // of dominance.
+ if (ValueRankMap.find(Op) != ValueRankMap.end())
+ RedoInsts.insert(Op);
}
MadeChange = true;
@@ -2120,7 +2144,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
ValueEntry(getRank(E.first), E.first));
}
- DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
@@ -2138,7 +2162,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
- DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
if (I->getDebugLoc())
@@ -2169,7 +2193,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
}
}
- DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
if (Ops.size() == 1) {
if (Ops[0].Op == I)
@@ -2321,7 +2345,7 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Make a copy of all the instructions to be redone so we can remove dead
// instructions.
- SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ OrderedSet ToRedo(RedoInsts);
// Iterate over all instructions to be reevaluated and remove trivially dead
// instructions. If any operand of the trivially dead instruction becomes
// dead mark it for deletion as well. Continue this process until all
@@ -2337,7 +2361,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Now that we have removed dead instructions, we can reoptimize the
// remaining instructions.
while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.pop_back_val();
+ Instruction *I = RedoInsts.front();
+ RedoInsts.erase(RedoInsts.begin());
if (isInstructionTriviallyDead(I))
EraseInst(I);
else
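// The RedoInsts drain above switches from LIFO (pop_back_val) to FIFO
// (front() plus erase(begin())), so instructions are reprocessed in the
// order they were queued. A minimal sketch of the FIFO drain over a
// SetVector-style worklist (element type simplified to int):
#include "llvm/ADT/SetVector.h"
using namespace llvm;

static void drainFIFO(SetVector<int> &Worklist) {
  while (!Worklist.empty()) {
    int Item = Worklist.front();      // Oldest entry first.
    Worklist.erase(Worklist.begin()); // Mirrors the patch's erase(begin()).
    (void)Item;                       // ... process Item here ...
  }
}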
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 96295683314c..018feb035a4f 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
@@ -25,7 +26,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
#include <list>
using namespace llvm;
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index c44edbed8ed9..391e43f79121 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -476,6 +476,12 @@ findBaseDefiningValueOfVector(Value *I) {
if (auto *BC = dyn_cast<BitCastInst>(I))
return findBaseDefiningValue(BC->getOperand(0));
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -610,8 +616,8 @@ static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
Value *&Cached = Cache[I];
if (!Cached) {
Cached = findBaseDefiningValue(I).BDV;
- DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
- << Cached->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
}
assert(Cache[I] != nullptr);
return Cached;
@@ -842,9 +848,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after initialization:\n");
+ LLVM_DEBUG(dbgs() << "States after initialization:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -917,9 +923,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after meet iteration:\n");
+ LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -960,7 +966,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
if (isa<PHINode>(I)) {
BasicBlock *BB = I->getParent();
- int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ int NumPreds = pred_size(BB);
assert(NumPreds > 0 && "how did we reach here");
std::string Name = suffixed_name_or(I, ".base", "base_phi");
return PHINode::Create(I->getType(), NumPreds, Name, I);
@@ -1118,10 +1124,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
assert(BDV && Base);
assert(!isKnownBaseResult(BDV) && "why did it get added?");
- DEBUG(dbgs() << "Updating base value cache"
- << " for: " << BDV->getName() << " from: "
- << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
- << " to: " << Base->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
if (Cache.count(BDV)) {
assert(isKnownBaseResult(Base) &&
@@ -1369,7 +1376,7 @@ public:
assert(OldI != NewI && "Disallowed at construction?!");
assert((!IsDeoptimize || !New) &&
- "Deoptimize instrinsics are not replaced!");
+ "Deoptimize intrinsics are not replaced!");
Old = nullptr;
New = nullptr;
@@ -1379,7 +1386,7 @@ public:
if (IsDeoptimize) {
// Note: we've inserted instructions, so the call to llvm.deoptimize may
- // not necessarilly be followed by the matching return.
+ // not necessarily be followed by the matching return.
auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
new UnreachableInst(RI->getContext(), RI);
RI->eraseFromParent();
@@ -1805,7 +1812,7 @@ static void relocationViaAlloca(
SmallVector<Instruction *, 20> Uses;
// PERF: trade a linear scan for repeated reallocation
- Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ Uses.reserve(Def->getNumUses());
for (User *U : Def->users()) {
if (!isa<ConstantExpr>(U)) {
// If the def has a ConstantExpr use, then the def is either a
@@ -1817,7 +1824,7 @@ static void relocationViaAlloca(
}
}
- std::sort(Uses.begin(), Uses.end());
+ llvm::sort(Uses.begin(), Uses.end());
auto Last = std::unique(Uses.begin(), Uses.end());
Uses.erase(Last, Uses.end());
@@ -1977,7 +1984,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
Cost += 2;
} else {
- llvm_unreachable("unsupported instruciton type during rematerialization");
+ llvm_unreachable("unsupported instruction type during rematerialization");
}
}
@@ -2024,7 +2031,7 @@ static void rematerializeLiveValues(CallSite CS,
SmallVector<Value *, 32> LiveValuesToBeDeleted;
for (Value *LiveValue: Info.LiveSet) {
- // For each live pointer find it's defining chain
+ // For each live pointer find its defining chain
SmallVector<Instruction *, 3> ChainToBase;
assert(Info.PointerToBase.count(LiveValue));
Value *RootOfChain =
@@ -2461,22 +2468,8 @@ static void stripNonValidDataFromBody(Function &F) {
continue;
}
- if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
- assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
- bool IsImmutableTBAA =
- MD->getNumOperands() == 4 &&
- mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
-
- if (!IsImmutableTBAA)
- continue; // no work to do, MD_tbaa is already marked mutable
-
- MDNode *Base = cast<MDNode>(MD->getOperand(0));
- MDNode *Access = cast<MDNode>(MD->getOperand(1));
- uint64_t Offset =
- mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
-
- MDNode *MutableTBAA =
- Builder.createTBAAStructTagNode(Base, Access, Offset);
+ if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
+ MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
@@ -2537,30 +2530,31 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
return false;
};
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers.
+ DeferredDominance DD(DT);
+ bool MadeChange = removeUnreachableBlocks(F, nullptr, &DD);
+ DD.flush();
+
// Gather all the statepoints which need to be rewritten. Be careful to only
// consider those in reachable code since we need to ask dominance queries
// when rewriting. We'll delete the unreachable ones in a moment.
SmallVector<CallSite, 64> ParsePointNeeded;
- bool HasUnreachableStatepoint = false;
for (Instruction &I : instructions(F)) {
// TODO: only the ones with the flag set!
if (NeedsRewrite(I)) {
- if (DT.isReachableFromEntry(I.getParent()))
- ParsePointNeeded.push_back(CallSite(&I));
- else
- HasUnreachableStatepoint = true;
+      // NOTE: removeUnreachableBlocks() is stronger than
+      // DominatorTree::isReachableFromEntry(). In other words,
+ // removeUnreachableBlocks can remove some blocks for which
+ // isReachableFromEntry() returns true.
+ assert(DT.isReachableFromEntry(I.getParent()) &&
+ "no unreachable blocks expected");
+ ParsePointNeeded.push_back(CallSite(&I));
}
}
- bool MadeChange = false;
-
- // Delete any unreachable statepoints so that we don't have unrewritten
- // statepoints surviving this pass. This makes testing easier and the
- // resulting IR less confusing to human readers. Rather than be fancy, we
- // just reuse a utility function which removes the unreachable blocks.
- if (HasUnreachableStatepoint)
- MadeChange |= removeUnreachableBlocks(F);
-
// Return early if no work to do.
if (ParsePointNeeded.empty())
return MadeChange;
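// A sketch of the deferred dominator-tree update idiom the rewritten
// runOnFunction above uses: prune unreachable blocks up front, queue the
// dominator-tree updates, then flush them once. DeferredDominance is the
// interface at this revision; later LLVM releases renamed it DomTreeUpdater:
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static bool pruneUnreachable(Function &F, DominatorTree &DT) {
  DeferredDominance DD(DT); // Batch DT updates instead of applying them
                            // eagerly for every deleted block.
  bool Changed = removeUnreachableBlocks(F, /*LVI=*/nullptr, &DD);
  DD.flush();               // Apply all queued updates in one pass.
  return Changed;
}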
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 66608ec631f6..5e3ddeda2d49 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -17,7 +17,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -30,6 +29,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/IR/BasicBlock.h"
@@ -54,9 +54,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
#include <vector>
@@ -71,8 +69,6 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by"
- "IPSCCP");
namespace {
@@ -223,6 +219,10 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
/// represented here for efficient lookup.
SmallPtrSet<Function *, 16> MRVFunctionsTracked;
+ /// MustTailFunctions - Each function here is a callee of non-removable
+ /// musttail call site.
+ SmallPtrSet<Function *, 16> MustTailCallees;
+
/// TrackingIncomingArguments - This is the set of functions for whose
/// arguments we make optimistic assumptions about and try to prove as
/// constants.
@@ -257,7 +257,7 @@ public:
bool MarkBlockExecutable(BasicBlock *BB) {
if (!BBExecutable.insert(BB).second)
return false;
- DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
BBWorkList.push_back(BB); // Add the block to the work list!
return true;
}
@@ -289,6 +289,18 @@ public:
TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
}
+  /// AddMustTailCallee - Called when the SCCP solver finds that this function
+  /// is the callee of a non-removable musttail call site.
+ void AddMustTailCallee(Function *F) {
+ MustTailCallees.insert(F);
+ }
+
+  /// Returns true if the given function is called from a non-removable
+  /// musttail call site.
+ bool isMustTailCallee(Function *F) {
+ return MustTailCallees.count(F);
+ }
+
void AddArgumentTrackedFunction(Function *F) {
TrackingIncomingArguments.insert(F);
}
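// A hypothetical caller-side sketch of how the new AddMustTailCallee hook
// might be fed: a musttail call must be perfectly forwarded, so the callee's
// return value cannot be rewritten to a constant and the callee must stay
// intact. Only the AddMustTailCallee/isMustTailCallee entry points come from
// the patch itself; the traversal below is illustrative:
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

template <typename SolverT>
static void recordMustTailCallees(Function &F, SolverT &Solver) {
  for (BasicBlock &BB : F)
    for (Instruction &I : BB)
      if (auto *CI = dyn_cast<CallInst>(&I))
        if (CI->isMustTailCall())
          if (Function *Callee = CI->getCalledFunction())
            Solver.AddMustTailCallee(Callee);
}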
@@ -313,6 +325,10 @@ public:
return BBExecutable.count(BB);
}
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
std::vector<LatticeVal> StructValues;
auto *STy = dyn_cast<StructType>(V->getType());
@@ -325,20 +341,13 @@ public:
return StructValues;
}
- ValueLatticeElement getLatticeValueFor(Value *V) {
+ const LatticeVal &getLatticeValueFor(Value *V) const {
assert(!V->getType()->isStructTy() &&
"Should use getStructLatticeValueFor");
- std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
- PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = PI.first->second;
- if (PI.second) {
- DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
- assert(I != ValueState.end() &&
- "V not found in ValueState nor Paramstate map!");
- LV = I->second.toValueLattice();
- }
-
- return LV;
+ DenseMap<Value *, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ return I->second;
}
/// getTrackedRetVals - Get the inferred return value map.
@@ -358,6 +367,12 @@ public:
return MRVFunctionsTracked;
}
+ /// getMustTailCallees - Get the set of functions which are called
+ /// from non-removable musttail call sites.
+ const SmallPtrSet<Function *, 16> getMustTailCallees() {
+ return MustTailCallees;
+ }
+
/// markOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
void markOverdefined(Value *V) {
@@ -393,55 +408,57 @@ private:
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
- void markConstant(LatticeVal &IV, Value *V, Constant *C) {
- if (!IV.markConstant(C)) return;
- DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ bool markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (!IV.markConstant(C)) return false;
+ LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
+ return true;
}
- void markConstant(Value *V, Constant *C) {
+ bool markConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- markConstant(ValueState[V], V, C);
+ return markConstant(ValueState[V], V, C);
}
void markForcedConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
LatticeVal &IV = ValueState[V];
IV.markForcedConstant(C);
- DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
}
// markOverdefined - Make a value be marked as "overdefined". If the
// value is not already overdefined, add it to the overdefined instruction
// work list so that the users of the instruction are updated later.
- void markOverdefined(LatticeVal &IV, Value *V) {
- if (!IV.markOverdefined()) return;
-
- DEBUG(dbgs() << "markOverdefined: ";
- if (auto *F = dyn_cast<Function>(V))
- dbgs() << "Function '" << F->getName() << "'\n";
- else
- dbgs() << *V << '\n');
+ bool markOverdefined(LatticeVal &IV, Value *V) {
+ if (!IV.markOverdefined()) return false;
+
+ LLVM_DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V)) dbgs()
+ << "Function '" << F->getName() << "'\n";
+ else dbgs() << *V << '\n');
// Only instructions go on the work list
pushToWorkList(IV, V);
+ return true;
}
- void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
if (IV.isOverdefined() || MergeWithV.isUnknown())
- return; // Noop.
+ return false; // Noop.
if (MergeWithV.isOverdefined())
return markOverdefined(IV, V);
if (IV.isUnknown())
return markConstant(IV, V, MergeWithV.getConstant());
if (IV.getConstant() != MergeWithV.getConstant())
return markOverdefined(IV, V);
+ return false;
}
- void mergeInValue(Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(Value *V, LatticeVal MergeWithV) {
assert(!V->getType()->isStructTy() &&
"non-structs should use markConstant");
- mergeInValue(ValueState[V], V, MergeWithV);
+ return mergeInValue(ValueState[V], V, MergeWithV);
}
/// getValueState - Return the LatticeVal object that corresponds to the
@@ -512,32 +529,27 @@ private:
/// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
/// work list if it is not already executable.
- void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
- return; // This edge is already known to be executable!
+ return false; // This edge is already known to be executable!
if (!MarkBlockExecutable(Dest)) {
// If the destination is already executable, we just made an *edge*
// feasible that wasn't before. Revisit the PHI nodes in the block
// because they have potentially new operands.
- DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
- << " -> " << Dest->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
- PHINode *PN;
- for (BasicBlock::iterator I = Dest->begin();
- (PN = dyn_cast<PHINode>(I)); ++I)
- visitPHINode(*PN);
+ for (PHINode &PN : Dest->phis())
+ visitPHINode(PN);
}
+ return true;
}
// getFeasibleSuccessors - Return a vector of booleans to indicate which
// successors are reachable from a given terminator instruction.
void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
- // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
- // block to the 'To' basic block is currently feasible.
- bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
-
// OperandChangedState - This method is invoked on all of the users of an
// instruction that was just changed state somehow. Based on this
// information, we need to update the specified user of this instruction.
@@ -594,7 +606,7 @@ private:
void visitInstruction(Instruction &I) {
// All the instructions we don't do any special handling for just
// go to overdefined.
- DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
markOverdefined(&I);
}
};
@@ -681,68 +693,17 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
- assert(BBExecutable.count(To) && "Dest should always be alive!");
-
- // Make sure the source basic block is executable!!
- if (!BBExecutable.count(From)) return false;
-
- // Check to make sure this edge itself is actually feasible now.
- TerminatorInst *TI = From->getTerminator();
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return true;
-
- LatticeVal BCValue = getValueState(BI->getCondition());
-
- // Overdefined condition variables mean the branch could go either way,
- // undef conditions mean that neither edge is feasible yet.
- ConstantInt *CI = BCValue.getConstantInt();
- if (!CI)
- return !BCValue.isUnknown();
-
- // Constant condition variables mean the branch can only go a single way.
- return BI->getSuccessor(CI->isZero()) == To;
- }
-
- // Unwinding instructions successors are always executable.
- if (TI->isExceptional())
- return true;
-
- if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (SI->getNumCases() < 1)
- return true;
-
- LatticeVal SCValue = getValueState(SI->getCondition());
- ConstantInt *CI = SCValue.getConstantInt();
-
- if (!CI)
- return !SCValue.isUnknown();
-
- return SI->findCaseValue(CI)->getCaseSuccessor() == To;
- }
-
- // In case of indirect branch and its address is a blockaddress, we mark
- // the target as executable.
- if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
- LatticeVal IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = IBRValue.getBlockAddress();
-
- if (!Addr)
- return !IBRValue.isUnknown();
-
- // At this point, the indirectbr is branching on a blockaddress.
- return Addr->getBasicBlock() == To;
- }
-
- DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
- llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+ // Check if we've called markEdgeExecutable on the edge yet. (We could
+ // be more aggressive and try to consider edges which haven't been marked
+ // yet, but there isn't any need.)
+ return KnownFeasibleEdges.count(Edge(From, To));
}
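
Since markEdgeExecutable now records every feasible edge up front, isEdgeFeasible degenerates to a set lookup. A reduced sketch of that tracker (placeholder Block/Edge types, not LLVM's):

#include <cassert>
#include <set>
#include <utility>

using Block = int;
using Edge = std::pair<Block, Block>;

struct EdgeTracker {
  std::set<Edge> Known;

  // Returns true only the first time an edge is marked, matching the
  // new changed/unchanged contract of markEdgeExecutable.
  bool markExecutable(Block From, Block To) {
    return Known.insert({From, To}).second;
  }

  // Pure membership test; no terminator inspection needed anymore.
  bool isFeasible(Block From, Block To) const {
    return Known.count({From, To}) != 0;
  }
};

int main() {
  EdgeTracker T;
  assert(T.markExecutable(0, 1));   // first marking reports a change
  assert(!T.markExecutable(0, 1));  // re-marking is a no-op
  assert(T.isFeasible(0, 1) && !T.isFeasible(1, 0));
}
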
// visit Implementations - Something changed in this instruction, either an
@@ -766,7 +727,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -774,7 +735,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
@@ -790,7 +751,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
continue;
if (IV.isOverdefined()) // PHI node becomes overdefined!
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (!OperandVal) { // Grab the first value.
OperandVal = IV.getConstant();
@@ -804,7 +765,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Check to see if there are two different constants merging, if so, the PHI
// node is overdefined.
if (IV.getConstant() != OperandVal)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
}
// If we exited the loop, this means that the PHI node only has constant
@@ -872,11 +833,11 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements over defined, we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
Value *AggVal = EVI.getAggregateOperand();
if (AggVal->getType()->isStructTy()) {
@@ -885,19 +846,19 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
mergeInValue(getValueState(&EVI), &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
}
}
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
auto *STy = dyn_cast<StructType>(IVI.getType());
if (!STy)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -926,7 +887,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -947,12 +908,12 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// select ?, C, C -> C.
if (TVal.isConstant() && FVal.isConstant() &&
TVal.getConstant() == FVal.getConstant())
- return markConstant(&I, FVal.getConstant());
+ return (void)markConstant(&I, FVal.getConstant());
if (TVal.isUnknown()) // select ?, undef, X -> X.
- return mergeInValue(&I, FVal);
+ return (void)mergeInValue(&I, FVal);
if (FVal.isUnknown()) // select ?, X, undef -> X.
- return mergeInValue(&I, TVal);
+ return (void)mergeInValue(&I, TVal);
markOverdefined(&I);
}
@@ -970,7 +931,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X op Y -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// If something is undef, wait for it to resolve.
@@ -983,7 +944,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// overdefined, and we can replace it with zero.
if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
if (V1State.isConstant() && V1State.getConstant()->isNullValue())
- return markConstant(IV, &I, V1State.getConstant());
+ return (void)markConstant(IV, &I, V1State.getConstant());
// If this is:
// -> AND/MUL with 0
@@ -1006,12 +967,12 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X and 0 = 0
// X * 0 = 0
if (NonOverdefVal->getConstant()->isNullValue())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
} else {
// X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
if (CI->isMinusOne())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
}
@@ -1021,22 +982,36 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Handle ICmpInst instruction.
void SCCPSolver::visitCmpInst(CmpInst &I) {
- LatticeVal V1State = getValueState(I.getOperand(0));
- LatticeVal V2State = getValueState(I.getOperand(1));
-
LatticeVal &IV = ValueState[&I];
if (IV.isOverdefined()) return;
- if (V1State.isConstant() && V2State.isConstant()) {
- Constant *C = ConstantExpr::getCompare(
- I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+ Value *Op1 = I.getOperand(0);
+ Value *Op2 = I.getOperand(1);
+
+ // For parameters, use ParamState which includes constant range info if
+ // available.
+ auto V1Param = ParamState.find(Op1);
+ ValueLatticeElement V1State = (V1Param != ParamState.end())
+ ? V1Param->second
+ : getValueState(Op1).toValueLattice();
+
+ auto V2Param = ParamState.find(Op2);
+ ValueLatticeElement V2State = V2Param != ParamState.end()
+ ? V2Param->second
+ : getValueState(Op2).toValueLattice();
+
+ Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
+ if (C) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ LatticeVal CV;
+ CV.markConstant(C);
+ mergeInValue(&I, CV);
+ return;
}
// If operands are still unknown, wait for it to resolve.
- if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant())
return;
markOverdefined(&I);
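
Consulting ParamState above means a compare can fold from a constant-range fact about an argument even when the plain lattice only says overdefined. A hedged stand-alone sketch of range-driven folding; Range and foldSLT are illustrative stand-ins, not ValueLatticeElement's API:

#include <cassert>
#include <optional>

// Half-open signed range [Lo, Hi), modelling a tracked argument.
struct Range { long Lo, Hi; };

// Fold `X <s C` when the range of X decides it either way.
std::optional<bool> foldSLT(Range X, long C) {
  if (X.Hi - 1 < C) return true;   // every element of [Lo, Hi) is < C
  if (X.Lo >= C) return false;     // no element of [Lo, Hi) is < C
  return std::nullopt;             // range straddles C: leave it alone
}

int main() {
  assert(*foldSLT({1, 10}, 10) == true);    // always true
  assert(*foldSLT({5, 10}, 3) == false);    // always false
  assert(!foldSLT({1, 10}, 5).has_value()); // undecidable from the range
}
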
@@ -1056,7 +1031,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
@@ -1094,7 +1069,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1103,13 +1078,17 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (IV.isOverdefined()) return;
if (!PtrVal.isConstant() || I.isVolatile())
- return markOverdefined(IV, &I);
+ return (void)markOverdefined(IV, &I);
Constant *Ptr = PtrVal.getConstant();
// load null is undefined.
- if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
- return;
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
+ return;
+ }
// Transform load (constant global) into the value loaded.
if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
@@ -1128,7 +1107,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// Otherwise we cannot say for certain what value this load will produce.
@@ -1160,7 +1139,7 @@ CallOverdefined:
if (State.isUnknown())
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(I);
+ return (void)markOverdefined(I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
}
@@ -1174,12 +1153,12 @@ CallOverdefined:
// call -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(I, C);
+ return (void)markConstant(I, C);
}
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markOverdefined(I);
+ return (void)markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1207,8 +1186,16 @@ CallOverdefined:
} else {
// Most other parts of the Solver still only use the simpler value
// lattice, so we propagate changes for parameters to both lattices.
- getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL);
- mergeInValue(&*AI, getValueState(*CAI));
+ LatticeVal ConcreteArgument = getValueState(*CAI);
+ bool ParamChanged =
+ getParamState(&*AI).mergeIn(ConcreteArgument.toValueLattice(), DL);
+ bool ValueChanged = mergeInValue(&*AI, ConcreteArgument);
+ // Add the argument to the work list if the state of a parameter changes
+ // but ValueState does not (because it is already overdefined there). We
+ // have to take changes in ParamState into account, as ParamState is used
+ // when evaluating Cmp instructions.
+ if (!ValueChanged && ParamChanged)
+ pushToWorkList(ValueState[&*AI], &*AI);
}
}
}
@@ -1242,7 +1229,7 @@ void SCCPSolver::Solve() {
while (!OverdefinedInstWorkList.empty()) {
Value *I = OverdefinedInstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
// "I" got into the work list because it either made the transition from
// bottom to constant, or to overdefined.
@@ -1260,7 +1247,7 @@ void SCCPSolver::Solve() {
while (!InstWorkList.empty()) {
Value *I = InstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
// "I" got into the work list because it made the transition from undef to
// constant.
@@ -1280,7 +1267,7 @@ void SCCPSolver::Solve() {
BasicBlock *BB = BBWorkList.back();
BBWorkList.pop_back();
- DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
// Notify all instructions in this basic block that they are newly
// executable.
@@ -1501,7 +1488,11 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
break;
case Instruction::ICmp:
// X == undef -> undef. Other comparisons get more complicated.
- if (cast<ICmpInst>(&I)->isEquality())
+ Op0LV = getValueState(I.getOperand(0));
+ Op1LV = getValueState(I.getOperand(1));
+
+ if ((Op0LV.isUnknown() || Op1LV.isUnknown()) &&
+ cast<ICmpInst>(&I)->isEquality())
break;
markOverdefined(&I);
return true;
@@ -1546,11 +1537,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to false.
- markForcedConstant(BI->getCondition(),
- ConstantInt::getFalse(TI->getContext()));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
@@ -1571,11 +1565,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to the first successor.
- markForcedConstant(IBR->getAddress(),
- BlockAddress::get(IBR->getSuccessor(0)));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
+ // we can assume the branch has undefined behavior instead.
+ BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
@@ -1590,56 +1588,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
- return true;
- }
- }
-
- return false;
-}
-
-static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) {
- bool Changed = false;
-
- // Currently we only use range information for integer values.
- if (!V->getType()->isIntegerTy())
- return false;
-
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
- if (!IV.isConstantRange())
- return false;
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
- for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) {
- const Use &U = *UI++;
- auto *Icmp = dyn_cast<ICmpInst>(U.getUser());
- if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent()))
continue;
-
- auto getIcmpLatticeValue = [&](Value *Op) {
- if (auto *C = dyn_cast<Constant>(Op))
- return ValueLatticeElement::get(C);
- return Solver.getLatticeValueFor(Op);
- };
-
- ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0));
- ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1));
-
- Constant *C = nullptr;
- if (A.satisfiesPredicate(Icmp->getPredicate(), B))
- C = ConstantInt::getTrue(Icmp->getType());
- else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B))
- C = ConstantInt::getFalse(Icmp->getType());
-
- if (C) {
- Icmp->replaceAllUsesWith(C);
- DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C
- << ", because of range information " << A << " " << B
- << "\n");
- Icmp->eraseFromParent();
- Changed = true;
}
}
- return Changed;
+
+ return false;
}
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
@@ -1659,22 +1620,31 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
}
Const = ConstantStruct::get(ST, ConstVals);
} else {
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ const LatticeVal &IV = Solver.getLatticeValueFor(V);
if (IV.isOverdefined())
return false;
- if (IV.isConstantRange()) {
- if (IV.getConstantRange().isSingleElement())
- Const =
- ConstantInt::get(V->getType(), IV.asConstantInteger().getValue());
- else
- return false;
- } else
- Const =
- IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
}
assert(Const && "Constant is nullptr here!");
- DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+
+ // Replacing the result of a `musttail` call with a constant breaks the
+ // `musttail` invariant unless the call itself can be removed.
+ CallInst *CI = dyn_cast<CallInst>(V);
+ if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
+ CallSite CS(CI);
+ Function *F = CS.getCalledFunction();
+
+ // Don't zap returns of the callee
+ if (F)
+ Solver.AddMustTailCallee(F);
+
+ LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
+ << " as a constant\n");
+ return false;
+ }
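
For orientation, a toy predicate mirroring the guard above: a musttail call's result may only be folded to a constant when the call itself can be deleted, since musttail pins the call's result to the enclosing function's return value. Types are placeholders, not LLVM's:

#include <cassert>

struct CallSiteInfo {
  bool IsMustTail;      // call is marked musttail
  bool IsSafeToRemove;  // call could be erased outright
};

bool canFoldResultToConstant(const CallSiteInfo &CS) {
  return !CS.IsMustTail || CS.IsSafeToRemove;
}

int main() {
  assert(!canFoldResultToConstant({true, false}));  // keep the musttail call
  assert(canFoldResultToConstant({false, false}));  // ordinary call is fine
  assert(canFoldResultToConstant({true, true}));    // removable musttail
}
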
+
+ LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
// Replaces all of the uses of a variable with uses of the constant.
V->replaceAllUsesWith(Const);
@@ -1685,7 +1655,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
// and return true if the function was modified.
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
SCCPSolver Solver(DL, TLI);
// Mark the first block of the function as being executable.
@@ -1699,7 +1669,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
bool ResolvedUndefs = true;
while (ResolvedUndefs) {
Solver.Solve();
- DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
ResolvedUndefs = Solver.ResolvedUndefsIn(F);
}
@@ -1711,7 +1681,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
for (BasicBlock &BB : F) {
if (!Solver.isBlockExecutable(&BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
++NumDeadBlocks;
NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
@@ -1748,6 +1718,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
@@ -1770,6 +1741,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
// runOnFunction - Run the Sparse Conditional Constant Propagation
@@ -1804,14 +1776,30 @@ static void findReturnsToZap(Function &F,
if (!Solver.isArgumentTrackedFunction(&F))
return;
- for (BasicBlock &BB : F)
+ // There is a non-removable musttail call site of this function. Zapping
+ // returns is not allowed.
+ if (Solver.isMustTailCallee(&F)) {
+ LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
+ << " due to present musttail call of it\n");
+ return;
+ }
+
+ for (BasicBlock &BB : F) {
+ if (CallInst *CI = BB.getTerminatingMustTailCall()) {
+ LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
+ << "musttail call : " << *CI << "\n");
+ (void)CI;
+ return;
+ }
+
if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
if (!isa<UndefValue>(RI->getOperand(0)))
ReturnsToZap.push_back(RI);
+ }
}
-static bool runIPSCCP(Module &M, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+bool llvm::runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
// Loop over all functions, marking arguments to those with their addresses
@@ -1851,13 +1839,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Solve for constants.
bool ResolvedUndefs = true;
+ Solver.Solve();
while (ResolvedUndefs) {
- Solver.Solve();
-
- DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
ResolvedUndefs = false;
for (Function &F : M)
- ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
+ if (Solver.ResolvedUndefsIn(F)) {
+ // We run Solve() after we resolved an undef in a function, because
+ // we might deduce a fact that eliminates an undef in another function.
+ Solver.Solve();
+ ResolvedUndefs = true;
+ }
}
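
The reworked loop solves once up front and then re-solves after every successful undef resolution, because a fact forced in one function can eliminate an undef in another. Its control shape, sketched with placeholder callables:

#include <cassert>
#include <functional>
#include <vector>

// Solve() runs lattice propagation to a fixpoint; each entry of
// ResolvePerFn tries to force one undef in one function and reports
// whether it did. Names and types are illustrative.
void drive(const std::function<void()> &Solve,
           const std::vector<std::function<bool()>> &ResolvePerFn) {
  Solve();
  bool Resolved = true;
  while (Resolved) {
    Resolved = false;
    for (const auto &Resolve : ResolvePerFn)
      if (Resolve()) {
        Solve();         // propagate the new fact across all functions
        Resolved = true;
      }
  }
}

int main() {
  int Undefs = 2, Solves = 0;
  drive([&] { ++Solves; },
        {[&] { return Undefs-- > 0; }});
  assert(Undefs < 0 && Solves == 3);  // initial solve + one per resolution
}
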
bool MadeChanges = false;
@@ -1877,18 +1869,12 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
++IPNumArgsElimed;
continue;
}
-
- if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI))
- ++IPNumRangeInfoUsed;
}
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
-
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumDeadBlocks;
- NumInstRemoved +=
- changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
MadeChanges = true;
@@ -1902,7 +1888,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (Inst->getType()->isVoidTy())
continue;
if (tryToReplaceWithConstant(Solver, Inst)) {
- if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ if (Inst->isSafeToRemove())
Inst->eraseFromParent();
// Hey, we just changed something!
MadeChanges = true;
@@ -1911,6 +1897,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
}
}
+ // Change dead blocks to unreachable. We do it after replacing constants in
+ // all executable blocks, because changeToUnreachable may remove PHI nodes
+ // in executable blocks we found values for. The function's entry block is
+ // not part of BlocksToErase, so we have to handle it separately.
+ for (BasicBlock *BB : BlocksToErase)
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+ if (!Solver.isBlockExecutable(&F.front()))
+ NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
+ /*UseLLVMTrap=*/false);
+
// Now that all instructions in the function are constant folded, erase dead
// blocks, because we can now use ConstantFoldTerminator to get rid of
// in-edges.
@@ -1930,31 +1927,33 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
bool Folded = ConstantFoldTerminator(I->getParent());
if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ // If the branch can't be folded, we must have forced an edge
+ // for an indeterminate value. Force the terminator to fold
+ // to that edge.
+ Constant *C;
+ BasicBlock *Dest;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Dest = SI->case_begin()->getCaseSuccessor();
+ C = SI->case_begin()->getCaseValue();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ Dest = BI->getSuccessor(1);
+ C = ConstantInt::getFalse(BI->getContext());
+ } else if (IndirectBrInst *IBR = dyn_cast<IndirectBrInst>(I)) {
+ Dest = IBR->getSuccessor(0);
+ C = BlockAddress::get(IBR->getSuccessor(0));
} else {
- llvm_unreachable("Didn't fold away reference to block!");
+ llvm_unreachable("Unexpected terminator instruction");
}
-#endif
+ assert(Solver.isEdgeFeasible(I->getParent(), Dest) &&
+ "Didn't find feasible edge?");
+ (void)Dest;
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
-
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
+ I->setOperand(0, C);
+ Folded = ConstantFoldTerminator(I->getParent());
}
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void)Folded;
}
// Finally, delete the basic block.
@@ -2005,7 +2004,8 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
GlobalVariable *GV = I->first;
assert(!I->second.isOverdefined() &&
"Overdefined values should have been taken out of the map!");
- DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+ LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
+ << "' is constant!\n");
while (!GV->use_empty()) {
StoreInst *SI = cast<StoreInst>(GV->user_back());
SI->eraseFromParent();
@@ -2016,55 +2016,3 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
return MadeChanges;
}
-
-PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
- const DataLayout &DL = M.getDataLayout();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
- if (!runIPSCCP(M, DL, &TLI))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// IPSCCP Class - This class implements interprocedural Sparse Conditional
-/// Constant Propagation.
-///
-class IPSCCPLegacyPass : public ModulePass {
-public:
- static char ID;
-
- IPSCCPLegacyPass() : ModulePass(ID) {
- initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- const DataLayout &DL = M.getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return runIPSCCP(M, DL, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char IPSCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index bfe3754f0769..6c3f012c6280 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -42,6 +42,8 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
@@ -79,7 +81,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -124,14 +125,9 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
cl::Hidden);
-/// Hidden option to allow more aggressive splitting.
-static cl::opt<bool>
-SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices",
- cl::init(false), cl::Hidden);
-
namespace {
-/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
@@ -151,23 +147,23 @@ protected:
}
};
-/// \brief Provide a type for IRBuilder that drops names in release builds.
+/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-/// \brief A used slice of an alloca.
+/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
- /// \brief The beginning offset of the range.
+ /// The beginning offset of the range.
uint64_t BeginOffset = 0;
- /// \brief The ending offset, not included in the range.
+ /// The ending offset, not included in the range.
uint64_t EndOffset = 0;
- /// \brief Storage for both the use of this slice and whether it can be
+ /// Storage for both the use of this slice and whether it can be
/// split.
PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
@@ -189,7 +185,7 @@ public:
bool isDead() const { return getUse() == nullptr; }
void kill() { UseAndIsSplittable.setPointer(nullptr); }
- /// \brief Support for ordering ranges.
+ /// Support for ordering ranges.
///
/// This provides an ordering over ranges such that start offsets are
/// always increasing, and within equal start offsets, the end offsets are
@@ -207,7 +203,7 @@ public:
return false;
}
- /// \brief Support comparison with a single offset to allow binary searches.
+ /// Support comparison with a single offset to allow binary searches.
friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
uint64_t RHSOffset) {
return LHS.beginOffset() < RHSOffset;
@@ -233,7 +229,7 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
} // end namespace llvm
-/// \brief Representation of the alloca slices.
+/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
@@ -242,16 +238,16 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
/// starting at a particular offset before splittable slices.
class llvm::sroa::AllocaSlices {
public:
- /// \brief Construct the slices of a particular alloca.
+ /// Construct the slices of a particular alloca.
AllocaSlices(const DataLayout &DL, AllocaInst &AI);
- /// \brief Test whether a pointer to the allocation escapes our analysis.
+ /// Test whether a pointer to the allocation escapes our analysis.
///
/// If this is true, the slices are never fully built and should be
/// ignored.
bool isEscaped() const { return PointerEscapingInstr; }
- /// \brief Support for iterating over the slices.
+ /// Support for iterating over the slices.
/// @{
using iterator = SmallVectorImpl<Slice>::iterator;
using range = iterator_range<iterator>;
@@ -266,10 +262,10 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
- /// \brief Erase a range of slices.
+ /// Erase a range of slices.
void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
- /// \brief Insert new slices for this alloca.
+ /// Insert new slices for this alloca.
///
/// This moves the slices into the alloca's slices collection, and re-sorts
/// everything so that the usual ordering properties of the alloca's slices
@@ -278,7 +274,7 @@ public:
int OldSize = Slices.size();
Slices.append(NewSlices.begin(), NewSlices.end());
auto SliceI = Slices.begin() + OldSize;
- std::sort(SliceI, Slices.end());
+ llvm::sort(SliceI, Slices.end());
std::inplace_merge(Slices.begin(), SliceI, Slices.end());
}
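
AllocaSlices::insert keeps the slice vector sorted with the append/sort-tail/inplace_merge idiom, which only pays k log k for the k new elements plus a linear merge. The same idiom on plain integers:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

void sortedAppend(std::vector<int> &V, std::vector<int> New) {
  std::size_t OldSize = V.size();                // V is already sorted
  V.insert(V.end(), New.begin(), New.end());     // append unsorted tail
  std::sort(V.begin() + OldSize, V.end());       // sort only the tail
  std::inplace_merge(V.begin(), V.begin() + OldSize, V.end());
}

int main() {
  std::vector<int> V = {1, 4, 9};
  sortedAppend(V, {7, 2});
  assert(std::is_sorted(V.begin(), V.end()) && V.size() == 5);
}
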
@@ -287,10 +283,10 @@ public:
class partition_iterator;
iterator_range<partition_iterator> partitions();
- /// \brief Access the dead users for this alloca.
+ /// Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
- /// \brief Access the dead operands referring to this alloca.
+ /// Access the dead operands referring to this alloca.
///
/// These are operands which cannot actually be used to refer to the
/// alloca as they are outside its range and the user doesn't correct for
@@ -316,11 +312,11 @@ private:
friend class AllocaSlices::SliceBuilder;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// \brief Handle to alloca instruction to simplify method interfaces.
+ /// Handle to alloca instruction to simplify method interfaces.
AllocaInst &AI;
#endif
- /// \brief The instruction responsible for this alloca not having a known set
+ /// The instruction responsible for this alloca not having a known set
/// of slices.
///
/// When an instruction (potentially) escapes the pointer to the alloca, we
@@ -328,7 +324,7 @@ private:
/// alloca. This will be null if the alloca slices are analyzed successfully.
Instruction *PointerEscapingInstr;
- /// \brief The slices of the alloca.
+ /// The slices of the alloca.
///
/// We store a vector of the slices formed by uses of the alloca here. This
/// vector is sorted by increasing begin offset, and then the unsplittable
@@ -336,7 +332,7 @@ private:
/// details.
SmallVector<Slice, 8> Slices;
- /// \brief Instructions which will become dead if we rewrite the alloca.
+ /// Instructions which will become dead if we rewrite the alloca.
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
@@ -344,7 +340,7 @@ private:
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
- /// \brief Operands which will become dead if we rewrite the alloca.
+ /// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
/// undef when we rewrite the alloca. These show up in out-of-bounds inputs
@@ -355,7 +351,7 @@ private:
SmallVector<Use *, 8> DeadOperands;
};
-/// \brief A partition of the slices.
+/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
@@ -371,32 +367,32 @@ private:
using iterator = AllocaSlices::iterator;
- /// \brief The beginning and ending offsets of the alloca for this
+ /// The beginning and ending offsets of the alloca for this
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start and end iterators of this partition.
+ /// The start and end iterators of this partition.
iterator SI, SJ;
- /// \brief A collection of split slice tails overlapping the partition.
+ /// A collection of split slice tails overlapping the partition.
SmallVector<Slice *, 4> SplitTails;
- /// \brief Raw constructor builds an empty partition starting and ending at
+ /// Raw constructor builds an empty partition starting and ending at
/// the given iterator.
Partition(iterator SI) : SI(SI), SJ(SI) {}
public:
- /// \brief The start offset of this partition.
+ /// The start offset of this partition.
///
/// All of the contained slices start at or after this offset.
uint64_t beginOffset() const { return BeginOffset; }
- /// \brief The end offset of this partition.
+ /// The end offset of this partition.
///
/// All of the contained slices end at or before this offset.
uint64_t endOffset() const { return EndOffset; }
- /// \brief The size of the partition.
+ /// The size of the partition.
///
/// Note that this can never be zero.
uint64_t size() const {
@@ -404,7 +400,7 @@ public:
return EndOffset - BeginOffset;
}
- /// \brief Test whether this partition contains no slices, and merely spans
+ /// Test whether this partition contains no slices, and merely spans
/// a region occupied by split slices.
bool empty() const { return SI == SJ; }
@@ -421,7 +417,7 @@ public:
iterator end() const { return SJ; }
/// @}
- /// \brief Get the sequence of split slice tails.
+ /// Get the sequence of split slice tails.
///
/// These tails are of slices which start before this partition but are
/// split and overlap into the partition. We accumulate these while forming
@@ -429,7 +425,7 @@ public:
ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
-/// \brief An iterator over partitions of the alloca's slices.
+/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
@@ -443,18 +439,18 @@ class AllocaSlices::partition_iterator
Partition> {
friend class AllocaSlices;
- /// \brief Most of the state for walking the partitions is held in a class
+ /// Most of the state for walking the partitions is held in a class
/// with a nice interface for examining them.
Partition P;
- /// \brief We need to keep the end of the slices to know when to stop.
+ /// We need to keep the end of the slices to know when to stop.
AllocaSlices::iterator SE;
- /// \brief We also need to keep track of the maximum split end offset seen.
+ /// We also need to keep track of the maximum split end offset seen.
/// FIXME: Do we really?
uint64_t MaxSplitSliceEndOffset = 0;
- /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// Sets the partition to be empty at given iterator, and sets the
/// end iterator.
partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
: P(SI), SE(SE) {
@@ -464,7 +460,7 @@ class AllocaSlices::partition_iterator
advance();
}
- /// \brief Advance the iterator to the next partition.
+ /// Advance the iterator to the next partition.
///
/// Requires that the iterator not be at the end of the slices.
void advance() {
@@ -619,7 +615,7 @@ public:
Partition &operator*() { return P; }
};
-/// \brief A forward range over the partitions of the alloca's slices.
+/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
@@ -643,7 +639,7 @@ static Value *foldSelectInst(SelectInst &SI) {
return nullptr;
}
-/// \brief A helper that folds a PHI node or a select.
+/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
// If PN merges together the same value, return that value.
@@ -652,7 +648,7 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
-/// \brief Builder for the alloca slices.
+/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
@@ -668,7 +664,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
- /// \brief Set to de-duplicate dead instructions found in the use walk.
+ /// Set to de-duplicate dead instructions found in the use walk.
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
@@ -687,11 +683,12 @@ private:
// Completely skip uses which have a zero size or start either before or
// past the end of the allocation.
if (Size == 0 || Offset.uge(AllocSize)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
return markAsDead(I);
}
@@ -706,10 +703,11 @@ private:
// them, and so have to record at least the information here.
assert(AllocSize >= BeginOffset); // Established above.
if (Size > AllocSize - BeginOffset) {
- DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
- << " to remain within the " << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
EndOffset = AllocSize;
}
@@ -802,18 +800,18 @@ private:
uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
- // bounds of of the allocation, it's behavior is undefined, so simply
+ // bounds of the allocation, its behavior is undefined, so simply
// ignore it. Note that this is more strict than the generic clamping
// behavior of insertUse. We also try to handle cases which might run the
// risk of overflow.
// FIXME: We should instead consider the pointer to have escaped if this
// function is being instrumented for addressing bugs or race conditions.
if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
- << " which extends past the end of the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
return markAsDead(SI);
}
@@ -1027,7 +1025,7 @@ private:
void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
- /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ /// Disable SROA entirely if there are unhandled users of the alloca.
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
@@ -1062,7 +1060,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- std::sort(Slices.begin(), Slices.end());
+ llvm::sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1240,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
}
static void speculatePHINodeLoads(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
IRBuilderTy PHIBuilder(&PN);
@@ -1263,10 +1261,21 @@ static void speculatePHINodeLoads(PHINode &PN) {
}
// Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock *, Value *> InjectedLoads;
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
BasicBlock *Pred = PN.getIncomingBlock(Idx);
- TerminatorInst *TI = Pred->getTerminator();
Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value *V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ TerminatorInst *TI = Pred->getTerminator();
IRBuilderTy PredBuilder(TI);
LoadInst *Load = PredBuilder.CreateLoad(
@@ -1276,9 +1285,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
if (AATags)
Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
}
- DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
PN.eraseFromParent();
}
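
InjectedLoads exists because a PHI may legitimately list the same predecessor block more than once (for instance, several switch cases jumping to one block), and every duplicated entry must carry the same incoming value. A reduced model of that one-load-per-predecessor cache (illustrative types):

#include <cassert>
#include <map>
#include <string>

using Block = std::string;

struct PhiLoadCache {
  std::map<Block, int> Injected;  // predecessor -> materialized load id
  int NextId = 0;

  // Emits a "load" only the first time a predecessor is seen; duplicated
  // PHI entries for that predecessor reuse the cached value.
  int loadFor(const Block &Pred) {
    auto [It, Inserted] = Injected.try_emplace(Pred, NextId);
    if (Inserted)
      ++NextId;
    return It->second;
  }
};

int main() {
  PhiLoadCache C;
  assert(C.loadFor("bb1") == C.loadFor("bb1"));  // duplicates share a load
  assert(C.loadFor("bb2") != C.loadFor("bb1"));
}
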
@@ -1318,7 +1328,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
}
static void speculateSelectInstLoads(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
IRBuilderTy IRB(&SI);
Value *TV = SI.getTrueValue();
@@ -1349,14 +1359,14 @@ static void speculateSelectInstLoads(SelectInst &SI) {
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
LI->getName() + ".sroa.speculated");
- DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
}
SI.eraseFromParent();
}
-/// \brief Build a GEP out of a base pointer and indices.
+/// Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
@@ -1374,7 +1384,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
NamePrefix + "sroa_idx");
}
-/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
@@ -1423,7 +1433,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
-/// \brief Recursively compute indices for a natural GEP.
+/// Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
@@ -1491,7 +1501,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
@@ -1526,7 +1536,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
@@ -1635,7 +1645,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
-/// \brief Compute the adjusted alignment for a load or store from an offset.
+/// Compute the adjusted alignment for a load or store from an offset.
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
const DataLayout &DL) {
unsigned Alignment;
@@ -1656,7 +1666,7 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
return MinAlign(Alignment, Offset);
}
-/// \brief Test whether we can convert a value from the old to the new type.
+/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
@@ -1707,7 +1717,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return true;
}
-/// \brief Generic routine to convert an SSA value to a value of a different
+/// Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
@@ -1759,7 +1769,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
return IRB.CreateBitCast(V, NewTy);
}
-/// \brief Test whether the given slice use can be promoted to a vector.
+/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
@@ -1830,7 +1840,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partitioning and range of slices can be
+/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
@@ -1896,7 +1906,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
"All non-integer types eliminated!");
return RHSTy->getNumElements() < LHSTy->getNumElements();
};
- std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ llvm::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
CandidateTys.erase(
std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
CandidateTys.end());
@@ -1943,7 +1953,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
return nullptr;
}
-/// \brief Test whether a slice of an alloca is valid for integer widening.
+/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
@@ -1970,6 +1980,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle loads that extend past the allocated memory.
if (DL.getTypeStoreSize(LI->getType()) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -1991,6 +2005,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle stores that extend past the allocated memory.
if (DL.getTypeStoreSize(ValueTy) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -2021,7 +2039,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partition's integer operations can be
+/// Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
@@ -2072,7 +2090,7 @@ static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *Ty, uint64_t Offset,
const Twine &Name) {
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
@@ -2081,13 +2099,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot extract to a larger integer!");
if (Ty != IntTy) {
V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- DEBUG(dbgs() << " trunced: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
}
return V;
}
@@ -2098,10 +2116,10 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
IntegerType *Ty = cast<IntegerType>(V->getType());
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot insert a larger integer!");
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
if (Ty != IntTy) {
V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- DEBUG(dbgs() << " extended: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
@@ -2110,15 +2128,15 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- DEBUG(dbgs() << " masked: " << *Old << "\n");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
V = IRB.CreateOr(Old, V, Name + ".insert");
- DEBUG(dbgs() << " inserted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
}
return V;
}
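
The shift/mask sequence in insertInteger (with extractInteger as its inverse) is easiest to check on concrete bytes. A self-contained little-endian model with a fixed 32-bit container; purely illustrative, since the real code derives widths and the big-endian shift adjustment from the DataLayout:

#include <cassert>
#include <cstdint>

// Insert byte V into Old at ByteOffset: zext + shl + and-mask + or,
// mirroring insertInteger's IRBuilder calls.
uint32_t insertByte(uint32_t Old, uint8_t V, unsigned ByteOffset) {
  unsigned ShAmt = 8 * ByteOffset;
  uint32_t Mask = ~(uint32_t(0xFF) << ShAmt);    // clear the target byte
  return (Old & Mask) | (uint32_t(V) << ShAmt);  // splice the new byte in
}

// Extract the byte at ByteOffset: lshr + trunc, as in extractInteger.
uint8_t extractByte(uint32_t V, unsigned ByteOffset) {
  return uint8_t(V >> (8 * ByteOffset));
}

int main() {
  uint32_t W = insertByte(0xAABBCCDD, 0x11, 1);
  assert(W == 0xAABB11DD);
  assert(extractByte(W, 1) == 0x11);
}
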
@@ -2135,7 +2153,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
if (NumElements == 1) {
V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
Name + ".extract");
- DEBUG(dbgs() << " extract: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
return V;
}
@@ -2145,7 +2163,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".extract");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
@@ -2159,7 +2177,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
@@ -2184,7 +2202,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".expand");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
@@ -2192,11 +2210,11 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
- DEBUG(dbgs() << " blend: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
}
-/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
@@ -2295,9 +2313,9 @@ public:
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- DEBUG(AS.printSlice(dbgs(), I, ""));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -2327,7 +2345,7 @@ private:
// Every instruction which can end up as a user must have a rewrite rule.
bool visitInstruction(Instruction &I) {
- DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
llvm_unreachable("No rewrite rule for this instruction!");
}
@@ -2369,7 +2387,7 @@ private:
);
}
- /// \brief Compute suitable alignment to access this slice of the *new*
+ /// Compute suitable alignment to access this slice of the *new*
/// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
@@ -2431,10 +2449,13 @@ private:
}
bool visitLoadInst(LoadInst &LI) {
- DEBUG(dbgs() << " original: " << LI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
unsigned AS = LI.getPointerAddressSpace();
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
@@ -2453,6 +2474,8 @@ private:
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2488,6 +2511,8 @@ private:
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2524,11 +2549,12 @@ private:
Pass.DeadInsts.insert(&LI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
return !LI.isVolatile() && !IsPtrAdjusted;
}
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
if (V->getType() != VecTy) {
unsigned BeginIndex = getIndex(NewBeginOffset);
unsigned EndIndex = getIndex(NewEndOffset);
@@ -2546,14 +2572,15 @@ private:
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
- bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
@@ -2567,16 +2594,21 @@ private:
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
bool visitStoreInst(StoreInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
Value *V = SI.getValueOperand();
// Strip all inbounds GEPs and pointer casts to try to dig out any root
@@ -2598,9 +2630,9 @@ private:
}
if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp);
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI);
+ return rewriteIntegerStore(V, SI, AATags);
const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
StoreInst *NewSI;
@@ -2631,16 +2663,18 @@ private:
SI.isVolatile());
}
NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
}
- /// \brief Compute an integer value from splatting an i8 across the given
+ /// Compute an integer value from splatting an i8 across the given
/// number of bytes.
///
/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
@@ -2667,25 +2701,27 @@ private:
return V;
}
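
Where a byte really is an i8, the integer splat described above is equivalent to multiplying the byte by a repeated-0x01 constant. A small host-side sketch of that identity, with a hypothetical helper rather than the SROA routine itself:

    #include <cstdint>

    // Replicate byte B across NumBytes bytes of the result (NumBytes <= 8).
    uint64_t splatByte(uint8_t B, unsigned NumBytes) {
      uint64_t Ones = (NumBytes < 8 ? (1ULL << (8 * NumBytes)) : 0) - 1;
      return (uint64_t)B * (Ones / 0xFF);   // e.g. B=0xAB, 4 -> 0xABABABAB
    }
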
- /// \brief Compute a vector splat for a given element value.
+ /// Compute a vector splat for a given element value.
Value *getVectorSplat(Value *V, unsigned NumElements) {
V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- DEBUG(dbgs() << " splat: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
return V;
}
bool visitMemSetInst(MemSetInst &II) {
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getRawDest() == OldPtr);
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
// If the memset has a variable size, it cannot be split, just adjust the
// pointer to the new alloca.
if (!isa<Constant>(II.getLength())) {
assert(!IsSplit);
assert(NewBeginOffset == BeginOffset);
II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+ II.setDestAlignment(getSliceAlign());
deleteIfTriviallyDead(OldPtr);
return false;
@@ -2710,8 +2746,9 @@ private:
CallInst *New = IRB.CreateMemSet(
getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
getSliceAlign(), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
@@ -2773,10 +2810,11 @@ private:
V = convertValue(DL, IRB, V, AllocaTy);
}
- Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return !II.isVolatile();
}
@@ -2784,7 +2822,10 @@ private:
// Rewriting of memory transfer instructions can be a bit tricky. We break
// them into two categories: split intrinsics and unsplit intrinsics.
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
bool IsDest = &II.getRawDestUse() == OldUse;
assert((IsDest && II.getRawDest() == OldPtr) ||
@@ -2801,18 +2842,16 @@ private:
// update both source and dest of a single call.
if (!IsSplittable) {
Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest)
+ if (IsDest) {
II.setDest(AdjustedPtr);
- else
+ II.setDestAlignment(SliceAlign);
+ } else {
II.setSource(AdjustedPtr);
-
- if (II.getAlignment() > SliceAlign) {
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(
- ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ II.setSourceAlignment(SliceAlign);
}
- DEBUG(dbgs() << " to: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
deleteIfTriviallyDead(OldPtr);
return false;
}
@@ -2862,8 +2901,10 @@ private:
// Compute the relative offset for the other pointer within the transfer.
unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
- unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
- OtherOffset.zextOrTrunc(64).getZExtValue());
+ unsigned OtherAlign =
+ IsDest ? II.getSourceAlignment() : II.getDestAlignment();
+ OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
if (EmitMemCpy) {
// Compute the other pointer, folding as much as possible to produce
@@ -2875,11 +2916,25 @@ private:
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemCpy(
- IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
- MinAlign(SliceAlign, OtherAlign), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ Value *DestPtr, *SrcPtr;
+ unsigned DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
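
The rewritten call above uses the IRBuilder overload introduced when the memory intrinsics gained separate destination and source alignments. As a hedged sketch of the old-to-new mapping, reusing the names already in scope in this function:

    // Pre-change, a single alignment argument conservatively clamped both sides:
    //   IRB.CreateMemCpy(DestPtr, SrcPtr, Size, MinAlign(DestAlign, SrcAlign),
    //                    II.isVolatile());
    // The two-alignment overload keeps each operand's own guarantee:
    CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
                                     Size, II.isVolatile());
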
@@ -2927,8 +2982,11 @@ private:
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src =
- IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ LoadInst *Load = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
+ "copyload");
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ Src = Load;
}
if (VecTy && !IsWholeAlloca && IsDest) {
@@ -2946,15 +3004,16 @@ private:
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return !II.isVolatile();
}
bool visitIntrinsicInst(IntrinsicInst &II) {
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
@@ -2982,13 +3041,13 @@ private:
New = IRB.CreateLifetimeEnd(Ptr, Size);
(void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return true;
}
bool visitPHINode(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
@@ -3007,7 +3066,7 @@ private:
// Replace the operands which were using the old pointer.
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
- DEBUG(dbgs() << " to: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
deleteIfTriviallyDead(OldPtr);
// PHIs can't be promoted on their own, but often can be speculated. We
@@ -3018,7 +3077,7 @@ private:
}
bool visitSelectInst(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
"Pointer isn't an operand!");
assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
@@ -3031,7 +3090,7 @@ private:
if (SI.getOperand(2) == OldPtr)
SI.setOperand(2, NewPtr);
- DEBUG(dbgs() << " to: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
deleteIfTriviallyDead(OldPtr);
// Selects can't be promoted on their own, but often can be speculated. We
@@ -3044,7 +3103,7 @@ private:
namespace {
-/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
@@ -3067,7 +3126,7 @@ public:
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
bool rewrite(Instruction &I) {
- DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
enqueueUsers(I);
bool Changed = false;
while (!Queue.empty()) {
@@ -3089,7 +3148,7 @@ private:
// Conservative default is to not rewrite anything.
bool visitInstruction(Instruction &I) { return false; }
- /// \brief Generic recursive split emission class.
+ /// Generic recursive split emission class.
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
@@ -3113,7 +3172,7 @@ private:
: IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
- /// \brief Generic recursive split emission routine.
+ /// Generic recursive split emission routine.
///
/// This method recursively splits an aggregate op (load or store) into
/// scalar or vector ops. It splits recursively until it hits a single value
@@ -3165,8 +3224,10 @@ private:
};
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
@@ -3175,9 +3236,11 @@ private:
// Load the single value and insert it using the indices.
Value *GEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load");
+ if (AATags)
+ Load->setAAMetadata(AATags);
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- DEBUG(dbgs() << " to: " << *Load << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
}
};
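
The split emission described above bottoms out at non-aggregate leaves. Stripped of the GEP-index bookkeeping and AA-tag plumbing, the descent is a plain structural recursion over the aggregate type; a skeletal sketch under that simplification, where emitLeaf is a hypothetical stand-in for the derived class's leaf emission:

    void splitAggregateOp(Type *Ty, SmallVectorImpl<unsigned> &Indices) {
      if (Ty->isSingleValueType())
        return emitLeaf(Ty, Indices);    // one scalar/vector load or store
      if (auto *ATy = dyn_cast<ArrayType>(Ty)) {
        for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
          Indices.push_back(i);
          splitAggregateOp(ATy->getElementType(), Indices);
          Indices.pop_back();
        }
      } else if (auto *STy = dyn_cast<StructType>(Ty)) {
        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
          Indices.push_back(i);
          splitAggregateOp(STy->getElementType(i), Indices);
          Indices.pop_back();
        }
      }
    }
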
@@ -3187,8 +3250,10 @@ private:
return false;
// We have an aggregate being loaded, split it apart.
- DEBUG(dbgs() << " original: " << LI << "\n");
- LoadOpSplitter Splitter(&LI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, AATags);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
LI.replaceAllUsesWith(V);
@@ -3197,8 +3262,9 @@ private:
}
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+ AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
@@ -3212,9 +3278,10 @@ private:
IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
Value *InBoundsGEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
}
};
@@ -3226,8 +3293,10 @@ private:
return false;
// We have an aggregate being stored, split it apart.
- DEBUG(dbgs() << " original: " << SI << "\n");
- StoreOpSplitter Splitter(&SI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, AATags);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
SI.eraseFromParent();
return true;
@@ -3256,7 +3325,7 @@ private:
} // end anonymous namespace
-/// \brief Strip aggregate type wrapping.
+/// Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
@@ -3286,7 +3355,7 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
return stripAggregateTypeWrapping(DL, InnerTy);
}
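
A simplified view of what stripAggregateTypeWrapping does: peel single-element wrappers such as { [1 x { i32 }] } down to i32 whenever the wrapper occupies exactly the same bits. The real routine is somewhat more general about leading fields; this sketch only covers the single-element case:

    static Type *stripWrapper(const DataLayout &DL, Type *Ty) {
      Type *Inner = nullptr;
      if (auto *ST = dyn_cast<StructType>(Ty)) {
        if (ST->getNumElements() == 1)
          Inner = ST->getElementType(0);
      } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
        if (AT->getNumElements() == 1)
          Inner = AT->getElementType();
      }
      if (Inner && DL.getTypeSizeInBits(Inner) == DL.getTypeSizeInBits(Ty))
        return stripWrapper(DL, Inner);   // keep peeling no-op layers
      return Ty;
    }
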
-/// \brief Try to find a partition of the aggregate type passed in for a given
+/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
@@ -3392,7 +3461,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}
-/// \brief Pre-split loads and stores to simplify rewriting.
+/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
@@ -3423,7 +3492,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
///
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
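
Concretely, the goal of this routine is to turn one wide load/store pair whose pieces are used independently into narrow pairs that line up with the partitions. Sketched in IR comments, with illustrative offsets and types:

    // Before: both i32 halves of %v are ultimately used separately.
    //   %v = load i64, i64* %src
    //   store i64 %v, i64* %dst
    //
    // After pre-splitting: each half gets its own unsplittable pair.
    //   %v.0 = load i32, i32* %src.0        ; bytes [0,4)
    //   store i32 %v.0, i32* %dst.0
    //   %v.1 = load i32, i32* %src.1        ; bytes [4,8)
    //   store i32 %v.1, i32* %dst.1
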
// Track the loads and stores which are candidates for pre-splitting here, in
// the order they first appear during the partition scan. These give stable
@@ -3455,7 +3524,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// maybe it would make it more principled?
SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
- DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
for (auto &P : AS.partitions()) {
for (Slice &S : P) {
Instruction *I = cast<Instruction>(S.getUse()->getUser());
@@ -3510,7 +3579,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Record the initial split.
- DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
auto &Offsets = SplitOffsetsMap[I];
assert(Offsets.Splits.empty() &&
"Should not have splits the first time we see an instruction!");
@@ -3570,10 +3639,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
if (LoadOffsets.Splits == StoreOffsets.Splits)
return false;
- DEBUG(dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
// We've found a store and load that we need to split
// with mismatched relative splits. Just give up on them
@@ -3646,7 +3716,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
IRB.SetInsertPoint(LI);
- DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
int Idx = 0, Size = Offsets.Splits.size();
@@ -3656,7 +3726,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
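
The switch from getPointerSizeInBits to getIndexSizeInBits throughout these hunks matters on targets whose pointers carry non-address bits (fat or tagged pointers): offset arithmetic must be sized by the index width, not the full pointer representation. The pattern, as a two-line sketch with DL, AS and PartOffset assumed in scope as above:

    // Size the offset APInt by the index width of the address space, which
    // may be narrower than the pointer itself.
    APInt Off(DL.getIndexSizeInBits(AS), PartOffset);
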
@@ -3671,9 +3741,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PLoad
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
// See if we've handled all the splits.
if (Idx >= Size)
@@ -3693,14 +3763,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *SI = cast<StoreInst>(LU);
if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
DeferredStores = true;
- DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
continue;
}
Value *StoreBasePtr = SI->getPointerOperand();
IRB.SetInsertPoint(SI);
- DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
LoadInst *PLoad = SplitLoads[Idx];
@@ -3712,11 +3783,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
- DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
// We want to immediately iterate on any allocas impacted by splitting
@@ -3765,7 +3836,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Value *LoadBasePtr = LI->getPointerOperand();
Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
- DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
// Check whether we have an already split load.
auto SplitLoadsMapI = SplitLoadsMap.find(LI);
@@ -3775,7 +3846,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
"Too few split loads for the number of splits in the store!");
} else {
- DEBUG(dbgs() << " of load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
}
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
@@ -3794,7 +3865,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
@@ -3806,7 +3877,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
@@ -3815,11 +3886,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PStore->getOperandUse(PStore->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PStore
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
if (!SplitLoads) {
- DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
}
// See if we've finished all the splits.
@@ -3874,10 +3945,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// sequence.
AS.insert(NewSlices);
- DEBUG(dbgs() << " Pre-split slices:\n");
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
#ifndef NDEBUG
for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- DEBUG(AS.print(dbgs(), I, " "));
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
#endif
// Finally, don't try to promote any allocas that now require re-splitting.
@@ -3891,7 +3962,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
return true;
}
-/// \brief Rewrite an alloca partition's users.
+/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
@@ -3934,10 +4005,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// exact same type as the original, and with the same access offsets. In that
// case, re-use the existing alloca, but still run through the rewriter to
// perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType()) {
- assert(P.beginOffset() == 0 &&
- "Non-zero begin offset but same alloca type");
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
@@ -3958,12 +4029,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
NewAI = new AllocaInst(
SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
++NumNewAllocas;
}
- DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -4040,7 +4113,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return NewAI;
}
-/// \brief Walks the slices of an alloca and form partitions based on them,
+/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (AS.begin() == AS.end())
@@ -4063,7 +4136,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
const uint64_t MaxBitVectorSize = 1024;
- if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+ if (AllocaSize <= MaxBitVectorSize) {
// If a byte boundary is included in any load or store, a slice starting or
// ending at the boundary is not splittable.
SmallBitVector SplittableOffset(AllocaSize + 1, true);
@@ -4106,7 +4179,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
if (!IsSorted)
- std::sort(AS.begin(), AS.end());
+ llvm::sort(AS.begin(), AS.end());
/// Describes the allocas introduced by rewritePartition in order to migrate
/// the debug info.
@@ -4201,7 +4274,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
return Changed;
}
-/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
Value *OldV = U;
// Replace the use with an undef value.
@@ -4216,13 +4289,13 @@ void SROA::clobberUse(Use &U) {
}
}
-/// \brief Analyze an alloca for SROA.
+/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
- DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
++NumAllocasAnalyzed;
// Special case dead allocas, as they're trivial.
@@ -4246,7 +4319,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
// Build the slices using a recursive instruction-visiting builder.
AllocaSlices AS(DL, AI);
- DEBUG(AS.print(dbgs()));
+ LLVM_DEBUG(AS.print(dbgs()));
if (AS.isEscaped())
return Changed;
@@ -4274,18 +4347,18 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
Changed |= splitAlloca(AI, AS);
- DEBUG(dbgs() << " Speculating PHIs\n");
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
- DEBUG(dbgs() << " Speculating Selects\n");
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
return Changed;
}
-/// \brief Delete the dead instructions accumulated in this run.
+/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
@@ -4299,7 +4372,7 @@ bool SROA::deleteDeadInstructions(
bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
- DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
// If the instruction is an alloca, find the possible dbg.declare connected
// to it, and remove it too. We must do this before calling RAUW or we will
@@ -4327,7 +4400,7 @@ bool SROA::deleteDeadInstructions(
return Changed;
}
-/// \brief Promote the allocas, using the best available technique.
+/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
@@ -4338,7 +4411,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
- DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
@@ -4346,7 +4419,7 @@ bool SROA::promoteAllocas(Function &F) {
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
AssumptionCache &RunAC) {
- DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
AC = &RunAC;
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 3b99ddff2e06..526487d3477e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -45,6 +45,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeScalarizerPass(Registry);
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
+ initializeLoopGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
@@ -52,9 +53,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeGVNHoistLegacyPassPass(Registry);
initializeGVNSinkLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
- initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIRCELegacyPassPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeInferAddressSpacesPass(Registry);
+ initializeInstSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -68,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
+ initializeLoopUnrollAndJamPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeLoopVersioningLICMPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
@@ -83,7 +86,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeRegToMemPass(Registry);
initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeSCCPLegacyPassPass(Registry);
- initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
@@ -104,6 +106,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializePostInlineEntryExitInstrumenterPass(Registry);
}
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
initializeScalarOpts(*unwrap(R));
}
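
This hunk moves the LLVMAddLoopSimplifyCFGPass binding up next to the other definitions (a later hunk removes the old copy). Driving it from the C API looks roughly like this, assuming an existing LLVMModuleRef named Mod:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Scalar.h"

    void runLoopSimplifyCFG(LLVMModuleRef Mod) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopSimplifyCFGPass(PM);   // binding defined in this file
      LLVMRunPassManager(PM, Mod);
      LLVMDisposePassManager(PM);
    }
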
@@ -148,10 +154,6 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createIndVarSimplifyPass());
}
-void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createInstructionCombiningPass());
-}
-
void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
@@ -180,14 +182,14 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopRerollPass());
}
-void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSimplifyCFGPass());
-}
-
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnswitchPass());
}
@@ -200,14 +202,6 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPartiallyInlineLibCallsPass());
}
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSwitchPass());
-}
-
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPromoteMemoryToRegisterPass());
-}
-
void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createReassociatePass());
}
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8fa9ffb6d014..967f4a42a8fb 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -165,8 +165,8 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -190,7 +190,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -213,7 +212,7 @@ static cl::opt<bool>
namespace {
-/// \brief A helper class for separating a constant offset from a GEP index.
+/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially split. For
@@ -340,16 +339,15 @@ private:
const DominatorTree *DT;
};
-/// \brief A pass that tries to split every GEP in the function into a variadic
+/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
static char ID;
- SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
- bool LowerGEP = false)
- : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
+ SeparateConstOffsetFromGEP(bool LowerGEP = false)
+ : FunctionPass(ID), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
}
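
In IR terms, the split this pass performs looks like the following; the example is illustrative, not taken from the source:

    // Before: the constant 5 is trapped inside the index computation.
    //   %idx = add i64 %i, 5
    //   %p   = getelementptr float, float* %base, i64 %idx
    //
    // After splitting: the variadic base can be CSE'd across uses and the
    // trailing constant folded into the target's addressing mode.
    //   %base.i = getelementptr float, float* %base, i64 %i
    //   %p      = getelementptr float, float* %base.i, i64 5
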
@@ -450,7 +448,6 @@ private:
const DataLayout *DL = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE;
- const TargetMachine *TM;
LoopInfo *LI;
TargetLibraryInfo *TLI;
@@ -480,10 +477,8 @@ INITIALIZE_PASS_END(
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
-FunctionPass *
-llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
- bool LowerGEP) {
- return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(LowerGEP);
}
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -502,6 +497,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
// Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
// don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ // FIXME: this does not appear to be covered by any tests
+ // (with x86/aarch64 backends at least)
if (BO->getOpcode() == Instruction::Or &&
!haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
return false;
@@ -590,6 +587,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<TruncInst>(V)) {
+ ConstantOffset =
+ find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+ .trunc(BitWidth);
} else if (isa<SExtInst>(V)) {
ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
ZeroExtended, NonNegative).sext(BitWidth);
@@ -654,8 +655,9 @@ ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
}
if (CastInst *Cast = dyn_cast<CastInst>(U)) {
- assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
- "We only traced into two types of CastInst: sext and zext");
+ assert(
+ (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+ "Only following instructions can be traced: sext, zext & trunc");
ExtInsts.push_back(Cast);
UserChain[ChainIndex] = nullptr;
return distributeExtsAndCloneChain(ChainIndex - 1);
@@ -706,7 +708,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
// Rebuild "or" as "add", because "or" may be invalid for the new
- // epxression.
+ // expression.
//
// For instance, given
// a | (b + 5) where a and b + 5 have no common bits,
@@ -943,6 +945,10 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (!NeedsExtraction)
return Changed;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());
+
// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
// splitting probably won't be beneficial.
@@ -951,9 +957,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// of variable indices. Therefore, we don't check for addressing modes in that
// case.
if (!LowerGEP) {
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *GEP->getParent()->getParent());
unsigned AddrSpace = GEP->getPointerAddressSpace();
if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
/*BaseGV=*/nullptr, AccumulativeByteOffset,
@@ -1016,7 +1019,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (LowerGEP) {
// As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
// arithmetic operations if the target uses alias analysis in codegen.
- if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ if (TTI.useAA())
lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
else
lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -1065,12 +1068,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
DL->getTypeAllocSize(GEP->getResultElementType()));
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
- // Very likely. As long as %gep is natually aligned, the byte offset we
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
ConstantInt::get(IntPtrTy, Index, true),
GEP->getName(), GEP);
+ NewGEP->copyMetadata(*GEP);
// Inherit the inbounds attribute of the original GEP.
cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
} else {
@@ -1095,6 +1099,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
Type::getInt8Ty(GEP->getContext()), NewGEP,
ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
GEP);
+ NewGEP->copyMetadata(*GEP);
// Inherit the inbounds attribute of the original GEP.
cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
if (GEP->getType() != I8PtrTy)
@@ -1293,7 +1298,7 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
// We changed p+o+c to p+c+o, p+c may not be inbound anymore.
const DataLayout &DAL = First->getModule()->getDataLayout();
- APInt Offset(DAL.getPointerSizeInBits(
+ APInt Offset(DAL.getIndexSizeInBits(
cast<PointerType>(First->getType())->getAddressSpace()),
0);
Value *NewBase =
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 3d0fca0bc3a5..34510cb40732 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,4 +1,4 @@
-//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,10 +17,14 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -66,180 +70,65 @@ static cl::opt<int>
UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
cl::desc("The cost threshold for unswitching a loop."));
-static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
- Constant &Replacement) {
- assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
- // Replace uses of LIC in the loop with the given constant.
- for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- if (!UserI || !L.contains(UserI))
- continue;
-
- // Replace this use within the loop body.
- *U = &Replacement;
- }
-}
-
-/// Update the IDom for a basic block whose predecessor set has changed.
-///
-/// This routine is designed to work when the domtree update is relatively
-/// localized by leveraging a known common dominator, often a loop header.
-///
-/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS
-/// approach here as we can do that easily by persisting the candidate IDom's
-/// dominating set between each predecessor.
+/// Collect all of the loop invariant input values transitively used by the
+/// homogeneous instruction graph from a given root.
///
-/// FIXME: Longer term, many uses of this can be replaced by an incremental
-/// domtree update strategy that starts from a known dominating block and
-/// rebuilds that subtree.
-static bool updateIDomWithKnownCommonDominator(BasicBlock *BB,
- BasicBlock *KnownDominatingBB,
- DominatorTree &DT) {
- assert(pred_begin(BB) != pred_end(BB) &&
- "This routine does not handle unreachable blocks!");
-
- BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock();
-
- BasicBlock *IDom = *pred_begin(BB);
- assert(DT.dominates(KnownDominatingBB, IDom) &&
- "Bad known dominating block!");
-
- // Walk all of the other predecessors finding the nearest common dominator
- // until all predecessors are covered or we reach the loop header. The loop
- // header necessarily dominates all loop exit blocks in loop simplified form
- // so we can early-exit the moment we hit that block.
- for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB);
- PI != PE && IDom != KnownDominatingBB; ++PI) {
- assert(DT.dominates(KnownDominatingBB, *PI) &&
- "Bad known dominating block!");
- IDom = DT.findNearestCommonDominator(IDom, *PI);
- }
+/// This essentially walks from a root recursively through loop variant operands
+/// which have the exact same opcode and finds all inputs which are loop
+/// invariant. For some operations these can be re-associated and unswitched out
+/// of the loop entirely.
+static TinyPtrVector<Value *>
+collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
+ LoopInfo &LI) {
+ assert(!L.isLoopInvariant(&Root) &&
+ "Only need to walk the graph if root itself is not invariant.");
+ TinyPtrVector<Value *> Invariants;
+
+ // Build a worklist and recurse through operators collecting invariants.
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(&Root);
+ Visited.insert(&Root);
+ do {
+ Instruction &I = *Worklist.pop_back_val();
+ for (Value *OpV : I.operand_values()) {
+ // Skip constants as unswitching isn't interesting for them.
+ if (isa<Constant>(OpV))
+ continue;
- if (IDom == OrigIDom)
- return false;
+ // Add it to our result if loop invariant.
+ if (L.isLoopInvariant(OpV)) {
+ Invariants.push_back(OpV);
+ continue;
+ }
- DT.changeImmediateDominator(BB, IDom);
- return true;
-}
+ // If not an instruction with the same opcode, nothing we can do.
+ Instruction *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI || OpI->getOpcode() != Root.getOpcode())
+ continue;
-// Note that we don't currently use the IDFCalculator here for two reasons:
-// 1) It computes dominator tree levels for the entire function on each run
-// of 'compute'. While this isn't terrible, given that we expect to update
-// relatively small subtrees of the domtree, it isn't necessarily the right
-// tradeoff.
-// 2) The interface doesn't fit this usage well. It doesn't operate in
-// append-only, and builds several sets that we don't need.
-//
-// FIXME: Neither of these issues are a big deal and could be addressed with
-// some amount of refactoring of IDFCalculator. That would allow us to share
-// the core logic here (which is solving the same core problem).
-static void appendDomFrontier(DomTreeNode *Node,
- SmallSetVector<BasicBlock *, 4> &Worklist,
- SmallVectorImpl<DomTreeNode *> &DomNodes,
- SmallPtrSetImpl<BasicBlock *> &DomSet) {
- assert(DomNodes.empty() && "Must start with no dominator nodes.");
- assert(DomSet.empty() && "Must start with an empty dominator set.");
-
- // First flatten this subtree into sequence of nodes by doing a pre-order
- // walk.
- DomNodes.push_back(Node);
- // We intentionally re-evaluate the size as each node can add new children.
- // Because this is a tree walk, this cannot add any duplicates.
- for (int i = 0; i < (int)DomNodes.size(); ++i)
- DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
-
- // Now create a set of the basic blocks so we can quickly test for
- // dominated successors. We could in theory use the DFS numbers of the
- // dominator tree for this, but we want this to remain predictably fast
- // even while we mutate the dominator tree in ways that would invalidate
- // the DFS numbering.
- for (DomTreeNode *InnerN : DomNodes)
- DomSet.insert(InnerN->getBlock());
-
- // Now re-walk the nodes, appending every successor of every node that isn't
- // in the set. Note that we don't append the node itself, even though if it
- // is a successor it does not strictly dominate itself and thus it would be
- // part of the dominance frontier. The reason we don't append it is that
- // the node passed in came *from* the worklist and so it has already been
- // processed.
- for (DomTreeNode *InnerN : DomNodes)
- for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
- if (!DomSet.count(SuccBB))
- Worklist.insert(SuccBB);
-
- DomNodes.clear();
- DomSet.clear();
-}
+ // Visit this operand.
+ if (Visited.insert(OpI).second)
+ Worklist.push_back(OpI);
+ }
+ } while (!Worklist.empty());
-/// Update the dominator tree after unswitching a particular former exit block.
-///
-/// This handles the full update of the dominator tree after hoisting a block
-/// that previously was an exit block (or split off of an exit block) up to be
-/// reached from the new immediate dominator of the preheader.
-///
-/// The common case is simple -- we just move the unswitched block to have an
-/// immediate dominator of the old preheader. But in complex cases, there may
-/// be other blocks reachable from the unswitched block that are immediately
-/// dominated by some node between the unswitched one and the old preheader.
-/// All of these also need to be hoisted in the dominator tree. We also want to
-/// minimize queries to the dominator tree because each step of this
-/// invalidates any DFS numbers that would make queries fast.
-static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
- DominatorTree &DT) {
- DomTreeNode *OldPHNode = DT[OldPH];
- DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
- // If the dominator tree has already been updated for this unswitched node,
- // we're done. This makes it easier to use this routine if there are multiple
- // paths to the same unswitched destination.
- if (UnswitchedNode->getIDom() == OldPHNode)
- return;
+ return Invariants;
+}
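
For intuition, a chain of same-opcode conditions yields exactly its invariant leaves; an illustrative example, with %inv.* loop-invariant and %var loop-varying:

    // %c0 = and i1 %inv.a, %var
    // %c  = and i1 %c0, %inv.b
    //
    // Walking from %c visits %c0 (same 'and' opcode) and collects
    // { %inv.a, %inv.b }; the buildPartialUnswitchConditionalBranch helper
    // added below can then test their conjunction once, outside the loop,
    // and unswitch the invariant-false path.
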
- // First collect the domtree nodes that we are hoisting over. These are the
- // set of nodes which may have children that need to be hoisted as well.
- SmallPtrSet<DomTreeNode *, 4> DomChain;
- for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
- IDom = IDom->getIDom())
- DomChain.insert(IDom);
-
- // The unswitched block ends up immediately dominated by the old preheader --
- // regardless of whether it is the loop exit block or split off of the loop
- // exit block.
- DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
-
- // For everything that moves up the dominator tree, we need to examine the
- // dominator frontier to see if it additionally should move up the dominator
- // tree. This lambda appends the dominator frontier for a node on the
- // worklist.
- SmallSetVector<BasicBlock *, 4> Worklist;
-
- // Scratch data structures reused by domfrontier finding.
- SmallVector<DomTreeNode *, 4> DomNodes;
- SmallPtrSet<BasicBlock *, 4> DomSet;
-
- // Append the initial dom frontier nodes.
- appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet);
-
- // Walk the worklist. We grow the list in the loop and so must recompute size.
- for (int i = 0; i < (int)Worklist.size(); ++i) {
- auto *BB = Worklist[i];
-
- DomTreeNode *Node = DT[BB];
- assert(!DomChain.count(Node) &&
- "Cannot be dominated by a block you can reach!");
-
- // If this block had an immediate dominator somewhere in the chain
- // we hoisted over, then its position in the domtree needs to move as it is
- // reachable from a node hoisted over this chain.
- if (!DomChain.count(Node->getIDom()))
- continue;
+static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
+ Constant &Replacement) {
+ assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
- DT.changeImmediateDominator(Node, OldPHNode);
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- // Now add this node's dominator frontier to the worklist as well.
- appendDomFrontier(Node, Worklist, DomNodes, DomSet);
+ // Replace this use within the loop body.
+ if (UserI && L.contains(UserI))
+ U->set(&Replacement);
}
}
@@ -261,6 +150,26 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
llvm_unreachable("Basic blocks should never be empty!");
}
+/// Insert code to test a set of loop invariant values, and conditionally branch
+/// on them.
+static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
+ ArrayRef<Value *> Invariants,
+ bool Direction,
+ BasicBlock &UnswitchedSucc,
+ BasicBlock &NormalSucc) {
+ IRBuilder<> IRB(&BB);
+ Value *Cond = Invariants.front();
+ for (Value *Invariant :
+ make_range(std::next(Invariants.begin()), Invariants.end()))
+ if (Direction)
+ Cond = IRB.CreateOr(Cond, Invariant);
+ else
+ Cond = IRB.CreateAnd(Cond, Invariant);
+
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc);
+}
+
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
///
/// Requires that the loop exit and unswitched basic block are the same, and
@@ -271,19 +180,14 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
BasicBlock &OldExitingBB,
BasicBlock &OldPH) {
- for (Instruction &I : UnswitchedBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- // No more PHIs to check.
- break;
-
+ for (PHINode &PN : UnswitchedBB.phis()) {
// When the loop exit is directly unswitched we just need to update the
// incoming basic block. We loop to handle weird cases with repeated
// incoming blocks, but expect to typically only have one operand here.
- for (auto i : seq<int>(0, PN->getNumOperands())) {
- assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ for (auto i : seq<int>(0, PN.getNumOperands())) {
+ assert(PN.getIncomingBlock(i) == &OldExitingBB &&
"Found incoming block different from unique predecessor!");
- PN->setIncomingBlock(i, &OldPH);
+ PN.setIncomingBlock(i, &OldPH);
}
}
}
@@ -298,18 +202,14 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
BasicBlock &UnswitchedBB,
BasicBlock &OldExitingBB,
- BasicBlock &OldPH) {
+ BasicBlock &OldPH,
+ bool FullUnswitch) {
assert(&ExitBB != &UnswitchedBB &&
"Must have different loop exit and unswitched blocks!");
Instruction *InsertPt = &*UnswitchedBB.begin();
- for (Instruction &I : ExitBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- // No more PHIs to check.
- break;
-
- auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
- PN->getName() + ".split", InsertPt);
+ for (PHINode &PN : ExitBB.phis()) {
+ auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
+ PN.getName() + ".split", InsertPt);
// Walk backwards over the old PHI node's inputs to minimize the cost of
// removing each one. We have to do this weird loop manually so that we
@@ -320,18 +220,92 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
// allowed us to create a single entry for a predecessor block without
// having separate entries for each "edge" even though these edges are
// required to produce identical results.
- for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
- if (PN->getIncomingBlock(i) != &OldExitingBB)
+ for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != &OldExitingBB)
continue;
- Value *Incoming = PN->removeIncomingValue(i);
+ Value *Incoming = PN.getIncomingValue(i);
+ if (FullUnswitch)
+ // No more edge from the old exiting block to the exit block.
+ PN.removeIncomingValue(i);
+
NewPN->addIncoming(Incoming, &OldPH);
}
// Now replace the old PHI with the new one and wire the old one in as an
// input to the new one.
- PN->replaceAllUsesWith(NewPN);
- NewPN->addIncoming(PN, &ExitBB);
+ PN.replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(&PN, &ExitBB);
+ }
+}
+
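A hedged IR-level sketch of the PHI split performed above for the full-unswitch case (all block and value names are hypothetical):

    // Before, the loop exit merged values from the exiting block and another
    // predecessor:
    //   exit:        %v = phi i32 [ %a, %exiting ], [ %b, %other ]
    // After SplitBlock and the rewrite above:
    //   exit:        %v = phi i32 [ %b, %other ]       ; in-loop edge removed
    //                br label %exit.split
    //   exit.split:  %v.split = phi i32 [ %a, %old.ph ], [ %v, %exit ]
    // All prior uses of %v are redirected to %v.split, preserving LCSSA.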
+/// Hoist the current loop up to the innermost loop containing a remaining exit.
+///
+/// Because we've removed an exit from the loop, we may have changed the set of
+/// reachable loops and may need to move the current loop up the loop nest or even
+/// to an entirely separate nest.
+static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
+ DominatorTree &DT, LoopInfo &LI) {
+ // If the loop is already at the top level, we can't hoist it anywhere.
+ Loop *OldParentL = L.getParentLoop();
+ if (!OldParentL)
+ return;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getExitBlocks(Exits);
+ Loop *NewParentL = nullptr;
+ for (auto *ExitBB : Exits)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB))
+ if (!NewParentL || NewParentL->contains(ExitL))
+ NewParentL = ExitL;
+
+ if (NewParentL == OldParentL)
+ return;
+
+ // The new parent loop (if different) should always contain the old one.
+ if (NewParentL)
+ assert(NewParentL->contains(OldParentL) &&
+ "Can only hoist this loop up the nest!");
+
+ // The preheader will need to move with the body of this loop. However,
+ // because it isn't in this loop we also need to update the primary loop map.
+ assert(OldParentL == LI.getLoopFor(&Preheader) &&
+ "Parent loop of this loop should contain this loop's preheader!");
+ LI.changeLoopFor(&Preheader, NewParentL);
+
+ // Remove this loop from its old parent.
+ OldParentL->removeChildLoop(&L);
+
+ // Add the loop either to the new parent or as a top-level loop.
+ if (NewParentL)
+ NewParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Remove this loop's blocks from the old parent and every other loop up the
+ // nest until reaching the new parent. Also update all of these
+ // no-longer-containing loops to reflect the nesting change.
+ for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
+ OldContainingL = OldContainingL->getParentLoop()) {
+ llvm::erase_if(OldContainingL->getBlocksVector(),
+ [&](const BasicBlock *BB) {
+ return BB == &Preheader || L.contains(BB);
+ });
+
+ OldContainingL->getBlocksSet().erase(&Preheader);
+ for (BasicBlock *BB : L.blocks())
+ OldContainingL->getBlocksSet().erase(BB);
+
+ // Because we just hoisted a loop out of this one, we have essentially
+ // created new exit paths from it. That means we need to form LCSSA PHI
+ // nodes for values used in the no-longer-nested loop.
+ formLCSSA(*OldContainingL, DT, &LI, nullptr);
+
+ // We shouldn't need to form dedicated exits because the exit introduced
+ // here is the (just split by unswitching) preheader. As such, it is
+ // necessarily dedicated.
+ assert(OldContainingL->hasDedicatedExits() &&
+ "Unexpected predecessor of hoisted loop preheader!");
}
}
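A hedged worked example of the re-parenting above (the loop names are hypothetical):

    // Hypothetical nest: G { P { L } }; L's one remaining exit lands in G.
    //   LI.getLoopFor(ExitBB) == G          -> NewParentL = G
    //   LI.changeLoopFor(&Preheader, G);    // preheader travels with L
    //   P->removeChildLoop(&L);  G->addChildLoop(&L);
    //   // P drops L's blocks and the preheader from its lists, and LCSSA is
    //   // re-formed for P because L's values now escape it.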
@@ -349,48 +323,83 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
/// (splitting the exit block as necessary). It simplifies the branch within
/// the loop to an unconditional branch but doesn't remove it entirely. Further
/// cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+ LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
- Value *LoopCond = BI.getCondition();
+ // The loop invariant values that we want to unswitch.
+ TinyPtrVector<Value *> Invariants;
- // Need a trivial loop condition to unswitch.
- if (!L.isLoopInvariant(LoopCond))
- return false;
+ // When true, we're fully unswitching the branch rather than just unswitching
+ // some input conditions to the branch.
+ bool FullUnswitch = false;
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
- // Check to see if a successor of the branch is guaranteed to
- // exit through a unique exit block without having any
- // side-effects. If so, determine the value of Cond that causes
- // it to do this.
- ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
- ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ if (L.isLoopInvariant(BI.getCondition())) {
+ Invariants.push_back(BI.getCondition());
+ FullUnswitch = true;
+ } else {
+ if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
+ Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
+ if (Invariants.empty())
+ // Couldn't find invariant inputs!
+ return false;
+ }
+
+ // Check that one of the branch's successors exits, and which one.
+ bool ExitDirection = true;
int LoopExitSuccIdx = 0;
auto *LoopExitBB = BI.getSuccessor(0);
- if (!ExitBlockSet.count(LoopExitBB)) {
- std::swap(CondVal, Replacement);
+ if (L.contains(LoopExitBB)) {
+ ExitDirection = false;
LoopExitSuccIdx = 1;
LoopExitBB = BI.getSuccessor(1);
- if (!ExitBlockSet.count(LoopExitBB))
+ if (L.contains(LoopExitBB))
return false;
}
auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
- assert(L.contains(ContinueBB) &&
- "Cannot have both successors exit and still be in the loop!");
-
auto *ParentBB = BI.getParent();
if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
- DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
- << " == " << LoopCond << "\n");
+ // When unswitching only part of the branch's condition, we need the exit
+ // block to be reached directly from the partially unswitched input. This can
+ // be done when the exit block is along the true edge and the branch condition
+ // is a graph of `or` operations, or the exit block is along the false edge
+ // and the condition is a graph of `and` operations.
+ if (!FullUnswitch) {
+ if (ExitDirection) {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
+ return false;
+ } else {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
+ return false;
+ }
+ }
+
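A hedged source-level illustration of this constraint (the loop and names below are hypothetical): the exit must sit on the true edge of an `or` graph, or the false edge of an `and` graph, so that a suitable invariant value alone forces the exit.

    // `Inv` is loop-invariant; `A[i]` varies per iteration.
    for (int i = 0; i < N; ++i) {
      if (Inv || A[i])      // exit on the true edge of an `or` graph
        break;
      Sum += A[i];
    }
    // Testing `Inv` once in the preheader is sound: when it is true the exit
    // is reached before any body side effects, regardless of A[i].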
+ LLVM_DEBUG({
+ dbgs() << " unswitching trivial invariant conditions for: " << BI
+ << "\n";
+ for (Value *Invariant : Invariants) {
+ dbgs() << " " << *Invariant << " == true";
+ if (Invariant != Invariants.back())
+ dbgs() << " ||";
+ dbgs() << "\n";
+ }
+ });
+
+ // If we have scalar evolutions, we need to invalidate them including this
+ // loop and the loop containing the exit block.
+ if (SE) {
+ if (Loop *ExitL = LI.getLoopFor(LoopExitBB))
+ SE->forgetLoop(ExitL);
+ else
+ // Forget the entire nest as this exits the entire nest.
+ SE->forgetTopmostLoop(&L);
+ }
// Split the preheader, so that we know that there is a safe place to insert
// the conditional branch. We will change the preheader to have a conditional
@@ -403,45 +412,73 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// unswitching. We need to split this if there are other loop predecessors.
// Because the loop is in simplified form, *any* other predecessor is enough.
BasicBlock *UnswitchedBB;
- if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
- (void)PredBB;
- assert(PredBB == BI.getParent() &&
+ if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
+ assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
"A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- // Now splice the branch to gate reaching the new preheader and re-point its
- // successors.
- OldPH->getInstList().splice(std::prev(OldPH->end()),
- BI.getParent()->getInstList(), BI);
+ // Actually move the invariant uses into the unswitched position. If possible,
+ // we do this by moving the instructions, but when doing partial unswitching
+ // we do it by building a new merge of the values in the unswitched position.
OldPH->getTerminator()->eraseFromParent();
- BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
- BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
-
- // Create a new unconditional branch that will continue the loop as a new
- // terminator.
- BranchInst::Create(ContinueBB, ParentBB);
+ if (FullUnswitch) {
+ // If fully unswitching, we can use the existing branch instruction.
+ // Splice it into the old PH to gate reaching the new preheader and re-point
+ // its successors.
+ OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
+ BI);
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+ } else {
+ // Only unswitching a subset of inputs to the condition, so we will need to
+ // build a new branch that merges the invariant inputs.
+ if (ExitDirection)
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Must have an `or` of `i1`s for the condition!");
+ else
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Must have an `and` of `i1`s for the condition!");
+ buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
+ *UnswitchedBB, *NewPH);
+ }
// Rewrite the relevant PHI nodes.
if (UnswitchedBB == LoopExitBB)
rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
else
rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
- *ParentBB, *OldPH);
+ *ParentBB, *OldPH, FullUnswitch);
// Now we need to update the dominator tree.
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
- // But if we split something off of the loop exit block then we also removed
- // one of the predecessors for the loop exit block and may need to update its
- // idom.
- if (UnswitchedBB != LoopExitBB)
- updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
+ DT.insertEdge(OldPH, UnswitchedBB);
+ if (FullUnswitch)
+ DT.deleteEdge(ParentBB, UnswitchedBB);
+
+ // The constant we can replace all of our invariants with inside the loop
+ // body. If any of the invariants has a value other than this, the loop won't
+ // be entered.
+ ConstantInt *Replacement = ExitDirection
+ ? ConstantInt::getFalse(BI.getContext())
+ : ConstantInt::getTrue(BI.getContext());
// Since this is an i1 condition we can also trivially replace uses of it
// within the loop with a constant.
- replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+ for (Value *Invariant : Invariants)
+ replaceLoopInvariantUses(L, Invariant, *Replacement);
+
+ // If this was full unswitching, we may have changed the nesting relationship
+ // for this loop so hoist it to its correct parent if needed.
+ if (FullUnswitch)
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
++NumTrivial;
++NumBranches;
@@ -471,9 +508,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
/// switch will not be revisited. If after unswitching there is only a single
/// in-loop successor, the switch is further simplified to an unconditional
/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
- LoopInfo &LI) {
- DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ LoopInfo &LI, ScalarEvolution *SE) {
+ LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
Value *LoopCond = SI.getCondition();
// If this isn't switching on an invariant condition, we can't unswitch it.
@@ -482,41 +522,62 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
auto *ParentBB = SI.getParent();
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) &&
+ if (!L.contains(SuccBB) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
- if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ if (!L.contains(SI.getDefaultDest()) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
return false;
- DEBUG(dbgs() << " unswitching trivial cases...\n");
+ LLVM_DEBUG(dbgs() << " unswitching trivial cases...\n");
+ // We may need to invalidate SCEVs for the outermost loop reached by any of
+ // the exits.
+ Loop *OuterL = &L;
+
+ if (DefaultExitBB) {
+ // Clear out the default destination temporarily to allow accurate
+ // predecessor lists to be examined below.
+ SI.setDefaultDest(nullptr);
+ // Check the loop containing this exit.
+ Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ }
+
+ // Store the exit cases into a separate data structure and remove them from
+ // the switch.
SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
ExitCases.reserve(ExitCaseIndices.size());
// We walk the case indices backwards so that we remove the last case first
// and don't disrupt the earlier indices.
for (unsigned Index : reverse(ExitCaseIndices)) {
auto CaseI = SI.case_begin() + Index;
+ // Compute the outer loop from this exit.
+ Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
// Save the value of this case.
ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
// Delete the unswitched cases.
SI.removeCase(CaseI);
}
+ if (SE) {
+ if (OuterL)
+ SE->forgetLoop(OuterL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
// Check if after this all of the remaining cases point at the same
// successor.
BasicBlock *CommonSuccBB = nullptr;
@@ -527,23 +588,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SI.case_begin()->getCaseSuccessor();
}))
CommonSuccBB = SI.case_begin()->getCaseSuccessor();
-
- if (DefaultExitBB) {
- // We can't remove the default edge so replace it with an edge to either
- // the single common remaining successor (if we have one) or an unreachable
- // block.
- if (CommonSuccBB) {
- SI.setDefaultDest(CommonSuccBB);
- } else {
- BasicBlock *UnreachableBB = BasicBlock::Create(
- ParentBB->getContext(),
- Twine(ParentBB->getName()) + ".unreachable_default",
- ParentBB->getParent());
- new UnreachableInst(ParentBB->getContext(), UnreachableBB);
- SI.setDefaultDest(UnreachableBB);
- DT.addNewBlock(UnreachableBB, ParentBB);
- }
- } else {
+ if (!DefaultExitBB) {
// If we're not unswitching the default, we need it to match any cases to
// have a common successor or if we have no cases it is the common
// successor.
@@ -580,9 +625,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
} else {
auto *SplitBB =
SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
}
}
@@ -607,9 +651,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
}
// Update the case pair to point to the split block.
CasePair.second = SplitExitBB;
@@ -622,14 +665,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
BasicBlock *UnswitchedBB = CasePair.second;
NewSI->addCase(CaseVal, UnswitchedBB);
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
}
// If the default was unswitched, re-point it and add explicit cases for
// entering the loop.
if (DefaultExitBB) {
NewSI->setDefaultDest(DefaultExitBB);
- updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
// We removed all the exit cases, so we just copy the cases to the
// unswitched switch.
@@ -643,11 +684,57 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// pointing at unreachable and other complexity.
if (CommonSuccBB) {
BasicBlock *BB = SI.getParent();
+ // We may have had multiple edges to this common successor block, so remove
+ // them as predecessors. We skip the first one, either the default or the
+ // actual first case.
+ bool SkippedFirst = DefaultExitBB == nullptr;
+ for (auto Case : SI.cases()) {
+ assert(Case.getCaseSuccessor() == CommonSuccBB &&
+ "Non-common successor!");
+ (void)Case;
+ if (!SkippedFirst) {
+ SkippedFirst = true;
+ continue;
+ }
+ CommonSuccBB->removePredecessor(BB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+ // Now nuke the switch and replace it with a direct branch.
SI.eraseFromParent();
BranchInst::Create(CommonSuccBB, BB);
+ } else if (DefaultExitBB) {
+ assert(SI.getNumCases() > 0 &&
+ "If we had no cases we'd have a common successor!");
+ // Move the last case to the default successor. This is valid because, if
+ // the default got unswitched, it cannot be reached. This has the advantage of
+ // being simple and keeping the number of edges from this switch to
+ // successors the same, and avoiding any PHI update complexity.
+ auto LastCaseI = std::prev(SI.case_end());
+ SI.setDefaultDest(LastCaseI->getCaseSuccessor());
+ SI.removeCase(LastCaseI);
+ }
+
+ // Walk the unswitched exit blocks and the unswitched split blocks and update
+ // the dominator tree based on the CFG edits. While we are walking unordered
+ // containers here, the API for applyUpdates takes an unordered list of
+ // updates and requires them to not contain duplicates.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
}
+ for (auto SplitUnswitchedPair : SplitExitBBMap) {
+ auto *UnswitchedBB = SplitUnswitchedPair.second;
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
+ }
+ DT.applyUpdates(DTUpdates);
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
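For reference, a minimal sketch of the batched-update pattern used here (`OldPred`, `NewPred`, and `NewSuccs` are hypothetical); a small set filters duplicates before the batch is applied:

    SmallVector<DominatorTree::UpdateType, 8> Updates;
    SmallPtrSet<BasicBlock *, 8> Seen;
    for (BasicBlock *Succ : NewSuccs)
      if (Seen.insert(Succ).second) {             // dedupe repeated edges
        Updates.push_back({DominatorTree::Delete, OldPred, Succ});
        Updates.push_back({DominatorTree::Insert, NewPred, Succ});
      }
    DT.applyUpdates(Updates);                     // order is irrelevant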
+ // We may have changed the nesting relationship for this loop so hoist it to
+ // its correct parent if needed.
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
- DT.verifyDomTree();
++NumTrivial;
++NumSwitches;
return true;
@@ -662,8 +749,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
///
/// The return value indicates whether anything was unswitched (and therefore
/// changed).
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
bool Changed = false;
// If loop header has only one reachable successor we should keep looking for
@@ -697,8 +787,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
if (isa<Constant>(SI->getCondition()))
return Changed;
- if (!unswitchTrivialSwitch(L, *SI, DT, LI))
- // Coludn't unswitch this one so we're done.
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE))
+ // Couldn't unswitch this one so we're done.
return Changed;
// Mark that we managed to unswitch something.
@@ -729,17 +819,19 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// Found a trivial condition candidate: non-foldable conditional branch. If
// we fail to unswitch this, we can't do anything else that is trivial.
- if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ if (!unswitchTrivialBranch(L, *BI, DT, LI, SE))
return Changed;
// Mark that we managed to unswitch something.
Changed = true;
- // We unswitched the branch. This should always leave us with an
- // unconditional branch that we can follow now.
+ // If we only unswitched some of the conditions feeding the branch, we won't
+ // have collapsed it to a single successor.
BI = cast<BranchInst>(CurrentBB->getTerminator());
- assert(!BI->isConditional() &&
- "Cannot form a conditional branch by unswitching1");
+ if (BI->isConditional())
+ return Changed;
+
+ // Follow the newly unconditional branch into its successor.
CurrentBB = BI->getSuccessor(0);
// When continuing, if we exit the loop or reach a previous visited block,
@@ -758,8 +850,12 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
///
/// This routine handles cloning all of the necessary loop blocks and exit
/// blocks including rewriting their instructions and the relevant PHI nodes.
-/// It skips loop and exit blocks that are not necessary based on the provided
-/// set. It also correctly creates the unconditional branch in the cloned
+/// Any loop blocks or exit blocks which are dominated by a different successor
+/// than the one for this clone of the loop blocks can be trivially skipped. We
+/// use the `DominatingSucc` map to determine whether a block satisfies that
+/// property with a simple map lookup.
+///
+/// It also correctly creates the unconditional branch in the cloned
/// unswitched parent block to only point at the unswitched successor.
///
/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
@@ -773,9 +869,10 @@ static BasicBlock *buildClonedLoopBlocks(
Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
- const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
- ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT,
- LoopInfo &LI) {
+ const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
+ ValueToValueMapTy &VMap,
+ SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
+ DominatorTree &DT, LoopInfo &LI) {
SmallVector<BasicBlock *, 4> NewBlocks;
NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
@@ -790,26 +887,29 @@ static BasicBlock *buildClonedLoopBlocks(
NewBlocks.push_back(NewBB);
VMap[OldBB] = NewBB;
- // Add the block to the domtree. We'll move it to the correct position
- // below.
- DT.addNewBlock(NewBB, SplitBB);
-
return NewBB;
};
+ // We skip cloning blocks when they have a dominating succ that is not the
+ // succ we are cloning for.
+ auto SkipBlock = [&](BasicBlock *BB) {
+ auto It = DominatingSucc.find(BB);
+ return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
+ };
+
// First, clone the preheader.
auto *ClonedPH = CloneBlock(LoopPH);
// Then clone all the loop blocks, skipping the ones that aren't necessary.
for (auto *LoopBB : L.blocks())
- if (!SkippedLoopAndExitBlocks.count(LoopBB))
+ if (!SkipBlock(LoopBB))
CloneBlock(LoopBB);
// Split all the loop exit edges so that when we clone the exit blocks, if
// any of the exit blocks are *also* a preheader for some other loop, we
// don't create multiple predecessors entering the loop header.
for (auto *ExitBB : ExitBlocks) {
- if (SkippedLoopAndExitBlocks.count(ExitBB))
+ if (SkipBlock(ExitBB))
continue;
// When we are going to clone an exit, we don't need to clone all the
@@ -832,17 +932,6 @@ static BasicBlock *buildClonedLoopBlocks(
assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
"Cloned exit block has the wrong successor!");
- // Move the merge block's idom to be the split point as one exit is
- // dominated by one header, and the other by another, so we know the split
- // point dominates both. While the dominator tree isn't fully accurate, we
- // want sub-trees within the original loop to be correctly reflect
- // dominance within that original loop (at least) and that requires moving
- // the merge block out of that subtree.
- // FIXME: This is very brittle as we essentially have a partial contract on
- // the dominator tree. We really need to instead update it and keep it
- // valid or stop relying on it.
- DT.changeImmediateDominator(MergeBB, SplitBB);
-
// Remap any cloned instructions and create a merge phi node for them.
for (auto ZippedInsts : llvm::zip_first(
llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
@@ -882,28 +971,63 @@ static BasicBlock *buildClonedLoopBlocks(
AC.registerAssumption(II);
}
- // Remove the cloned parent as a predecessor of the cloned continue successor
- // if we did in fact clone it.
- auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
- if (auto *ClonedContinueSuccBB =
- cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
- ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
- /*DontDeleteUselessPHIs*/ true);
- // Replace the cloned branch with an unconditional branch to the cloneed
- // unswitched successor.
- auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
- ClonedParentBB->getTerminator()->eraseFromParent();
- BranchInst::Create(ClonedSuccBB, ClonedParentBB);
-
// Update any PHI nodes in the cloned successors of the skipped blocks to not
// have spurious incoming values.
for (auto *LoopBB : L.blocks())
- if (SkippedLoopAndExitBlocks.count(LoopBB))
+ if (SkipBlock(LoopBB))
for (auto *SuccBB : successors(LoopBB))
if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
for (PHINode &PN : ClonedSuccBB->phis())
PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+ // Remove the cloned parent as a predecessor of any successor we ended up
+ // cloning other than the unswitched one.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ for (auto *SuccBB : successors(ParentBB)) {
+ if (SuccBB == UnswitchedSuccBB)
+ continue;
+
+ auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
+ if (!ClonedSuccBB)
+ continue;
+
+ ClonedSuccBB->removePredecessor(ClonedParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ ClonedParentBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+ // If there are duplicate entries in the PHI nodes because of multiple edges
+ // to the unswitched successor, we need to nuke all but one as we replaced it
+ // with a direct branch.
+ for (PHINode &PN : ClonedSuccBB->phis()) {
+ bool Found = false;
+ // Loop over the incoming operands backwards so we can easily delete as we
+ // go without invalidating the index.
+ for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != ClonedParentBB)
+ continue;
+ if (!Found) {
+ Found = true;
+ continue;
+ }
+ PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
+ }
+ }
+
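A hedged illustration of the duplicate-entry case handled above (hypothetical IR): two cases of the original switch target the same successor, so the cloned successor's PHIs briefly carry two entries for the cloned parent.

    //   switch i32 %x, label %other [ i32 0, label %succ
    //                                 i32 1, label %succ ]
    //   succ:  %p = phi i32 [ %a, %parent ], [ %a, %parent ], ...
    // Once the cloned switch is replaced by `br label %succ`, the cloned
    // block has a single edge into %succ, so exactly one entry may remain.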
+ // Record the domtree updates for the new blocks.
+ SmallPtrSet<BasicBlock *, 4> SuccSet;
+ for (auto *ClonedBB : NewBlocks) {
+ for (auto *SuccBB : successors(ClonedBB))
+ if (SuccSet.insert(SuccBB).second)
+ DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
+ SuccSet.clear();
+ }
+
return ClonedPH;
}
@@ -921,11 +1045,8 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
for (auto *BB : OrigL.blocks()) {
auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
ClonedL.addBlockEntry(ClonedBB);
- if (LI.getLoopFor(BB) == &OrigL) {
- assert(!LI.getLoopFor(ClonedBB) &&
- "Should not have an existing loop for this block!");
+ if (LI.getLoopFor(BB) == &OrigL)
LI.changeLoopFor(ClonedBB, &ClonedL);
- }
}
};
@@ -975,9 +1096,9 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
/// original loop, multiple cloned sibling loops may be created. All of them
/// are returned so that the newly introduced loop nest roots can be
/// identified.
-static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
- const ValueToValueMapTy &VMap, LoopInfo &LI,
- SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
Loop *ClonedL = nullptr;
auto *OrigPH = OrigL.getLoopPreheader();
@@ -1070,6 +1191,7 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
} else {
LI.addTopLevelLoop(ClonedL);
}
+ NonChildClonedLoops.push_back(ClonedL);
ClonedL->reserveBlocks(BlocksInClonedLoop.size());
// We don't want to just add the cloned loop blocks based on how we
@@ -1138,11 +1260,11 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
// matter as we're just trying to build up the map from inside-out; we use
// the map in a more stably ordered way below.
auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
- std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
- [&](BasicBlock *LHS, BasicBlock *RHS) {
- return ExitLoopMap.lookup(LHS)->getLoopDepth() <
- ExitLoopMap.lookup(RHS)->getLoopDepth();
- });
+ llvm::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
// Populate the existing ExitLoopMap with everything reachable from each
// exit, starting from the inner most exit.
@@ -1222,60 +1344,69 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
NonChildClonedLoops.push_back(cloneLoopNest(
*ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
}
+}
+
+static void
+deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
+ DominatorTree &DT) {
+ // Find all the dead clones, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ for (auto &VMap : VMaps)
+ if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
+ if (!DT.isReachableFromEntry(ClonedBB)) {
+ for (BasicBlock *SuccBB : successors(ClonedBB))
+ SuccBB->removePredecessor(ClonedBB);
+ DeadBlocks.push_back(ClonedBB);
+ }
- // Return the main cloned loop if any.
- return ClonedL;
+ // Drop any remaining references to break cycles.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->dropAllReferences();
+ // Erase them from the IR.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->eraseFromParent();
}
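The two-pass teardown above matters because dead IR may contain use-def cycles; a hedged sketch of the hazard a single pass would hit (names hypothetical):

    //   dead.header:  %p = phi i32 [ %x, %dead.latch ], [ 0, %dead.ph ]
    //   dead.latch:   %x = add i32 %p, 1
    //                 br label %dead.header
    // Erasing dead.header first would destroy %p while %x still uses it.
    // dropAllReferences() first nulls out every operand in all dead blocks,
    // making the later eraseFromParent() calls safe in any order.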
-static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI) {
- // Walk the dominator tree to build up the set of blocks we will delete here.
- // The order is designed to allow us to always delete bottom-up and avoid any
- // dangling uses.
- SmallSetVector<BasicBlock *, 16> DeadBlocks;
- DeadBlocks.insert(DeadSubtreeRoot);
- for (int i = 0; i < (int)DeadBlocks.size(); ++i)
- for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
- // FIXME: This assert should pass and that means we don't change nearly
- // as much below! Consider rewriting all of this to avoid deleting
- // blocks. They are always cloned before being deleted, and so instead
- // could just be moved.
- // FIXME: This in turn means that we might actually be more able to
- // update the domtree.
- assert((L.contains(ChildN->getBlock()) ||
- llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
- "Should never reach beyond the loop and exits when deleting!");
- DeadBlocks.insert(ChildN->getBlock());
+static void
+deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI) {
+ // Find all the dead blocks, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ if (!DT.isReachableFromEntry(BB)) {
+ for (BasicBlock *SuccBB : successors(BB))
+ SuccBB->removePredecessor(BB);
+ DeadBlocks.push_back(BB);
}
+ SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
+ DeadBlocks.end());
+
// Filter out the dead blocks from the exit blocks list so that it can be
// used in the caller.
llvm::erase_if(ExitBlocks,
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
-
- // Remove these blocks from their successors.
- for (auto *BB : DeadBlocks)
- for (BasicBlock *SuccBB : successors(BB))
- SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
// Walk from this loop up through its parents removing all of the dead blocks.
for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
for (auto *BB : DeadBlocks)
ParentL->getBlocksSet().erase(BB);
llvm::erase_if(ParentL->getBlocksVector(),
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
}
// Now delete the dead child loops. This raw delete will clear them
// recursively.
llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
- if (!DeadBlocks.count(ChildL->getHeader()))
+ if (!DeadBlockSet.count(ChildL->getHeader()))
return false;
assert(llvm::all_of(ChildL->blocks(),
[&](BasicBlock *ChildBB) {
- return DeadBlocks.count(ChildBB);
+ return DeadBlockSet.count(ChildBB);
}) &&
"If the child loop header is dead all blocks in the child loop must "
"be dead as well!");
@@ -1283,19 +1414,20 @@ static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
return true;
});
- // Remove the mappings for the dead blocks.
- for (auto *BB : DeadBlocks)
+ // Remove the loop mappings for the dead blocks and drop all the references
+ // from these blocks to others to handle cyclic references as we start
+ // deleting the blocks themselves.
+ for (auto *BB : DeadBlocks) {
+ // Check that the dominator tree has already been updated.
+ assert(!DT.getNode(BB) && "Should already have cleared domtree!");
LI.changeLoopFor(BB, nullptr);
-
- // Drop all the references from these blocks to others to handle cyclic
- // references as we start deleting the blocks themselves.
- for (auto *BB : DeadBlocks)
BB->dropAllReferences();
+ }
- for (auto *BB : llvm::reverse(DeadBlocks)) {
- DT.eraseNode(BB);
+ // Actually delete the blocks now that they've been fully unhooked from the
+ // IR.
+ for (auto *BB : DeadBlocks)
BB->eraseFromParent();
- }
}
/// Recompute the set of blocks in a loop after unswitching.
@@ -1343,14 +1475,15 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
if (LoopBlockSet.empty())
return LoopBlockSet;
- // Add the loop header to the set.
- LoopBlockSet.insert(Header);
-
// We found backedges, recurse through them to identify the loop blocks.
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+ // No need to walk past the header.
+ if (BB == Header)
+ continue;
+
// Because we know the inner loop structure remains valid we can use the
// loop structure to jump immediately across the entire nested loop.
// Further, because it is in loop simplified form, we can directly jump
@@ -1371,9 +1504,10 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
// Insert all of the blocks (other than those already present) into
- // the loop set. The only block we expect to already be in the set is
- // the one we used to find this loop as we immediately handle the
- // others the first time we encounter the loop.
+ // the loop set. We expect at least the block that led us to find the
+ // inner loop to be in the block set, but we may also have other loop
+ // blocks if they were already enqueued as predecessors of some other
+ // outer loop block.
for (auto *InnerBB : InnerL->blocks()) {
if (InnerBB == BB) {
assert(LoopBlockSet.count(InnerBB) &&
@@ -1381,9 +1515,7 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
}
- bool Inserted = LoopBlockSet.insert(InnerBB).second;
- (void)Inserted;
- assert(Inserted && "Should only insert an inner loop once!");
+ LoopBlockSet.insert(InnerBB);
}
// Add the preheader to the worklist so we will continue past the
@@ -1399,6 +1531,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
Worklist.push_back(Pred);
}
+ assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
+
// We've found all the blocks participating in the loop, return our completed
// set.
return LoopBlockSet;
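For orientation, a minimal sketch of the backedge walk this routine is built around (`Latches`, `Header`, and the result set are hypothetical locals); the real routine additionally jumps across intact inner loops through their preheaders:

    SmallPtrSet<BasicBlock *, 16> LoopBlockSet;
    SmallVector<BasicBlock *, 16> Worklist;
    for (BasicBlock *Latch : Latches)             // seed with backedge sources
      if (LoopBlockSet.insert(Latch).second)
        Worklist.push_back(Latch);
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == Header)
        continue;                                 // don't walk past the header
      for (BasicBlock *Pred : predecessors(BB))
        if (LoopBlockSet.insert(Pred).second)
          Worklist.push_back(Pred);
    }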
@@ -1646,32 +1780,58 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
} while (!DomWorklist.empty());
}
-/// Take an invariant branch that has been determined to be safe and worthwhile
-/// to unswitch despite being non-trivial to do so and perform the unswitch.
-///
-/// This directly updates the CFG to hoist the predicate out of the loop, and
-/// clone the necessary parts of the loop to maintain behavior.
-///
-/// It also updates both dominator tree and loopinfo based on the unswitching.
-///
-/// Once unswitching has been performed it runs the provided callback to report
-/// the new loops and no-longer valid loops to the caller.
-static bool unswitchInvariantBranch(
- Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- assert(L.isLoopInvariant(BI.getCondition()) &&
- "Can only unswitch an invariant branch condition!");
+static bool unswitchNontrivialInvariants(
+ Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants,
+ DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ auto *ParentBB = TI.getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
+
+ // We can only unswitch switches, conditional branches with an invariant
+ // condition, or conditional branches whose condition combines invariant
+ // inputs through an instruction.
+ assert((SI || BI->isConditional()) &&
+ "Can only unswitch switches and conditional branch!");
+ bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
+ if (FullUnswitch)
+ assert(Invariants.size() == 1 &&
+ "Cannot have other invariants with full unswitching!");
+ else
+ assert(isa<Instruction>(BI->getCondition()) &&
+ "Partial unswitching requires an instruction as the condition!");
+
+ // The unswitch direction and index of the cloned successor. When we are
+ // unswitching the entire condition, this can just be trivially chosen to
+ // unswitch towards `true`. However, when we are unswitching a set of
+ // invariants combined with `and` or `or`, the combining operation determines
+ // the best direction to unswitch: we want to unswitch the direction that will
+ // collapse the branch.
+ bool Direction = true;
+ int ClonedSucc = 0;
+ if (!FullUnswitch) {
+ if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
+ assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Only `or` and `and` instructions can combine invariants being "
+ "unswitched.");
+ Direction = false;
+ ClonedSucc = 1;
+ }
+ }
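A small, hedged demonstration of why the combining opcode fixes the direction: substituting the invariant collapses the condition for exactly one branch direction.

    // For an `and` graph:  cond = Inv && Var
    //   Inv == false  =>  cond == false for every Var  (collapses; unswitch false)
    //   Inv == true   =>  cond == Var                  (still varies)
    // Dually, an `or` graph collapses when Inv == true, so we unswitch true.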
- // Constant and BBs tracking the cloned and continuing successor.
- const int ClonedSucc = 0;
- auto *ParentBB = BI.getParent();
- auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
- auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
+ BasicBlock *RetainedSuccBB =
+ BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
+ SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
+ if (BI)
+ UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
+ else
+ for (auto Case : SI->cases())
+ if (Case.getCaseSuccessor() != RetainedSuccBB)
+ UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
- assert(UnswitchedSuccBB != ContinueSuccBB &&
- "Should not unswitch a branch that always goes to the same place!");
+ assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
+ "Should not unswitch the same successor we are retaining!");
// The branch should be in this exact loop. Any inner loop's invariant branch
// should be handled by unswitching that inner loop. The caller of this
@@ -1690,9 +1850,6 @@ static bool unswitchInvariantBranch(
if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
return false;
- SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
// Compute the parent loop now before we start hacking on things.
Loop *ParentL = L.getParentLoop();
@@ -1711,27 +1868,31 @@ static bool unswitchInvariantBranch(
OuterExitL = NewOuterExitL;
}
- // If the edge we *aren't* cloning in the unswitch (the continuing edge)
- // dominates its target, we can skip cloning the dominated region of the loop
- // and its exits. We compute this as a set of nodes to be skipped.
- SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
- if (ContinueSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
- })) {
- visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
- SkippedLoopAndExitBlocks.insert(BB);
- return true;
- });
+ // At this point, we're definitely going to unswitch something so invalidate
+ // any cached information in ScalarEvolution for the outer most loop
+ // containing an exit block and all nested loops.
+ if (SE) {
+ if (OuterExitL)
+ SE->forgetLoop(OuterExitL);
+ else
+ SE->forgetTopmostLoop(&L);
}
- // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
- // edge) dominates its target, we will end up with dead nodes in the original
- // loop and its exits that will need to be deleted. Here, we just retain that
- // the property holds and will compute the deleted set later.
- bool DeleteUnswitchedSucc =
- UnswitchedSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
+
+ // If the edge from this terminator to a successor dominates that successor,
+ // store a map from each block in its dominator subtree to it. With this one
+ // data structure we can tell, while cloning for a particular successor,
+ // whether a block is dominated by some *other* successor. We use this to
+ // significantly reduce cloning.
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
+ for (auto *SuccBB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
+ }))
+ visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
+ DominatingSucc[BB] = SuccBB;
+ return true;
});
// Split the preheader, so that we know that there is a safe place to insert
@@ -1742,52 +1903,162 @@ static bool unswitchInvariantBranch(
BasicBlock *SplitBB = L.getLoopPreheader();
BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
- // Keep a mapping for the cloned values.
- ValueToValueMapTy VMap;
+ // Keep track of the dominator tree updates needed.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+
+ // Clone the loop for each unswitched successor.
+ SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+ VMaps.reserve(UnswitchedSuccBBs.size());
+ SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
+ for (auto *SuccBB : UnswitchedSuccBBs) {
+ VMaps.emplace_back(new ValueToValueMapTy());
+ ClonedPHs[SuccBB] = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
+ DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI);
+ }
+
+ // The stitching of the branched code back together depends on whether we're
+ // doing full unswitching or not, with the exception that we always want to
+ // nuke the initial terminator placed in the split block.
+ SplitBB->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // First we need to unhook the successor relationship as we'll be replacing
+ // the terminator with a direct branch. This is much simpler for branches
+ // than switches so we handle those first.
+ if (BI) {
+ // Remove the parent as a predecessor of the unswitched successor.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
+ UnswitchedSuccBB->removePredecessor(ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
+ } else {
+ // Note that we actually want to remove the parent block as a predecessor
+ // of *every* case successor. The case successor is either unswitched,
+ // completely eliminating an edge from the parent to that successor, or it
+ // is a duplicate edge to the retained successor. The retained successor is
+ // always the default successor, and since we'll replace this with a direct
+ // branch we no longer need the duplicate entries in the PHI nodes.
+ assert(SI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ for (auto &Case : SI->cases())
+ Case.getCaseSuccessor()->removePredecessor(
+ ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
+ }
+
+ // Now that we've unhooked the successor relationship, splice the terminator
+ // from the original loop to the split.
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
+
+ // Now wire up the terminator to the preheaders.
+ if (BI) {
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ BI->setSuccessor(ClonedSucc, ClonedPH);
+ BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ } else {
+ assert(SI && "Must either be a branch or switch!");
+
+ // Walk the cases and directly update their successors.
+ SI->setDefaultDest(LoopPH);
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == RetainedSuccBB)
+ Case.setSuccessor(LoopPH);
+ else
+ Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back(
+ {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+ }
+
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(RetainedSuccBB, ParentBB);
+ } else {
+ assert(BI && "Only branches have partial unswitching.");
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ // When doing a partial unswitch, we have to do a bit more work to build up
+ // the branch in the split block.
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ }
- // Build the cloned blocks from the loop.
- auto *ClonedPH = buildClonedLoopBlocks(
- L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
- ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI);
+ // Apply the updates accumulated above to get an up-to-date dominator tree.
+ DT.applyUpdates(DTUpdates);
+
+ // Now that we have an accurate dominator tree, first delete the dead cloned
+ // blocks so that we can accurately build any cloned loops. It is important to
+ // not delete the blocks from the original loop yet because we still want to
+ // reference the original loop to understand the cloned loop's structure.
+ deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT);
// Build the cloned loop structure itself. This may be substantially
// different from the original structure due to the simplified CFG. This also
// handles inserting all the cloned blocks into the correct loops.
SmallVector<Loop *, 4> NonChildClonedLoops;
- Loop *ClonedL =
- buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
-
- // Remove the parent as a predecessor of the unswitched successor.
- UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
-
- // Now splice the branch from the original loop and use it to select between
- // the two loops.
- SplitBB->getTerminator()->eraseFromParent();
- SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
- BI.setSuccessor(ClonedSucc, ClonedPH);
- BI.setSuccessor(1 - ClonedSucc, LoopPH);
-
- // Create a new unconditional branch to the continuing block (as opposed to
- // the one cloned).
- BranchInst::Create(ContinueSuccBB, ParentBB);
-
- // Delete anything that was made dead in the original loop due to
- // unswitching.
- if (DeleteUnswitchedSucc)
- deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI);
+ for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
+ buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
+ // Now that our cloned loops have been built, we can update the original loop.
+ // First we delete the dead blocks from it and then we rebuild the loop
+ // structure taking these deletions into account.
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI);
SmallVector<Loop *, 4> HoistedLoops;
bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
- // This will have completely invalidated the dominator tree. We can't easily
- // bound how much is invalid because in some cases we will refine the
- // predecessor set of exit blocks of the loop which can move large unrelated
- // regions of code into a new subtree.
- //
- // FIXME: Eventually, we should use an incremental update utility that
- // leverages the existing information in the dominator tree (and potentially
- // the nature of the change) to more efficiently update things.
- DT.recalculate(*SplitBB->getParent());
+ // This transformation has a high risk of corrupting the dominator tree, and
+ // the steps below to rebuild loop structures will result in hard-to-debug
+ // errors in that case, so verify that the dominator tree is sane first.
+ // FIXME: Remove this when the bugs stop showing up and rely on existing
+ // verification steps.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ if (BI) {
+ // If we unswitched a branch which collapses the condition to a known
+ // constant we want to replace all the uses of the invariants within both
+ // the original and cloned blocks. We do this here so that we can use the
+ // now updated dominator tree to identify which side the users are on.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ ConstantInt *UnswitchedReplacement =
+ Direction ? ConstantInt::getTrue(BI->getContext())
+ : ConstantInt::getFalse(BI->getContext());
+ ConstantInt *ContinueReplacement =
+ Direction ? ConstantInt::getFalse(BI->getContext())
+ : ConstantInt::getTrue(BI->getContext());
+ for (Value *Invariant : Invariants)
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
+ UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI)
+ continue;
+
+ // Replace it with the 'continue' side if in the main loop body, and the
+ // unswitched if in the cloned blocks.
+ if (DT.dominates(LoopPH, UserI->getParent()))
+ U->set(ContinueReplacement);
+ else if (DT.dominates(ClonedPH, UserI->getParent()))
+ U->set(UnswitchedReplacement);
+ }
+ }
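A hedged note on the dominance test above: after the CFG edits, every retained-loop block is reachable only through LoopPH and every cloned block only through ClonedPH, so dominance cleanly classifies each user.

    // For a user block UB:
    //   DT.dominates(LoopPH, UB)    -> UB lies in the retained loop body
    //   DT.dominates(ClonedPH, UB)  -> UB lies in the cloned copy
    // Users dominated by neither (e.g. LCSSA PHIs in shared exit blocks) are
    // intentionally left untouched.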
// We can change which blocks are exit blocks of all the cloned sibling
// loops, the current loop, and any parent loops which shared exit blocks
@@ -1801,57 +2072,50 @@ static bool unswitchInvariantBranch(
// also need to cover any intervening loops. We add all of these loops to
// a list and sort them by loop depth to achieve this without updating
// unnecessary loops.
- auto UpdateLCSSA = [&](Loop &UpdateL) {
+ auto UpdateLoop = [&](Loop &UpdateL) {
#ifndef NDEBUG
- for (Loop *ChildL : UpdateL)
+ UpdateL.verifyLoop();
+ for (Loop *ChildL : UpdateL) {
+ ChildL->verifyLoop();
assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
"Perturbed a child loop's LCSSA form!");
+ }
#endif
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
formLCSSA(UpdateL, DT, &LI, nullptr);
+
+ // For loops reached by this loop's original exit blocks we may have
+ // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(&UpdateL, &DT, &LI, /*PreserveLCSSA*/ true);
};
// For non-child cloned loops and hoisted loops, we just need to update LCSSA
// and we can do it in any order as they don't nest relative to each other.
- for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
- UpdateLCSSA(*UpdatedL);
+ //
+ // Also check if any of the loops we have updated have become top-level loops
+ // as that will necessitate widening the outer loop scope.
+ for (Loop *UpdatedL :
+ llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
+ UpdateLoop(*UpdatedL);
+ if (!UpdatedL->getParentLoop())
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ UpdateLoop(L);
+ if (!L.getParentLoop())
+ OuterExitL = nullptr;
+ }
// If the original loop had exit blocks, walk up through the outer most loop
// of those exit blocks to update LCSSA and form updated dedicated exits.
- if (OuterExitL != &L) {
- SmallVector<Loop *, 4> OuterLoops;
- // We start with the cloned loop and the current loop if they are loops and
- // move toward OuterExitL. Also, if either the cloned loop or the current
- // loop have become top level loops we need to walk all the way out.
- if (ClonedL) {
- OuterLoops.push_back(ClonedL);
- if (!ClonedL->getParentLoop())
- OuterExitL = nullptr;
- }
- if (IsStillLoop) {
- OuterLoops.push_back(&L);
- if (!L.getParentLoop())
- OuterExitL = nullptr;
- }
- // Grab all of the enclosing loops now.
+ if (OuterExitL != &L)
for (Loop *OuterL = ParentL; OuterL != OuterExitL;
OuterL = OuterL->getParentLoop())
- OuterLoops.push_back(OuterL);
-
- // Finally, update our list of outer loops. This is nicely ordered to work
- // inside-out.
- for (Loop *OuterL : OuterLoops) {
- // First build LCSSA for this loop so that we can preserve it when
- // forming dedicated exits. We don't want to perturb some other loop's
- // LCSSA while doing that CFG edit.
- UpdateLCSSA(*OuterL);
-
- // For loops reached by this loop's original exit blocks we may
- // introduced new, non-dedicated exits. At least try to re-form dedicated
- // exits for these loops. This may fail if they couldn't have dedicated
- // exits to start with.
- formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
- }
- }
+ UpdateLoop(*OuterL);
#ifndef NDEBUG
// Verify the entire loop structure to catch any incorrect updates before we
@@ -1866,7 +2130,7 @@ static bool unswitchInvariantBranch(
for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
if (UpdatedL->getParentLoop() == ParentL)
SibLoops.push_back(UpdatedL);
- NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+ UnswitchCB(IsStillLoop, SibLoops);
++NumBranches;
return true;
@@ -1905,50 +2169,69 @@ computeDomSubtreeCost(DomTreeNode &N,
return Cost;
}
-/// Unswitch control flow predicated on loop invariant conditions.
-///
-/// This first hoists all branches or switches which are trivial (IE, do not
-/// require duplicating any part of the loop) out of the loop body. It then
-/// looks at other loop invariant control flows and tries to unswitch those as
-/// well by cloning the loop if the result is small enough.
static bool
-unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
- TargetTransformInfo &TTI, bool NonTrivial,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(L.isRecursivelyLCSSAForm(DT, LI) &&
- "Loops must be in LCSSA form before unswitching.");
- bool Changed = false;
+unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ // Collect all invariant conditions within this loop (as opposed to an inner
+ // loop which would be handled when visiting that inner loop).
+ SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4>
+ UnswitchCandidates;
+ for (auto *BB : L.blocks()) {
+ if (LI.getLoopFor(BB) != &L)
+ continue;
- // Must be in loop simplified form: we need a preheader and dedicated exits.
- if (!L.isLoopSimplifyForm())
- return false;
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ // We can only consider fully loop-invariant switch conditions as we need
+ // to completely eliminate the switch after unswitching.
+ if (!isa<Constant>(SI->getCondition()) &&
+ L.isLoopInvariant(SI->getCondition()))
+ UnswitchCandidates.push_back({SI, {SI->getCondition()}});
+ continue;
+ }
- // Try trivial unswitch first before loop over other basic blocks in the loop.
- Changed |= unswitchAllTrivialConditions(L, DT, LI);
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+ BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
- // If we're not doing non-trivial unswitching, we're done. We both accept
- // a parameter but also check a local flag that can be used for testing
- // a debugging.
- if (!NonTrivial && !EnableNonTrivialUnswitch)
- return Changed;
-
- // Collect all remaining invariant branch conditions within this loop (as
- // opposed to an inner loop which would be handled when visiting that inner
- // loop).
- SmallVector<TerminatorInst *, 4> UnswitchCandidates;
- for (auto *BB : L.blocks())
- if (LI.getLoopFor(BB) == &L)
- if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
- if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
- BI->getSuccessor(0) != BI->getSuccessor(1))
- UnswitchCandidates.push_back(BI);
+ if (L.isLoopInvariant(BI->getCondition())) {
+ UnswitchCandidates.push_back({BI, {BI->getCondition()}});
+ continue;
+ }
+
+ Instruction &CondI = *cast<Instruction>(BI->getCondition());
+ if (CondI.getOpcode() != Instruction::And &&
+ CondI.getOpcode() != Instruction::Or)
+ continue;
+
+ TinyPtrVector<Value *> Invariants =
+ collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
+ if (Invariants.empty())
+ continue;
+
+ UnswitchCandidates.push_back({BI, std::move(Invariants)});
+ }
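`collectHomogenousInstGraphLoopInvariants` is defined earlier in the patch and not visible in this hunk, so the following is only a plausible re-sketch of the walk it performs: descend through a graph of conditions built from a single opcode (`and`-only or `or`-only) and collect the loop-invariant operands a partial unswitch could hoist.

```cpp
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch only; the real helper may differ in details such as use counts.
static TinyPtrVector<Value *> collectInvariantLeaves(Loop &L,
                                                     Instruction &Root) {
  TinyPtrVector<Value *> Invariants;
  unsigned Opcode = Root.getOpcode(); // Instruction::And or Instruction::Or.
  SmallVector<Instruction *, 4> Worklist = {&Root};
  SmallPtrSet<Instruction *, 8> Visited;
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    if (!Visited.insert(I).second)
      continue;
    for (Value *Op : I->operands()) {
      if (L.isLoopInvariant(Op)) {
        Invariants.push_back(Op); // A hoistable leaf of the condition.
        continue;
      }
      // Only walk through the same opcode: a mixed and/or graph cannot be
      // unswitched on a single homogenous set of invariants.
      auto *OpI = dyn_cast<Instruction>(Op);
      if (OpI && OpI->getOpcode() == Opcode)
        Worklist.push_back(OpI);
    }
  }
  return Invariants;
}
```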
// If we didn't find any candidates, we're done.
if (UnswitchCandidates.empty())
- return Changed;
+ return false;
+
+ // Check if there are irreducible CFG cycles in this loop. If so, we cannot
+ // easily unswitch non-trivial edges out of the loop. Doing so might turn the
+ // irreducible control flow into reducible control flow and introduce new
+ // loops "out of thin air". If we ever discover important use cases for doing
+ // this, we can add support to loop unswitch, but it is a lot of complexity
+ // for what seems little or no real world benefit.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ return false;
- DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
- << " non-trivial loop invariant conditions for unswitching.\n");
+ LLVM_DEBUG(
+ dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
// Unswitching these terminators will require duplicating parts of the loop,
// so we need to be able to model that cost. Compute the ephemeral
@@ -1972,10 +2255,10 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
continue;
if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
- return Changed;
+ return false;
if (auto CS = CallSite(&I))
if (CS.isConvergent() || CS.cannotDuplicate())
- return Changed;
+ return false;
Cost += TTI.getUserCost(&I);
}
@@ -1984,7 +2267,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
assert(LoopCost >= 0 && "Must not have negative loop costs!");
BBCostMap[BB] = Cost;
}
- DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+ LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
// Now we find the best candidate by searching for the one with the following
// properties in order:
@@ -2003,8 +2286,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
// Given a terminator which might be unswitched, computes the non-duplicated
// cost for that terminator.
- auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
- BasicBlock &BB = *TI->getParent();
+ auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) {
+ BasicBlock &BB = *TI.getParent();
SmallPtrSet<BasicBlock *, 4> Visited;
int Cost = LoopCost;
@@ -2013,6 +2296,26 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
if (!Visited.insert(SuccBB).second)
continue;
+ // If this is a partial unswitch candidate, then it must be a conditional
+ // branch with a condition of either `or` or `and`. In that case, one of
+ // the successors is necessarily duplicated, so don't even try to remove
+ // its cost.
+ if (!FullUnswitch) {
+ auto &BI = cast<BranchInst>(TI);
+ if (cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And) {
+ if (SuccBB == BI.getSuccessor(1))
+ continue;
+ } else {
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Only `and` and `or` conditions can result in a partial "
+ "unswitch!");
+ if (SuccBB == BI.getSuccessor(0))
+ continue;
+ }
+ }
+
// This successor's domtree will not need to be duplicated after
// unswitching if the edge to the successor dominates it (and thus the
// entire tree). This essentially means there is no other path into this
@@ -2036,27 +2339,95 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
};
TerminatorInst *BestUnswitchTI = nullptr;
int BestUnswitchCost;
- for (TerminatorInst *CandidateTI : UnswitchCandidates) {
- int CandidateCost = ComputeUnswitchedCost(CandidateTI);
- DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " for unswitch candidate: " << *CandidateTI << "\n");
+ ArrayRef<Value *> BestUnswitchInvariants;
+ for (auto &TerminatorAndInvariants : UnswitchCandidates) {
+ TerminatorInst &TI = *TerminatorAndInvariants.first;
+ ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ int CandidateCost = ComputeUnswitchedCost(
+ TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
+ Invariants[0] == BI->getCondition()));
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << TI << "\n");
if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
- BestUnswitchTI = CandidateTI;
+ BestUnswitchTI = &TI;
BestUnswitchCost = CandidateCost;
+ BestUnswitchInvariants = Invariants;
}
}
- if (BestUnswitchCost < UnswitchThreshold) {
- DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
- << BestUnswitchCost << ") branch: " << *BestUnswitchTI
- << "\n");
- Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
- LI, AC, NonTrivialUnswitchCB);
- } else {
- DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
- << "\n");
+ if (BestUnswitchCost >= UnswitchThreshold) {
+ LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
+ << BestUnswitchCost << "\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
+ << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
+ << "\n");
+ return unswitchNontrivialInvariants(
+ L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE);
+}
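Stripped of the LLVM machinery, the candidate selection above is an argmin with a cutoff: every candidate gets a modeled duplication cost, the cheapest wins, and nothing happens unless it beats the threshold. (For a partial unswitch of an `and` the false successor, and of an `or` the true successor, stays in both loop copies, which is why ComputeUnswitchedCost refuses to discount it.) A tiny standalone model with purely illustrative names and numbers:

```cpp
#include <cstdio>
#include <vector>

struct Candidate {
  const char *Name;
  int Cost; // Stands in for ComputeUnswitchedCost's result.
};

int main() {
  std::vector<Candidate> Candidates = {
      {"branch on %inv1", 40}, {"switch on %inv2", 25}, {"partial and", 60}};
  const Candidate *Best = nullptr;
  for (const Candidate &C : Candidates)
    if (!Best || C.Cost < Best->Cost)
      Best = &C; // Argmin over modeled cost.
  const int UnswitchThreshold = 50; // The pass gates on a similar knob.
  if (Best && Best->Cost < UnswitchThreshold)
    std::printf("unswitch %s (cost %d)\n", Best->Name, Best->Cost);
  else
    std::printf("no candidate under threshold\n");
}
```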
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+///
+/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
+/// updated based on the unswitch.
+///
+/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
+/// true, we will attempt to do non-trivial unswitching as well as trivial
+/// unswitching.
+///
+/// The `UnswitchCB` callback provided will be run after unswitching is
+/// complete, with the first parameter set to `true` if the provided loop
+/// remains a loop, and a list of new sibling loops created.
+///
+/// If `SE` is non-null, we will update that analysis based on the unswitching
+/// done.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitching first, before looping over the other basic blocks in the loop.
+ if (unswitchAllTrivialConditions(L, DT, LI, SE)) {
+ // If we unswitched successfully we will want to clean up the loop before
+ // processing it further so just mark it as unswitched and return.
+ UnswitchCB(/*CurrentLoopValid*/ true, {});
+ return true;
}
// If we're not doing non-trivial unswitching, we're done. We accept a
// parameter but also check a local flag that can be used for testing
// and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return false;
+
+ // For non-trivial unswitching, because it often creates new loops, we rely on
+ // the pass manager to iterate on the loops rather than trying to immediately
+ // reach a fixed point. There is no substantial advantage to iterating
+ // internally, and if any of the new loops are simplified enough to contain
+ // trivial unswitching opportunities, we want to prefer those.
+
+ // Try to unswitch the best invariant condition. We prefer a full unswitch
+ // to a partial unswitch when both are possible below the cost threshold.
+ if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE))
+ return true;
+
+ // No other opportunities to unswitch.
return Changed;
}
@@ -2066,16 +2437,18 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
Function &F = *L.getHeader()->getParent();
(void)F;
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
+ << "\n");
// Save the current loop name in a variable so that we can report it even
// after it has been deleted.
std::string LoopName = L.getName();
- auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
- U.addSiblingLoops(NewLoops);
+ if (!NewLoops.empty())
+ U.addSiblingLoops(NewLoops);
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2085,15 +2458,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
U.markLoopAsDeleted(L, LoopName);
};
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
- NonTrivialUnswitchCB))
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
+ &AR.SE))
return PreservedAnalyses::all();
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- AR.DT.verifyDomTree();
-#endif
+ assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
return getLoopPassPreservedAnalyses();
}
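Replacing the `#ifndef NDEBUG` block with a bare assert is behavior-preserving because an assert's argument is not evaluated at all when NDEBUG is defined; since the dominator-tree verification has no side effects the pass needs, release builds still skip the whole check. A minimal demonstration of that property:

```cpp
#include <cassert>
#include <cstdio>

static bool expensiveVerify() {
  std::puts("verifying..."); // Never printed in an NDEBUG build.
  return true;
}

int main() {
  assert(expensiveVerify()); // Expands to a no-op when NDEBUG is defined.
#ifdef NDEBUG
  std::puts("release build: verification skipped entirely");
#endif
}
```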
@@ -2128,15 +2499,19 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
Function &F = *L->getHeader()->getParent();
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
+ << "\n");
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
for (auto *NewL : NewLoops)
LPM.addLoop(*NewL);
@@ -2150,18 +2525,16 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
LPM.markLoopAsDeleted(*L);
};
- bool Changed =
- unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
+ bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE);
// If anything was unswitched, also clear any cached information about this
// loop.
LPM.deleteSimpleAnalysisLoop(L);
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- DT.verifyDomTree();
-#endif
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
return Changed;
}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 1522170dc3b9..b7b1db76b492 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <utility>
using namespace llvm;
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index cfb8a062299f..ca6b93e0b4a9 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (isa<LoadInst>(Inst))
+ if (Inst->mayReadFromMemory())
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -187,11 +187,9 @@ static bool SinkInstruction(Instruction *Inst,
if (!SuccToSinkTo)
return false;
- DEBUG(dbgs() << "Sink" << *Inst << " (";
- Inst->getParent()->printAsOperand(dbgs(), false);
- dbgs() << " -> ";
- SuccToSinkTo->printAsOperand(dbgs(), false);
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
// Move the instruction.
Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
@@ -244,7 +242,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
do {
MadeChange = false;
- DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
// Process all basic blocks.
for (BasicBlock &I : F)
MadeChange |= ProcessBlock(I, DT, LI, AA);
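The driver above is a classic fixed-point loop: re-run the per-block transform until a complete pass over the function changes nothing. The same shape in a runnable miniature, with the transform replaced by a stand-in:

```cpp
#include <cstdio>
#include <vector>

// Stand-in for ProcessBlock: "sinks" by decrementing until the block is done.
static bool processBlock(int &Block) {
  if (Block > 0) {
    --Block;
    return true;
  }
  return false;
}

int main() {
  std::vector<int> Blocks = {2, 0, 1};
  bool MadeChange;
  unsigned NumSinkIter = 0;
  do {
    MadeChange = false;
    std::printf("Sinking iteration %u\n", NumSinkIter++);
    for (int &B : Blocks)
      MadeChange |= processBlock(B); // |= keeps iterating after any change.
  } while (MadeChange);
}
```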
diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index 23156d5a4d83..6743e19a7c92 100644
--- a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -64,7 +64,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// block. We should consider using actual post-dominance here in the
// future.
if (UI->getParent() != PhiBB) {
- DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
return false;
}
@@ -75,7 +75,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// probably change this to do at least a limited scan of the intervening
// instructions and allow handling stores in easily proven safe cases.
if (mayBeMemoryDependent(*UI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
return false;
}
@@ -126,8 +126,8 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// If when we directly test whether this is safe it fails, bail.
if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
mayBeMemoryDependent(*OpI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
- << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
+ << *OpI << "\n");
// Record the stack of instructions which reach this node as unsafe
// so we prune subsequent searches.
UnsafeSet.insert(OpI);
@@ -229,7 +229,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
- DEBUG(dbgs() << " Free: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
// No profit in free materialization.
return false;
}
@@ -237,7 +237,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Now check that the uses of this PHI can actually be speculated,
// otherwise we'll still have to materialize the PHI value.
if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
- DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
return false;
}
@@ -266,7 +266,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Assume we will commute the constant to the RHS to be canonical.
Idx = 1;
- // Get the intrinsic ID if this user is an instrinsic.
+ // Get the intrinsic ID if this user is an intrinsic.
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
IID = UserII->getIntrinsicID();
@@ -288,9 +288,13 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// just bail. We're only interested in cases where folding the incoming
// constants is at least break-even on all paths.
if (FoldedCost > MatCost) {
- DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
- " Materializing cost: " << MatCost << "\n"
- " Accumulated folded cost: " << FoldedCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+ << "\n"
+ " Materializing cost: "
+ << MatCost
+ << "\n"
+ " Accumulated folded cost: "
+ << FoldedCost << "\n");
return false;
}
}
@@ -310,8 +314,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
"less that its materialized cost, "
"the sum must be as well.");
- DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
- << ": " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
return true;
}
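The profitability rule above compares, per incoming edge, the cost of materializing the PHI'd constant against the accumulated cost of folding it into the speculated users, and bails unless every path at least breaks even. The arithmetic in a runnable miniature (all costs illustrative):

```cpp
#include <cstdio>

int main() {
  struct Edge { int MatCost, FoldedCost; };
  Edge Edges[] = {{4, 2}, {4, 3}}; // Two predecessors of the PHI.
  int TotalMat = 0, TotalFolded = 0;
  for (const Edge &E : Edges) {
    if (E.FoldedCost > E.MatCost) { // Must break even on *every* path.
      std::puts("not profitable: folding costs more on some path");
      return 0;
    }
    TotalMat += E.MatCost;
    TotalFolded += E.FoldedCost;
  }
  std::printf("cost savings: %d\n", TotalMat - TotalFolded); // 8 - 5 = 3.
}
```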
@@ -489,9 +493,13 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
// and zero out the cost of everything it depends on.
int CostSavings = CostSavingsMap.find(PN)->second;
if (SpecCost > CostSavings) {
- DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
- " Cost savings: " << CostSavings << "\n"
- " Speculation cost: " << SpecCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+ << "\n"
+ " Cost savings: "
+ << CostSavings
+ << "\n"
+ " Speculation cost: "
+ << SpecCost << "\n");
continue;
}
@@ -545,7 +553,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallSetVector<BasicBlock *, 16> &PredSet,
DominatorTree &DT) {
- DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
NumPHIsSpeculated += SpecPNs.size();
// Split any critical edges so that we have a block to hoist into.
@@ -558,8 +566,8 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
if (NewPredBB) {
++NumEdgesSplit;
- DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
SpecPreds.push_back(NewPredBB);
} else {
assert(PredBB->getSingleSuccessor() == ParentBB &&
@@ -593,14 +601,15 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
int NumSpecInsts = SpecList.size() * SpecPreds.size();
int NumRedundantInsts = NumSpecInsts - SpecList.size();
- DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
- << NumRedundantInsts << " redundancies\n");
+ LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+ << " speculated instructions, " << NumRedundantInsts
+ << " redundancies\n");
NumSpeculatedInstructions += NumSpecInsts;
NumNewRedundantInstructions += NumRedundantInsts;
// Each predecessor is numbered by its index in `SpecPreds`, so for each
// instruction we speculate, the speculated instruction is stored in that
- // index of the vector asosciated with the original instruction. We also
+ // index of the vector associated with the original instruction. We also
// store the incoming values for each predecessor from any PHIs used.
SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
@@ -716,7 +725,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
/// true when at least some speculation occurs.
static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
DominatorTree &DT, TargetTransformInfo &TTI) {
- DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+ LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
// Savings in cost from speculating around a PHI node.
SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
@@ -745,7 +754,7 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
PNs.end());
// If no PHIs were profitable, skip.
if (PNs.empty()) {
- DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
return false;
}
@@ -763,13 +772,13 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
// differently.
if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
isa<InvokeInst>(PredBB->getTerminator())) {
- DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+ << PredBB->getName() << "\n");
return false;
}
}
if (PredSet.size() < 2) {
- DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
return false;
}
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a7c308b59877..f5e1dd6ed850 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -62,7 +62,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
@@ -137,6 +137,7 @@ INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
@@ -151,8 +152,8 @@ namespace llvm {
bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
- DEBUG(dbgs() << "Not running SpeculativeExecution because "
- "TTI->hasBranchDivergence() is false.\n");
+ LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
return false;
}
@@ -251,7 +252,7 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
- SmallSet<const Instruction *, 8> NotHoisted;
+ SmallPtrSet<const Instruction *, 8> NotHoisted;
const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
for (Value* V : U->operand_values()) {
if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -314,6 +315,7 @@ PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
} // namespace llvm
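Both hunks make the same promise through the two pass-manager APIs: this pass hoists instructions but never changes the CFG, so CFG-keyed analyses (dominators, loops, and friends) stay valid. The new-PM half of that idiom in a hedged sketch, for a hypothetical pass of the same shape:

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Hypothetical pass: rewrites instructions in place, never edits the CFG.
struct NoCFGChangePass : PassInfoMixin<NoCFGChangePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    bool Changed = false;
    // ... in-place instruction rewrites only ...
    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>(); // CFG-keyed analyses survive.
    return PA;
  }
};
```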
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index ce40af1223f6..2061db13639a 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -80,7 +81,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <limits>
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 2972e1cff9a4..d650264176aa 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
@@ -55,6 +56,12 @@ static const char *const FlowBlockName = "Flow";
namespace {
+static cl::opt<bool> ForceSkipUniformRegions(
+ "structurizecfg-skip-uniform-regions",
+ cl::Hidden,
+ cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+ cl::init(false));
+
// Definition of the complex types used in this pass.
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -120,7 +127,7 @@ public:
bool resultIsRememberedBlock() { return ResultIsRemembered; }
};
-/// @brief Transforms the control flow graph on one single entry/exit region
+/// Transforms the control flow graph on one single entry/exit region
/// at a time.
///
/// After the transform all "If"/"Then"/"Else" style control flow looks like
@@ -176,6 +183,7 @@ class StructurizeCFG : public RegionPass {
Function *Func;
Region *ParentRegion;
+ DivergenceAnalysis *DA;
DominatorTree *DT;
LoopInfo *LI;
@@ -196,6 +204,9 @@ class StructurizeCFG : public RegionPass {
void orderNodes();
+ Loop *getAdjustedLoop(RegionNode *RN);
+ unsigned getAdjustedLoopDepth(RegionNode *RN);
+
void analyzeLoops(RegionNode *N);
Value *invert(Value *Condition);
@@ -242,8 +253,11 @@ class StructurizeCFG : public RegionPass {
public:
static char ID;
- explicit StructurizeCFG(bool SkipUniformRegions = false)
- : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ explicit StructurizeCFG(bool SkipUniformRegions_ = false)
+ : RegionPass(ID),
+ SkipUniformRegions(SkipUniformRegions_) {
+ if (ForceSkipUniformRegions.getNumOccurrences())
+ SkipUniformRegions = ForceSkipUniformRegions.getValue();
initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
@@ -278,7 +292,7 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
false, false)
-/// \brief Initialize the types and constants used in the pass
+/// Initialize the types and constants used in the pass
bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
LLVMContext &Context = R->getEntry()->getContext();
@@ -290,7 +304,27 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
return false;
}
-/// \brief Build up the general order of nodes
+/// Use the exit block to determine the loop if RN is a SubRegion.
+Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubRegion = RN->getNodeAs<Region>();
+ return LI->getLoopFor(SubRegion->getExit());
+ }
+
+ return LI->getLoopFor(RN->getEntry());
+}
+
+/// Use the exit block to determine the loop depth if RN is a SubRegion.
+unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubR = RN->getNodeAs<Region>();
+ return LI->getLoopDepth(SubR->getExit());
+ }
+
+ return LI->getLoopDepth(RN->getEntry());
+}
+
+/// Build up the general order of nodes
void StructurizeCFG::orderNodes() {
ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
@@ -299,16 +333,15 @@ void StructurizeCFG::orderNodes() {
// to what we want. The only problem with it is that sometimes backedges
// for outer loops will be visited before backedges for inner loops.
for (RegionNode *RN : RPOT) {
- BasicBlock *BB = RN->getEntry();
- Loop *Loop = LI->getLoopFor(BB);
+ Loop *Loop = getAdjustedLoop(RN);
++LoopBlocks[Loop];
}
unsigned CurrentLoopDepth = 0;
Loop *CurrentLoop = nullptr;
for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- BasicBlock *BB = (*I)->getEntry();
- unsigned LoopDepth = LI->getLoopDepth(BB);
+ RegionNode *RN = cast<RegionNode>(*I);
+ unsigned LoopDepth = getAdjustedLoopDepth(RN);
if (is_contained(Order, *I))
continue;
@@ -320,15 +353,14 @@ void StructurizeCFG::orderNodes() {
auto LoopI = I;
while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
LoopI++;
- BasicBlock *LoopBB = (*LoopI)->getEntry();
- if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
--BlockCount;
Order.push_back(*LoopI);
}
}
}
- CurrentLoop = LI->getLoopFor(BB);
+ CurrentLoop = getAdjustedLoop(RN);
if (CurrentLoop)
LoopBlocks[CurrentLoop]--;
@@ -343,7 +375,7 @@ void StructurizeCFG::orderNodes() {
std::reverse(Order.begin(), Order.end());
}
-/// \brief Determine the end of the loops
+/// Determine the end of the loops
void StructurizeCFG::analyzeLoops(RegionNode *N) {
if (N->isSubRegion()) {
// Test for exit as back edge
@@ -362,15 +394,16 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
}
-/// \brief Invert the given condition
+/// Invert the given condition
Value *StructurizeCFG::invert(Value *Condition) {
// First: Check if it's a constant
if (Constant *C = dyn_cast<Constant>(Condition))
return ConstantExpr::getNot(C);
// Second: If the condition is already inverted, return the original value
- if (match(Condition, m_Not(m_Value(Condition))))
- return Condition;
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
// Third: Check all the users for an invert
@@ -394,7 +427,7 @@ Value *StructurizeCFG::invert(Value *Condition) {
llvm_unreachable("Unhandled condition to invert");
}
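The NotCondition change fixes a subtle PatternMatch hazard: m_Value(X) stores into X while the match is still being attempted, so even a failed m_Not match can clobber the variable for the code that follows it. Binding into a fresh variable keeps the original intact on every path. The safe idiom in isolation:

```cpp
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns the operand of a `not` if Condition is one, else null. Condition
// itself is never clobbered, even when the match fails partway through.
static Value *invertedOperandOrNull(Value *Condition) {
  Value *NotCondition;
  if (match(Condition, m_Not(m_Value(NotCondition))))
    return NotCondition;
  return nullptr;
}
```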
-/// \brief Build the condition for one edge
+/// Build the condition for one edge
Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
bool Invert) {
Value *Cond = Invert ? BoolFalse : BoolTrue;
@@ -407,7 +440,7 @@ Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
return Cond;
}
-/// \brief Analyze the predecessors of each block and build up predicates
+/// Analyze the predecessors of each block and build up predicates
void StructurizeCFG::gatherPredicates(RegionNode *N) {
RegionInfo *RI = ParentRegion->getRegionInfo();
BasicBlock *BB = N->getEntry();
@@ -465,7 +498,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
}
}
-/// \brief Collect various loop and predicate infos
+/// Collect various loop and predicate infos
void StructurizeCFG::collectInfos() {
// Reset predicate
Predicates.clear();
@@ -478,10 +511,10 @@ void StructurizeCFG::collectInfos() {
Visited.clear();
for (RegionNode *RN : reverse(Order)) {
- DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << " Loop Depth: "
- << LI->getLoopDepth(RN->getEntry()) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
// Analyze all the conditions leading to a node
gatherPredicates(RN);
@@ -494,7 +527,7 @@ void StructurizeCFG::collectInfos() {
}
}
-/// \brief Insert the missing branch conditions
+/// Insert the missing branch conditions
void StructurizeCFG::insertConditions(bool Loops) {
BranchVector &Conds = Loops ? LoopConds : Conditions;
Value *Default = Loops ? BoolTrue : BoolFalse;
@@ -540,14 +573,11 @@ void StructurizeCFG::insertConditions(bool Loops) {
}
}
-/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// Remove all PHI values coming from "From" into "To" and remember
/// them in DeletedPhis
void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
PhiMap &Map = DeletedPhis[To];
- for (Instruction &I : *To) {
- if (!isa<PHINode>(I))
- break;
- PHINode &Phi = cast<PHINode>(I);
+ for (PHINode &Phi : To->phis()) {
while (Phi.getBasicBlockIndex(From) != -1) {
Value *Deleted = Phi.removeIncomingValue(From, false);
Map[&Phi].push_back(std::make_pair(From, Deleted));
@@ -555,19 +585,16 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
}
}
-/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+/// Add a dummy PHI value as soon as we know the new predecessor
void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
- for (Instruction &I : *To) {
- if (!isa<PHINode>(I))
- break;
- PHINode &Phi = cast<PHINode>(I);
+ for (PHINode &Phi : To->phis()) {
Value *Undef = UndefValue::get(Phi.getType());
Phi.addIncoming(Undef, From);
}
AddedPhis[To].push_back(From);
}
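Several hunks in this patch replace manual "scan instructions until the first non-PHI" loops with BasicBlock::phis(), a range covering exactly the block's leading PHI nodes. The rewritten addPhiValues above is the idiom in miniature; as a self-contained sketch:

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Give every PHI in To a placeholder incoming value for the new edge from
// From, exactly as the function above does.
static void addUndefIncoming(BasicBlock *From, BasicBlock *To) {
  for (PHINode &Phi : To->phis())
    Phi.addIncoming(UndefValue::get(Phi.getType()), From);
}
```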
-/// \brief Add the real PHI value as soon as everything is set up
+/// Add the real PHI value as soon as everything is set up
void StructurizeCFG::setPhiValues() {
SSAUpdater Updater;
for (const auto &AddedPhi : AddedPhis) {
@@ -607,7 +634,7 @@ void StructurizeCFG::setPhiValues() {
assert(DeletedPhis.empty());
}
-/// \brief Remove phi values from all successors and then remove the terminator.
+/// Remove phi values from all successors and then remove the terminator.
void StructurizeCFG::killTerminator(BasicBlock *BB) {
TerminatorInst *Term = BB->getTerminator();
if (!Term)
@@ -617,10 +644,12 @@ void StructurizeCFG::killTerminator(BasicBlock *BB) {
SI != SE; ++SI)
delPhiValues(BB, *SI);
+ if (DA)
+ DA->removeValue(Term);
Term->eraseFromParent();
}
-/// \brief Let node exit(s) point to NewExit
+/// Let node exit(s) point to NewExit
void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
bool IncludeDominator) {
if (Node->isSubRegion()) {
@@ -666,7 +695,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
}
}
-/// \brief Create a new flow node and update dominator tree and region info
+/// Create a new flow node and update dominator tree and region info
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
LLVMContext &Context = Func->getContext();
BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
@@ -678,7 +707,7 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
return Flow;
}
-/// \brief Create a new or reuse the previous node as flow node
+/// Create a new or reuse the previous node as flow node
BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
BasicBlock *Entry = PrevNode->getEntry();
@@ -697,7 +726,7 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
return Flow;
}
-/// \brief Returns the region exit if possible, otherwise just a new flow node
+/// Returns the region exit if possible, otherwise just a new flow node
BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
bool ExitUseAllowed) {
if (!Order.empty() || !ExitUseAllowed)
@@ -709,13 +738,13 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
return Exit;
}
-/// \brief Set the previous node
+/// Set the previous node
void StructurizeCFG::setPrevNode(BasicBlock *BB) {
PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
: nullptr;
}
-/// \brief Does BB dominate all the predicates of Node?
+/// Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
@@ -723,7 +752,7 @@ bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
});
}
-/// \brief Can we predict that this node will always be called?
+/// Can we predict that this node will always be called?
bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
bool Dominated = false;
@@ -851,7 +880,7 @@ void StructurizeCFG::createFlow() {
}
/// Handle a rare case where the disintegrated nodes' instructions
-/// no longer dominate all their uses. Not sure if this is really nessasary
+/// no longer dominate all their uses. Not sure if this is really necessary
void StructurizeCFG::rebuildSSA() {
SSAUpdater Updater;
for (BasicBlock *BB : ParentRegion->blocks())
@@ -884,30 +913,60 @@ void StructurizeCFG::rebuildSSA() {
}
}
-static bool hasOnlyUniformBranches(const Region *R,
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
const DivergenceAnalysis &DA) {
- for (const BasicBlock *BB : R->blocks()) {
- const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
+ for (auto E : R->elements()) {
+ if (!E->isSubRegion()) {
+ auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
- if (!DA.isUniform(Br->getCondition()))
- return false;
- DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ if (!DA.isUniform(Br))
+ return false;
+ LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+ << " has uniform terminator\n");
+ } else {
+ // Explicitly refuse to treat regions as uniform if they have non-uniform
+ // subregions. We cannot rely on DivergenceAnalysis for branches in
+ // subregions because those branches may have been removed and re-created,
+ // so we look for our metadata instead.
+ //
+ // Warning: It would be nice to treat regions as uniform based only on
+ // their direct child basic blocks' terminators, regardless of whether
+ // subregions are uniform or not. However, this requires a very careful
+ // look at SIAnnotateControlFlow to make sure nothing breaks there.
+ for (auto BB : E->getNodeAs<Region>()->blocks()) {
+ auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!Br->getMetadata(UniformMDKindID))
+ return false;
+ }
+ }
}
return true;
}
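The metadata handshake above works through a named metadata kind: an earlier invocation of the pass tags uniform terminators, and a later invocation over an enclosing region queries the tag rather than re-asking DivergenceAnalysis. The round-trip in isolation:

```cpp
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static void markUniform(Instruction *Term) {
  LLVMContext &Ctx = Term->getContext();
  unsigned KindID = Ctx.getMDKindID("structurizecfg.uniform");
  Term->setMetadata(KindID, MDNode::get(Ctx, {})); // Empty node as a flag.
}

static bool wasMarkedUniform(const Instruction *Term) {
  unsigned KindID =
      Term->getContext().getMDKindID("structurizecfg.uniform");
  return Term->getMetadata(KindID) != nullptr;
}
```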
-/// \brief Run the transformation for each region found
+/// Run the transformation for each region found
bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
if (R->isTopLevelRegion())
return false;
+ DA = nullptr;
+
if (SkipUniformRegions) {
// TODO: We could probably be smarter here with how we handle sub-regions.
- auto &DA = getAnalysis<DivergenceAnalysis>();
- if (hasOnlyUniformBranches(R, DA)) {
- DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+ // We currently rely on the fact that metadata is set by earlier invocations
+ // of the pass on sub-regions, and that this metadata doesn't get lost --
+ // but we shouldn't rely on metadata for correctness!
+ unsigned UniformMDKindID =
+ R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+ DA = &getAnalysis<DivergenceAnalysis>();
+
+ if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
+ LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
+ << '\n');
// Mark all direct child block terminators as having been treated as
// uniform. To account for a possible future in which non-uniform
@@ -919,7 +978,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
continue;
if (Instruction *Term = E->getEntry()->getTerminator())
- Term->setMetadata("structurizecfg.uniform", MD);
+ Term->setMetadata(UniformMDKindID, MD);
}
return false;
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 2a1106b41de2..f8cd6c17a5a6 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -87,7 +87,7 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of return duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-/// \brief Scan the specified function for alloca instructions.
+/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
@@ -302,7 +302,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
if (Visited[CI->getParent()] != ESCAPED) {
// If the escape point was part way through the block, calls after the
// escape point wouldn't have been put into DeferredTails.
- DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
CI->setTailCall();
Modified = true;
} else {
@@ -699,8 +699,8 @@ static bool foldReturnAndProcessPred(
BranchInst *BI = UncondBranchPreds.pop_back_val();
BasicBlock *Pred = BI->getParent();
if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
- DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
// Cleanup: if all predecessors of BB have been eliminated by
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 0f0668f24db5..e3ef42362223 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -69,7 +69,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include <utility>
using namespace llvm;
@@ -114,7 +114,7 @@ static bool shouldHaveDiscriminator(const Instruction *I) {
return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
}
-/// \brief Assign DWARF discriminators.
+/// Assign DWARF discriminators.
///
/// To assign discriminators, we examine the boundaries of every
/// basic block and its successors. Suppose there is a basic block B1
@@ -210,9 +210,9 @@ static bool addDiscriminators(Function &F) {
// it in 1 byte ULEB128 representation.
unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
- DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
Changed = true;
}
}
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 606bd8baccaa..516a785dce1e 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -36,7 +37,6 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -45,16 +45,22 @@
using namespace llvm;
-void llvm::DeleteDeadBlock(BasicBlock *BB) {
+void llvm::DeleteDeadBlock(BasicBlock *BB, DeferredDominance *DDT) {
assert((pred_begin(BB) == pred_end(BB) ||
// Can delete self loop.
BB->getSinglePredecessor() == BB) && "Block is not dead!");
TerminatorInst *BBTerm = BB->getTerminator();
+ std::vector<DominatorTree::UpdateType> Updates;
// Loop through all of our successors and make sure they know that one
// of their predecessors is going away.
- for (BasicBlock *Succ : BBTerm->successors())
+ if (DDT)
+ Updates.reserve(BBTerm->getNumSuccessors());
+ for (BasicBlock *Succ : BBTerm->successors()) {
Succ->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
// Zap all the instructions in the block.
while (!BB->empty()) {
@@ -69,8 +75,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
BB->getInstList().pop_back();
}
- // Zap the block!
- BB->eraseFromParent();
+ if (DDT) {
+ DDT->applyUpdates(Updates);
+ DDT->deleteBB(BB); // Deferred deletion of BB.
+ } else {
+ BB->eraseFromParent(); // Zap the block!
+ }
}
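DeferredDominance batches CFG changes as {Kind, From, To} triples, the same shape DominatorTree::applyUpdates consumes directly. A sketch of the batching idiom; note the records must describe the CFG as it looks after the IR edit, so they are collected before the terminator is destroyed and applied afterwards:

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include <vector>
using namespace llvm;

static void deleteBlockEdges(BasicBlock *BB, DominatorTree &DT) {
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(BB->getTerminator()->getNumSuccessors());
  for (BasicBlock *Succ : successors(BB)) // Record while edges still exist.
    Updates.push_back({DominatorTree::Delete, BB, Succ});
  // ... actually remove BB's outgoing edges in the IR here ...
  DT.applyUpdates(Updates); // One batched, deduplicated tree update.
}
```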
void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
@@ -94,9 +104,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
// or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
SmallVector<WeakTrackingVH, 8> PHIs;
- for (BasicBlock::iterator I = BB->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PHIs.push_back(PN);
+ for (PHINode &PN : BB->phis())
+ PHIs.push_back(&PN);
bool Changed = false;
for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
@@ -108,9 +117,12 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
LoopInfo *LI,
- MemoryDependenceResults *MemDep) {
- // Don't merge away blocks who have their address taken.
- if (BB->hasAddressTaken()) return false;
+ MemoryDependenceResults *MemDep,
+ DeferredDominance *DDT) {
+ assert(!(DT && DDT) && "Cannot call with both DT and DDT.");
+
+ if (BB->hasAddressTaken())
+ return false;
// Can't merge if there are multiple predecessors, or no predecessors.
BasicBlock *PredBB = BB->getUniquePredecessor();
@@ -122,39 +134,38 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
if (PredBB->getTerminator()->isExceptional())
return false;
- succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
- BasicBlock *OnlySucc = BB;
- for (; SI != SE; ++SI)
- if (*SI != OnlySucc) {
- OnlySucc = nullptr; // There are multiple distinct successors!
- break;
- }
-
- // Can't merge if there are multiple successors.
- if (!OnlySucc) return false;
+ // Can't merge if there are multiple distinct successors.
+ if (PredBB->getUniqueSuccessor() != BB)
+ return false;
// Can't merge if there is a PHI loop.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
- if (PHINode *PN = dyn_cast<PHINode>(BI)) {
- for (Value *IncValue : PN->incoming_values())
- if (IncValue == PN)
- return false;
- } else
- break;
- }
+ for (PHINode &PN : BB->phis())
+ for (Value *IncValue : PN.incoming_values())
+ if (IncValue == &PN)
+ return false;
// Begin by getting rid of unneeded PHIs.
- SmallVector<Value *, 4> IncomingValues;
+ SmallVector<AssertingVH<Value>, 4> IncomingValues;
if (isa<PHINode>(BB->front())) {
- for (auto &I : *BB)
- if (PHINode *PN = dyn_cast<PHINode>(&I)) {
- if (PN->getIncomingValue(0) != PN)
- IncomingValues.push_back(PN->getIncomingValue(0));
- } else
- break;
+ for (PHINode &PN : BB->phis())
+ if (!isa<PHINode>(PN.getIncomingValue(0)) ||
+ cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
+ IncomingValues.push_back(PN.getIncomingValue(0));
FoldSingleEntryPHINodes(BB, MemDep);
}
+ // Deferred DT update: Collect all the edges that exit BB. These
+ // dominator edges will be redirected from Pred.
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT) {
+ Updates.reserve(1 + (2 * succ_size(BB)));
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+ for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, BB, *I});
+ Updates.push_back({DominatorTree::Insert, PredBB, *I});
+ }
+ }
+
// Delete the unconditional branch from the predecessor...
PredBB->getInstList().pop_back();
@@ -166,8 +177,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
// Eliminate duplicate dbg.values describing the entry PHI node post-splice.
- for (auto *Incoming : IncomingValues) {
- if (isa<Instruction>(Incoming)) {
+ for (auto Incoming : IncomingValues) {
+ if (isa<Instruction>(*Incoming)) {
SmallVector<DbgValueInst *, 2> DbgValues;
SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2>
DbgValueSet;
@@ -201,7 +212,12 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
if (MemDep)
MemDep->invalidateCachedPredecessors();
- BB->eraseFromParent();
+ if (DDT) {
+ DDT->deleteBB(BB); // Deferred deletion of BB.
+ DDT->applyUpdates(Updates);
+ } else {
+ BB->eraseFromParent(); // Nuke BB.
+ }
return true;
}
@@ -317,13 +333,21 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
DominatorTree *DT, LoopInfo *LI,
bool PreserveLCSSA, bool &HasLoopExit) {
// Update dominator tree if available.
- if (DT)
- DT->splitBlock(NewBB);
+ if (DT) {
+ if (OldBB == DT->getRootNode()->getBlock()) {
+ assert(NewBB == &NewBB->getParent()->getEntryBlock());
+ DT->setNewRoot(NewBB);
+ } else {
+ // Split block expects NewBB to have a non-empty set of predecessors.
+ DT->splitBlock(NewBB);
+ }
+ }
// The rest of the logic is only relevant for updating the loop structures.
if (!LI)
return;
+ assert(DT && "DT should be available to update LoopInfo!");
Loop *L = LI->getLoopFor(OldBB);
// If we need to preserve loop analyses, collect some information about how
@@ -331,6 +355,12 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
bool IsLoopEntry = !!L;
bool SplitMakesNewLoopHeader = false;
for (BasicBlock *Pred : Preds) {
+ // Preds that are not reachable from entry should not be used to identify
+ // whether OldBB is a loop entry or whether SplitMakesNewLoopHeader holds.
+ // Unreachable blocks are not within any loops, so we would incorrectly
+ // mark SplitMakesNewLoopHeader as true and make NewBB the header of some
+ // loop. This breaks LI.
+ if (!DT->isReachableFromEntry(Pred))
+ continue;
// If we need to preserve LCSSA, determine if any of the preds is a loop
// exit.
if (PreserveLCSSA)
@@ -495,7 +525,6 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// Insert dummy values as the incoming value.
for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
- return NewBB;
}
// Update DominatorTree, LoopInfo, and LCCSA analysis information.
@@ -503,8 +532,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA,
HasLoopExit);
- // Update the PHI nodes in BB with the values coming from NewBB.
- UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ if (!Preds.empty()) {
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ }
+
return NewBB;
}
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 3653c307619b..3e30c27a9f33 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -28,7 +28,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
@@ -106,10 +106,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
// For each PHI in the destination block.
- for (BasicBlock::iterator I = DestBB->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- unsigned Idx = PN->getBasicBlockIndex(SplitBB);
- Value *V = PN->getIncomingValue(Idx);
+ for (PHINode &PN : DestBB->phis()) {
+ unsigned Idx = PN.getBasicBlockIndex(SplitBB);
+ Value *V = PN.getIncomingValue(Idx);
// If the input is a PHI which already satisfies LCSSA, don't create
// a new one.
@@ -119,13 +118,13 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
// Otherwise a new PHI is needed. Create one and populate it.
PHINode *NewPN = PHINode::Create(
- PN->getType(), Preds.size(), "split",
+ PN.getType(), Preds.size(), "split",
SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
for (unsigned i = 0, e = Preds.size(); i != e; ++i)
NewPN->addIncoming(V, Preds[i]);
// Update the original PHI.
- PN->setIncomingValue(Idx, NewPN);
+ PN.setIncomingValue(Idx, NewPN);
}
}
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index b60dfb4f3541..5f5c4150d3bb 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -105,12 +105,23 @@ static bool setRetNonNull(Function &F) {
return true;
}
+static bool setNonLazyBind(Function &F) {
+ if (F.hasFnAttribute(Attribute::NonLazyBind))
+ return false;
+ F.addFnAttr(Attribute::NonLazyBind);
+ return true;
+}
+
bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
LibFunc TheLibFunc;
if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
return false;
bool Changed = false;
+
+ if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
+ Changed |= setNonLazyBind(F);
+
switch (TheLibFunc) {
case LibFunc_strlen:
case LibFunc_wcslen:
@@ -375,6 +386,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_fseek:
case LibFunc_ftell:
case LibFunc_fgetc:
+ case LibFunc_fgetc_unlocked:
case LibFunc_fseeko:
case LibFunc_ftello:
case LibFunc_fileno:
@@ -393,6 +405,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F);
return Changed;
case LibFunc_fputc:
+ case LibFunc_fputc_unlocked:
case LibFunc_fstat:
case LibFunc_frexp:
case LibFunc_frexpf:
@@ -402,21 +415,25 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
return Changed;
case LibFunc_fgets:
+ case LibFunc_fgets_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
case LibFunc_fread:
+ case LibFunc_fread_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
return Changed;
case LibFunc_fwrite:
+ case LibFunc_fwrite_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
// FIXME: readonly #1?
return Changed;
case LibFunc_fputs:
+ case LibFunc_fputs_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
@@ -447,6 +464,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_gets:
case LibFunc_getchar:
+ case LibFunc_getchar_unlocked:
Changed |= setDoesNotThrow(F);
return Changed;
case LibFunc_getitimer:
@@ -485,6 +503,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_putc:
+ case LibFunc_putc_unlocked:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
@@ -505,6 +524,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_putchar:
+ case LibFunc_putchar_unlocked:
Changed |= setDoesNotThrow(F);
return Changed;
case LibFunc_popen:
@@ -687,9 +707,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setRetNonNull(F);
Changed |= setRetDoesNotAlias(F);
return Changed;
- //TODO: add LibFunc entries for:
- //case LibFunc_memset_pattern4:
- //case LibFunc_memset_pattern8:
+ // TODO: add LibFunc entries for:
+ // case LibFunc_memset_pattern4:
+ // case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 0);
@@ -709,6 +729,19 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
}
}
+bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
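
A usage sketch for the hasUnaryFloatFn helper added above, assuming its declaration lives in BuildLibCalls.h (the floor-specific wrapper is illustrative, not from this patch):

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/Type.h"
  #include "llvm/Transforms/Utils/BuildLibCalls.h"
  using namespace llvm;

  // Check whether a floor() variant of the right width exists for Ty.
  static bool canUseFloorFor(Type *Ty, const TargetLibraryInfo *TLI) {
    return hasUnaryFloatFn(TLI, Ty, LibFunc_floor, LibFunc_floorf,
                           LibFunc_floorl);
  }
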
//- Emit LibCalls ------------------------------------------------------------//
Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
@@ -973,6 +1006,24 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
return CI;
}
+Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputc_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(),
+ B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI);
+ Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
+ CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fputs))
@@ -991,6 +1042,24 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
return CI;
}
+Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputs_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutsUnlockedName = TLI->getName(LibFunc_fputs_unlocked);
+ Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
+ B.getInt8PtrTy(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fwrite))
@@ -1013,3 +1082,119 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
CI->setCallingConv(Fn->getCallingConv());
return CI;
}
+
+Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_malloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(*M->getFunction("malloc"), *TLI);
+ CallInst *CI = B.CreateCall(Malloc, Num, "malloc");
+
+ if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilder<> &B, const TargetLibraryInfo &TLI) {
+ if (!TLI.has(LibFunc_calloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+ PtrType, PtrType);
+ inferLibFuncAttributes(*M->getFunction("calloc"), TLI);
+ CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc");
+
+ if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fwrite_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FWriteUnlockedName = TLI->getName(LibFunc_fwrite_unlocked);
+ Constant *F = M->getOrInsertFunction(
+ FWriteUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fgetc_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F =
+ M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI);
+ CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
+ IRBuilder<> &B, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fgets_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F =
+ M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
+ inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI);
+ CallInst *CI =
+ B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fread_unlocked))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FReadUnlockedName = TLI->getName(LibFunc_fread_unlocked);
+ Constant *F = M->getOrInsertFunction(
+ FReadUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
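
The new emit*Unlocked helpers all return nullptr when the TargetLibraryInfo says the _unlocked variant is unavailable, so callers can fall back to the locked form. A minimal sketch (the fallback wrapper is not part of this patch):

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/BuildLibCalls.h"
  using namespace llvm;

  // Prefer fputc_unlocked when available; otherwise emit plain fputc.
  static Value *emitPutCPreferUnlocked(Value *Char, Value *File,
                                       IRBuilder<> &B,
                                       const TargetLibraryInfo *TLI) {
    if (Value *V = emitFPutCUnlocked(Char, File, B, TLI))
      return V;
    return emitFPutC(Char, File, B, TLI);
  }
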
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index f711b192f604..05512a6dff3e 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -34,7 +35,6 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -173,7 +173,7 @@ Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
return isDivisionOp() ? Value.Quotient : Value.Remainder;
}
-/// \brief Check if a value looks like a hash.
+/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common hash
/// algorithms. Typically, hash computations end with one of the following
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 972e47f9270a..c87b74f739f4 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -25,8 +25,10 @@ add_llvm_library(LLVMTransformUtils
LCSSA.cpp
LibCallsShrinkWrap.cpp
Local.cpp
+ LoopRotationUtils.cpp
LoopSimplify.cpp
LoopUnroll.cpp
+ LoopUnrollAndJam.cpp
LoopUnrollPeel.cpp
LoopUnrollRuntime.cpp
LoopUtils.cpp
@@ -43,10 +45,10 @@ add_llvm_library(LLVMTransformUtils
PromoteMemoryToRegister.cpp
StripGCRelocates.cpp
SSAUpdater.cpp
+ SSAUpdaterBulk.cpp
SanitizerStats.cpp
SimplifyCFG.cpp
SimplifyIndVar.cpp
- SimplifyInstructions.cpp
SimplifyLibCalls.cpp
SplitModule.cpp
StripNonLineTableDebugInfo.cpp
diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index 8825f77555e7..4d9c22e57a68 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -47,14 +47,11 @@ using namespace llvm;
///
static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
BasicBlock *MergeBlock) {
- for (auto &I : *Invoke->getNormalDest()) {
- auto *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- break;
- int Idx = Phi->getBasicBlockIndex(OrigBlock);
+ for (PHINode &Phi : Invoke->getNormalDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
if (Idx == -1)
continue;
- Phi->setIncomingBlock(Idx, MergeBlock);
+ Phi.setIncomingBlock(Idx, MergeBlock);
}
}
@@ -82,16 +79,13 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
BasicBlock *ThenBlock,
BasicBlock *ElseBlock) {
- for (auto &I : *Invoke->getUnwindDest()) {
- auto *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- break;
- int Idx = Phi->getBasicBlockIndex(OrigBlock);
+ for (PHINode &Phi : Invoke->getUnwindDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
if (Idx == -1)
continue;
- auto *V = Phi->getIncomingValue(Idx);
- Phi->setIncomingBlock(Idx, ThenBlock);
- Phi->addIncoming(V, ElseBlock);
+ auto *V = Phi.getIncomingValue(Idx);
+ Phi.setIncomingBlock(Idx, ThenBlock);
+ Phi.addIncoming(V, ElseBlock);
}
}
@@ -395,12 +389,14 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
// Inspect the arguments of the call site. If an argument's type doesn't
// match the corresponding formal argument's type in the callee, bitcast it
// to the correct type.
- for (Use &U : CS.args()) {
- unsigned ArgNo = CS.getArgumentNo(&U);
- Type *FormalTy = Callee->getFunctionType()->getParamType(ArgNo);
- Type *ActualTy = U.get()->getType();
+ auto CalleeType = Callee->getFunctionType();
+ auto CalleeParamNum = CalleeType->getNumParams();
+ for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
+ auto *Arg = CS.getArgument(ArgNo);
+ Type *FormalTy = CalleeType->getParamType(ArgNo);
+ Type *ActualTy = Arg->getType();
if (FormalTy != ActualTy) {
- auto *Cast = CastInst::Create(Instruction::BitCast, U.get(), FormalTy, "",
+ auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",
CS.getInstruction());
CS.setArgument(ArgNo, Cast);
}
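
The loop above iterates the callee's formal parameters rather than every call-site operand, so surplus actual arguments are simply skipped. The same coercion, distilled into a standalone sketch over a direct CallInst (names are illustrative):

  #include "llvm/IR/Function.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Bitcast each actual argument whose type differs from the callee's
  // formal parameter type, inserting the cast right before the call.
  static void coerceArguments(CallInst *Call, Function *Callee) {
    FunctionType *FTy = Callee->getFunctionType();
    for (unsigned I = 0, E = FTy->getNumParams(); I != E; ++I) {
      Value *Arg = Call->getArgOperand(I);
      Type *FormalTy = FTy->getParamType(I);
      if (Arg->getType() != FormalTy)
        Call->setArgOperand(I, CastInst::Create(Instruction::BitCast, Arg,
                                                FormalTy, "", Call));
    }
  }
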
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 3b19ba1b50f2..61448e9acb57 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -31,7 +32,6 @@
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
using namespace llvm;
@@ -43,44 +43,36 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
DebugInfoFinder *DIFinder) {
DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
- if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+ if (BB->hasName())
+ NewBB->setName(BB->getName() + NameSuffix);
bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
Module *TheModule = F ? F->getParent() : nullptr;
// Loop over all instructions, and copy them over.
- for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
- II != IE; ++II) {
-
- if (DIFinder && TheModule) {
- if (auto *DDI = dyn_cast<DbgDeclareInst>(II))
- DIFinder->processDeclare(*TheModule, DDI);
- else if (auto *DVI = dyn_cast<DbgValueInst>(II))
- DIFinder->processValue(*TheModule, DVI);
+ for (const Instruction &I : *BB) {
+ if (DIFinder && TheModule)
+ DIFinder->processInstruction(*TheModule, I);
- if (auto DbgLoc = II->getDebugLoc())
- DIFinder->processLocation(*TheModule, DbgLoc.get());
- }
-
- Instruction *NewInst = II->clone();
- if (II->hasName())
- NewInst->setName(II->getName()+NameSuffix);
+ Instruction *NewInst = I.clone();
+ if (I.hasName())
+ NewInst->setName(I.getName() + NameSuffix);
NewBB->getInstList().push_back(NewInst);
- VMap[&*II] = NewInst; // Add instruction map to value.
+ VMap[&I] = NewInst; // Add instruction map to value.
- hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (isa<ConstantInt>(AI->getArraySize()))
hasStaticAllocas = true;
else
hasDynamicAllocas = true;
}
}
-
+
if (CodeInfo) {
CodeInfo->ContainsCalls |= hasCalls;
CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
BB != &BB->getParent()->getEntryBlock();
}
return NewBB;
@@ -175,7 +167,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// Create a new basic block and copy instructions into it!
BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
- SP ? &DIFinder : nullptr);
+ ModuleLevelChanges ? &DIFinder : nullptr);
// Add basic block mapping.
VMap[&BB] = CBB;
@@ -197,15 +189,15 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
Returns.push_back(RI);
}
- for (DISubprogram *ISP : DIFinder.subprograms()) {
- if (ISP != SP) {
+ for (DISubprogram *ISP : DIFinder.subprograms())
+ if (ISP != SP)
VMap.MD()[ISP].reset(ISP);
- }
- }
- for (auto *Type : DIFinder.types()) {
+ for (DICompileUnit *CU : DIFinder.compile_units())
+ VMap.MD()[CU].reset(CU);
+
+ for (DIType *Type : DIFinder.types())
VMap.MD()[Type].reset(Type);
- }
// Loop over all of the instructions in the function, fixing up operand
// references as we go. This uses VMap to do all the hard work.
@@ -283,7 +275,7 @@ namespace {
/// The specified block is found to be reachable, clone it and
/// anything that it can reach.
- void CloneBlock(const BasicBlock *BB,
+ void CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
std::vector<const BasicBlock*> &ToClone);
};
@@ -493,17 +485,13 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Handle PHI nodes specially, as we have to remove references to dead
// blocks.
- for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) {
+ for (const PHINode &PN : BI.phis()) {
// PHI nodes may have been remapped to non-PHI nodes by the caller or
// during the cloning process.
- if (const PHINode *PN = dyn_cast<PHINode>(I)) {
- if (isa<PHINode>(VMap[PN]))
- PHIToResolve.push_back(PN);
- else
- break;
- } else {
+ if (isa<PHINode>(VMap[&PN]))
+ PHIToResolve.push_back(&PN);
+ else
break;
- }
}
// Finally, remap the terminator instructions, as those can't be remapped
@@ -550,7 +538,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// phi nodes will have invalid entries. Update the PHI nodes in this
// case.
PHINode *PN = cast<PHINode>(NewBB->begin());
- NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+ NumPreds = pred_size(NewBB);
if (NumPreds != PN->getNumIncomingValues()) {
assert(NumPreds < PN->getNumIncomingValues());
// Count how many times each predecessor comes to this block.
@@ -722,7 +710,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
}
-/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.
void llvm::remapInstructionsInBlocks(
const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
// Rewrite the code to refer to itself.
@@ -732,7 +720,7 @@ void llvm::remapInstructionsInBlocks(
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
}
-/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
/// Blocks.
///
/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
@@ -796,12 +784,13 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
return NewLoop;
}
-/// \brief Duplicate non-Phi instructions from the beginning of block up to
+/// Duplicate non-Phi instructions from the beginning of block up to
/// StopAt instruction into a split block between BB and its predecessor.
BasicBlock *
llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
Instruction *StopAt,
- ValueToValueMapTy &ValueMapping) {
+ ValueToValueMapTy &ValueMapping,
+ DominatorTree *DT) {
// We are going to have to map operands from the original BB block to the new
// copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
// account for entry from PredBB.
@@ -809,13 +798,15 @@ llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
- BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ BasicBlock *NewBB = SplitEdge(PredBB, BB, DT);
NewBB->setName(PredBB->getName() + ".split");
Instruction *NewTerm = NewBB->getTerminator();
// Clone the non-phi instructions of BB into NewBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
- for (; StopAt != &*BI; ++BI) {
+ // Stop once we see the terminator too. This covers the case where BB's
+ // terminator gets replaced and StopAt == BB's terminator.
+ for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
Instruction *New = BI->clone();
New->setName(BI->getName());
New->insertBefore(NewTerm);
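
DuplicateInstructionsInSplitBetween gains an optional DominatorTree parameter that it forwards to SplitEdge. A usage sketch under that assumption (the wrapper is illustrative):

  #include "llvm/IR/Dominators.h"
  #include "llvm/Transforms/Utils/Cloning.h"
  using namespace llvm;

  // Duplicate BB's non-PHI prefix up to StopAt into a fresh block on the
  // PredBB -> BB edge, keeping the dominator tree up to date.
  static BasicBlock *duplicatePrefix(BasicBlock *BB, BasicBlock *PredBB,
                                     Instruction *StopAt, DominatorTree &DT) {
    ValueToValueMapTy VMap;
    return DuplicateInstructionsInSplitBetween(BB, PredBB, StopAt, VMap, &DT);
  }
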
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 8fee10854229..35c7511a24b9 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -32,33 +32,34 @@ static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
/// copies of global variables and functions, and making their (initializers and
/// references, respectively) refer to the right globals.
///
-std::unique_ptr<Module> llvm::CloneModule(const Module *M) {
+std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
// Create the value map that maps things from the old module over to the new
// module.
ValueToValueMapTy VMap;
return CloneModule(M, VMap);
}
-std::unique_ptr<Module> llvm::CloneModule(const Module *M,
+std::unique_ptr<Module> llvm::CloneModule(const Module &M,
ValueToValueMapTy &VMap) {
return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
}
std::unique_ptr<Module> llvm::CloneModule(
- const Module *M, ValueToValueMapTy &VMap,
+ const Module &M, ValueToValueMapTy &VMap,
function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
// First off, we need to create the new module.
std::unique_ptr<Module> New =
- llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());
- New->setDataLayout(M->getDataLayout());
- New->setTargetTriple(M->getTargetTriple());
- New->setModuleInlineAsm(M->getModuleInlineAsm());
-
+ llvm::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
+ New->setSourceFileName(M.getSourceFileName());
+ New->setDataLayout(M.getDataLayout());
+ New->setTargetTriple(M.getTargetTriple());
+ New->setModuleInlineAsm(M.getModuleInlineAsm());
+
// Loop over all of the global variables, making corresponding globals in the
// new module. Here we add them to the VMap and to the new Module. We
// don't worry about attributes or initializers, they will come later.
//
- for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
GlobalVariable *GV = new GlobalVariable(*New,
I->getValueType(),
@@ -72,7 +73,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// Loop over the functions in the module, making external functions as before
- for (const Function &I : *M) {
+ for (const Function &I : M) {
Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),
I.getLinkage(), I.getName(), New.get());
NF->copyAttributesFrom(&I);
@@ -80,7 +81,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// Loop over the aliases in the module
- for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
if (!ShouldCloneDefinition(&*I)) {
// An alias cannot act as an external reference, so we need to create
@@ -114,7 +115,7 @@ std::unique_ptr<Module> llvm::CloneModule(
// have been created, loop through and copy the global variable referrers
// over... We also set the attributes on the global now.
//
- for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
if (I->isDeclaration())
continue;
@@ -139,7 +140,7 @@ std::unique_ptr<Module> llvm::CloneModule(
// Similarly, copy over function bodies now...
//
- for (const Function &I : *M) {
+ for (const Function &I : M) {
if (I.isDeclaration())
continue;
@@ -169,7 +170,7 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// And aliases
- for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
// We already dealt with undefined aliases above.
if (!ShouldCloneDefinition(&*I))
@@ -180,8 +181,9 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// And named metadata....
- for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
- E = M->named_metadata_end(); I != E; ++I) {
+ for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; ++I) {
const NamedMDNode &NMD = *I;
NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
@@ -194,7 +196,7 @@ std::unique_ptr<Module> llvm::CloneModule(
extern "C" {
LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
- return wrap(CloneModule(unwrap(M)).release());
+ return wrap(CloneModule(*unwrap(M)).release());
}
}
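
CloneModule now takes the module by reference, which the C API shim above adapts to with a single dereference. A sketch of the selective-clone overload (the predicate shown is an example, not from this patch):

  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/Utils/Cloning.h"
  #include <memory>
  using namespace llvm;

  // Clone only globals that have a definition; everything else is emitted
  // as an external declaration in the new module.
  static std::unique_ptr<Module> cloneDefinedOnly(const Module &M) {
    ValueToValueMapTy VMap;
    return CloneModule(M, VMap, [](const GlobalValue *GV) {
      return !GV->isDeclaration();
    });
  }
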
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 7a404241cb14..f31dab9f96af 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -66,6 +66,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "code-extractor"
@@ -77,12 +78,10 @@ static cl::opt<bool>
AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
cl::desc("Aggregate arguments to code-extracted functions"));
-/// \brief Test whether a block is valid for extraction.
-bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
- bool AllowVarArgs) {
- // Landing pads must be in the function where they were inserted for cleanup.
- if (BB.isEHPad())
- return false;
+/// Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB,
+ const SetVector<BasicBlock *> &Result,
+ bool AllowVarArgs, bool AllowAlloca) {
// taking the address of a basic block moved to another function is illegal
if (BB.hasAddressTaken())
return false;
@@ -111,11 +110,63 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
}
}
- // Don't hoist code containing allocas or invokes. If explicitly requested,
- // allow vastart.
+ // If explicitly requested, allow vastart and alloca. For invoke
+ // instructions, verify that extraction is valid.
for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
- if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
- return false;
+ if (isa<AllocaInst>(I)) {
+ if (!AllowAlloca)
+ return false;
+ continue;
+ }
+
+ if (const auto *II = dyn_cast<InvokeInst>(I)) {
+ // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
+ // must be a part of the subgraph which is being extracted.
+ if (auto *UBB = II->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ // All catch handlers of a catchswitch instruction as well as the unwind
+ // destination must be in the subgraph.
+ if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
+ if (auto *UBB = CSI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ for (auto *HBB : CSI->handlers())
+ if (!Result.count(const_cast<BasicBlock*>(HBB)))
+ return false;
+ continue;
+ }
+
+ // Make sure that the entire catch handler is within the subgraph. It is
+ // sufficient to check that the catch return's block is in the list.
+ if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+
+ // And do similar checks for the cleanup handler - the entire handler must
+ // be in the subgraph which is going to be extracted. For a cleanup return,
+ // we should additionally check that the unwind destination is also in the
+ // subgraph.
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
+ if (auto *UBB = CRI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
if (const CallInst *CI = dyn_cast<CallInst>(I))
if (const Function *F = CI->getCalledFunction())
if (F->getIntrinsicID() == Intrinsic::vastart) {
@@ -129,10 +180,10 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,
return true;
}
-/// \brief Build a set of blocks to extract if the input blocks are viable.
+/// Build a set of blocks to extract if the input blocks are viable.
static SetVector<BasicBlock *>
buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AllowVarArgs) {
+ bool AllowVarArgs, bool AllowAlloca) {
assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
SetVector<BasicBlock *> Result;
@@ -145,32 +196,42 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
if (!Result.insert(BB))
llvm_unreachable("Repeated basic blocks in extraction input");
- if (!CodeExtractor::isBlockValidForExtraction(*BB, AllowVarArgs)) {
- Result.clear();
- return Result;
- }
}
-#ifndef NDEBUG
- for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()),
- E = Result.end();
- I != E; ++I)
- for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I);
- PI != PE; ++PI)
- assert(Result.count(*PI) &&
- "No blocks in this region may have entries from outside the region"
- " except for the first block!");
-#endif
+ for (auto *BB : Result) {
+ if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
+ return {};
+
+ // Make sure that the first block is not a landing pad.
+ if (BB == Result.front()) {
+ if (BB->isEHPad()) {
+ LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
+ return {};
+ }
+ continue;
+ }
+
+ // All blocks other than the first must not have predecessors outside of
+ // the subgraph which is being extracted.
+ for (auto *PBB : predecessors(BB))
+ if (!Result.count(PBB)) {
+ LLVM_DEBUG(
+ dbgs() << "No blocks in this region may have entries from "
+ "outside the region except for the first block!\n");
+ return {};
+ }
+ }
return Result;
}
CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
bool AggregateArgs, BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, bool AllowVarArgs)
+ BranchProbabilityInfo *BPI, bool AllowVarArgs,
+ bool AllowAlloca)
: DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
BPI(BPI), AllowVarArgs(AllowVarArgs),
- Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs)) {}
+ Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)) {}
CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
BlockFrequencyInfo *BFI,
@@ -178,7 +239,8 @@ CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
: DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
BPI(BPI), AllowVarArgs(false),
Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
- /* AllowVarArgs */ false)) {}
+ /* AllowVarArgs */ false,
+ /* AllowAlloca */ false)) {}
/// definedInRegion - Return true if the specified value is defined in the
/// extracted region.
@@ -562,8 +624,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
BasicBlock *newHeader,
Function *oldFunction,
Module *M) {
- DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
- DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
// This function returns unsigned, outputs will go back by reference.
switch (NumExitBlocks) {
@@ -577,20 +639,20 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// Add the types of the input values to the function's argument list
for (Value *value : inputs) {
- DEBUG(dbgs() << "value used in func: " << *value << "\n");
+ LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
paramTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
for (Value *output : outputs) {
- DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+ LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
if (AggregateArgs)
paramTy.push_back(output->getType());
else
paramTy.push_back(PointerType::getUnqual(output->getType()));
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "Function type: " << *RetTy << " f(";
for (Type *i : paramTy)
dbgs() << *i << ", ";
@@ -620,16 +682,89 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
if (oldFunction->hasUWTable())
newFunction->setHasUWTable();
- // Inherit all of the target dependent attributes.
+ // Inherit all of the target dependent attributes and white-listed
+ // target independent attributes.
// (e.g. If the extracted region contains a call to an x86.sse
// instruction we need to make sure that the extracted region has the
// "target-features" attribute allowing it to be lowered.
// FIXME: This should be changed to check to see if a specific
// attribute can not be inherited.
- AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
- for (const auto &Attr : AB.td_attrs())
- newFunction->addFnAttr(Attr.first, Attr.second);
+ for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute()) {
+ if (Attr.getKindAsString() == "thunk")
+ continue;
+ } else
+ switch (Attr.getKindAsEnum()) {
+ // Those attributes cannot be propagated safely. Explicitly list them
+ // here so we get a warning if new attributes are added. This list also
+ // includes non-function attributes.
+ case Attribute::Alignment:
+ case Attribute::AllocSize:
+ case Attribute::ArgMemOnly:
+ case Attribute::Builtin:
+ case Attribute::ByVal:
+ case Attribute::Convergent:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::JumpTable:
+ case Attribute::Naked:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoBuiltin:
+ case Attribute::NoCapture:
+ case Attribute::NoReturn:
+ case Attribute::None:
+ case Attribute::NonNull:
+ case Attribute::ReadNone:
+ case Attribute::ReadOnly:
+ case Attribute::Returned:
+ case Attribute::ReturnsTwice:
+ case Attribute::SExt:
+ case Attribute::Speculatable:
+ case Attribute::StackAlignment:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::WriteOnly:
+ case Attribute::ZExt:
+ case Attribute::EndAttrKinds:
+ continue;
+ // Those attributes should be safe to propagate to the extracted function.
+ case Attribute::AlwaysInline:
+ case Attribute::Cold:
+ case Attribute::NoRecurse:
+ case Attribute::InlineHint:
+ case Attribute::MinSize:
+ case Attribute::NoDuplicate:
+ case Attribute::NoImplicitFloat:
+ case Attribute::NoInline:
+ case Attribute::NonLazyBind:
+ case Attribute::NoRedZone:
+ case Attribute::NoUnwind:
+ case Attribute::OptForFuzzing:
+ case Attribute::OptimizeNone:
+ case Attribute::OptimizeForSize:
+ case Attribute::SafeStack:
+ case Attribute::ShadowCallStack:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeMemory:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeHWAddress:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::StrictFP:
+ case Attribute::UWTable:
+ case Attribute::NoCfCheck:
+ break;
+ }
+ newFunction->addFnAttr(Attr);
+ }
newFunction->getBasicBlockList().push_back(newRootNode);
// Create an iterator to name all of the arguments we inserted.
@@ -1093,10 +1228,10 @@ Function *CodeExtractor::extractCodeRegion() {
// Update the entry count of the function.
if (BFI) {
- Optional<uint64_t> EntryCount =
- BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
- if (EntryCount.hasValue())
- newFunction->setEntryCount(EntryCount.getValue());
+ auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (Count.hasValue())
+ newFunction->setEntryCount(
+ ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
}
@@ -1104,6 +1239,10 @@ Function *CodeExtractor::extractCodeRegion() {
moveCodeToFunction(newFunction);
+ // Propagate personality info to the new function if there is one.
+ if (oldFunction->hasPersonalityFn())
+ newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
+
// Update the branch weights for the exit block.
if (BFI && NumExitBlocks > 1)
calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
@@ -1139,7 +1278,7 @@ Function *CodeExtractor::extractCodeRegion() {
}
}
- DEBUG(if (verifyFunction(*newFunction))
- report_fatal_error("verifyFunction failed!"));
+ LLVM_DEBUG(if (verifyFunction(*newFunction))
+ report_fatal_error("verifyFunction failed!"));
return newFunction;
}
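
With the new AllowAlloca flag, clients can outline regions that contain static allocas. A construction sketch against the updated constructor (the wrapper is illustrative):

  #include "llvm/IR/Dominators.h"
  #include "llvm/Transforms/Utils/CodeExtractor.h"
  using namespace llvm;

  // Blocks[0] must be the single-entry header of the region.
  static Function *extractRegion(ArrayRef<BasicBlock *> Blocks,
                                 DominatorTree &DT) {
    CodeExtractor CE(Blocks, &DT, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                     /*BPI=*/nullptr, /*AllowVarArgs=*/false,
                     /*AllowAlloca=*/true);
    return CE.isEligible() ? CE.extractCodeRegion() : nullptr;
  }
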
diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp
index 82b67c293102..9a0240144d08 100644
--- a/lib/Transforms/Utils/CtorUtils.cpp
+++ b/lib/Transforms/Utils/CtorUtils.cpp
@@ -138,7 +138,7 @@ bool optimizeGlobalCtorsList(Module &M,
if (!F)
continue;
- DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+ LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
// We cannot simplify external ctor functions.
if (F->empty())
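
The DEBUG -> LLVM_DEBUG rename recurs throughout this import; the macro still compiles to nothing in release builds and is gated by -debug/-debug-only in asserts builds. A minimal sketch (the DEBUG_TYPE string is illustrative):

  #include "llvm/IR/Function.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  #define DEBUG_TYPE "ctor-utils"

  // Printed only with -debug (or -debug-only=ctor-utils) in +Asserts builds.
  static void reportCtor(const Function &F) {
    LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << F.getName()
                      << "\n");
  }
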
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 6d3d287defdb..56ff03c7f5e1 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -9,11 +9,11 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
/// DemoteRegToStack - This function takes a virtual register computed by an
diff --git a/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 421663f82565..569ea58a3047 100644
--- a/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -9,14 +9,13 @@
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
static void insertCall(Function &CurFn, StringRef Func,
@@ -92,17 +91,27 @@ static bool runOnFunction(Function &F, bool PostInlining) {
if (!ExitFunc.empty()) {
for (BasicBlock &BB : F) {
- TerminatorInst *T = BB.getTerminator();
+ Instruction *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+
+ // If T is preceded by a musttail call, that's the real terminator.
+ Instruction *Prev = T->getPrevNode();
+ if (BitCastInst *BCI = dyn_cast_or_null<BitCastInst>(Prev))
+ Prev = BCI->getPrevNode();
+ if (CallInst *CI = dyn_cast_or_null<CallInst>(Prev)) {
+ if (CI->isMustTailCall())
+ T = CI;
+ }
+
DebugLoc DL;
if (DebugLoc TerminatorDL = T->getDebugLoc())
DL = TerminatorDL;
else if (auto SP = F.getSubprogram())
DL = DebugLoc::get(0, 0, SP);
- if (isa<ReturnInst>(T)) {
- insertCall(F, ExitFunc, T, DL);
- Changed = true;
- }
+ insertCall(F, ExitFunc, T, DL);
+ Changed = true;
}
F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
}
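
Distilled from the hunk above: the exit hook must precede a musttail call (possibly hidden behind a bitcast of its result) rather than the return itself, since nothing may execute between the musttail call and the return. A standalone reading of that selection logic (the helper itself is not part of the patch):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Return the instruction an exit hook should be inserted before, or
  // nullptr when the block does not end in a return.
  static Instruction *exitPoint(BasicBlock &BB) {
    Instruction *T = BB.getTerminator();
    if (!isa<ReturnInst>(T))
      return nullptr;
    Instruction *Prev = T->getPrevNode();
    if (auto *BCI = dyn_cast_or_null<BitCastInst>(Prev))
      Prev = BCI->getPrevNode();
    if (auto *CI = dyn_cast_or_null<CallInst>(Prev))
      if (CI->isMustTailCall())
        return CI;
    return T;
  }
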
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 78d7474e5b95..c9c96fbe5da0 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -14,9 +14,9 @@
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
static Constant *getDefaultPersonalityFn(Module *M) {
@@ -73,8 +73,8 @@ IRBuilder<> *EscapeEnumerator::Next() {
F.setPersonalityFn(PersFn);
}
- if (isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
- report_fatal_error("Funclet EH not supported");
+ if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Scoped EH not supported");
}
LandingPadInst *LPad =
diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp
index 3c5e299fae98..7fd9425efed3 100644
--- a/lib/Transforms/Utils/Evaluator.cpp
+++ b/lib/Transforms/Utils/Evaluator.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
@@ -174,6 +175,11 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
return false;
}
+static Constant *getInitializer(Constant *C) {
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
+}
+
/// Return the value that would be computed by a load from P after the stores
/// reflected by 'memory' have been performed. If we can't decide, return null.
Constant *Evaluator::ComputeLoadResult(Constant *P) {
@@ -189,18 +195,96 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) {
return nullptr;
}
- // Handle a constantexpr getelementptr.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- if (GV->hasDefinitiveInitializer())
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
+ switch (CE->getOpcode()) {
+ // Handle a constantexpr getelementptr.
+ case Instruction::GetElementPtr:
+ if (auto *I = getInitializer(CE->getOperand(0)))
+ return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
+ break;
+ // Handle a constantexpr bitcast.
+ case Instruction::BitCast:
+ Constant *Val = getVal(CE->getOperand(0));
+ auto MM = MutatedMemory.find(Val);
+ auto *I = (MM != MutatedMemory.end()) ? MM->second
+ : getInitializer(CE->getOperand(0));
+ if (I)
+ return ConstantFoldLoadThroughBitcast(
+ I, P->getType()->getPointerElementType(), DL);
+ break;
}
+ }
return nullptr; // don't know how to evaluate.
}
+static Function *getFunction(Constant *C) {
+ if (auto *Fn = dyn_cast<Function>(C))
+ return Fn;
+
+ if (auto *Alias = dyn_cast<GlobalAlias>(C))
+ if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
+ return Fn;
+ return nullptr;
+}
+
+Function *
+Evaluator::getCalleeWithFormalArgs(CallSite &CS,
+ SmallVector<Constant *, 8> &Formals) {
+ auto *V = CS.getCalledValue();
+ if (auto *Fn = getFunction(getVal(V)))
+ return getFormalParams(CS, Fn, Formals) ? Fn : nullptr;
+
+ auto *CE = dyn_cast<ConstantExpr>(V);
+ if (!CE || CE->getOpcode() != Instruction::BitCast ||
+ !getFormalParams(CS, getFunction(CE->getOperand(0)), Formals))
+ return nullptr;
+
+ return dyn_cast<Function>(
+ ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+}
+
+bool Evaluator::getFormalParams(CallSite &CS, Function *F,
+ SmallVector<Constant *, 8> &Formals) {
+ if (!F)
+ return false;
+
+ auto *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > CS.getNumArgOperands()) {
+ LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
+ return false;
+ }
+
+ auto ArgI = CS.arg_begin();
+ for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
+ ++ParI) {
+ auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
+ if (!ArgC) {
+ LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
+ return false;
+ }
+ Formals.push_back(ArgC);
+ ++ArgI;
+ }
+ return true;
+}
+
+/// If the call expression contains a bitcast, we may need to cast the
+/// evaluated return value to the type of the call expression.
+Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
+ if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+ return RV;
+
+ if (auto *FT =
+ dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
+ RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
+ }
+ return RV;
+}
+
/// Evaluate all instructions in block BB, returning true if successful, false
/// if we can't evaluate it. NewBB returns the next BB that control flows into,
/// or null upon return.
@@ -210,22 +294,23 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
while (true) {
Constant *InstResult = nullptr;
- DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+ LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
if (!SI->isSimple()) {
- DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(SI->getOperand(1));
if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
- DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
Ptr = FoldedPtr;
- DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
if (!isSimpleEnoughPointerToCommit(Ptr)) {
// If this is too complex for us to commit, reject it.
- DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
+ LLVM_DEBUG(
+ dbgs() << "Pointer is too complex for us to evaluate store.");
return false;
}
@@ -234,14 +319,15 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If this might be too difficult for the backend to handle (e.g. the addr
// of one global variable divided by another) then we can't commit it.
if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
- DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
- << "\n");
+ LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
+ << *Val << "\n");
return false;
}
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
if (CE->getOpcode() == Instruction::BitCast) {
- DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
+ LLVM_DEBUG(dbgs()
+ << "Attempting to resolve bitcast on constant ptr.\n");
// If we're evaluating a store through a bitcast, then we need
// to pull the bitcast off the pointer type and push it onto the
// stored value.
@@ -252,7 +338,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// In order to push the bitcast onto the stored value, a bitcast
// from NewTy to Val's type must be legal. If it's not, we can try
// introspecting NewTy to find a legal conversion.
- while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+ Constant *NewVal;
+ while (!(NewVal = ConstantFoldLoadThroughBitcast(Val, NewTy, DL))) {
// If NewTy is a struct, we can convert the pointer to the struct
// into a pointer to its first member.
// FIXME: This could be extended to support arrays as well.
@@ -270,17 +357,14 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If we can't improve the situation by introspecting NewTy,
// we have to give up.
} else {
- DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
return false;
}
}
- // If we found compatible types, go ahead and push the bitcast
- // onto the stored value.
- Val = ConstantExpr::getBitCast(Val, NewTy);
-
- DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ Val = NewVal;
+ LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
}
}
@@ -289,37 +373,37 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult = ConstantExpr::get(BO->getOpcode(),
getVal(BO->getOperand(0)),
getVal(BO->getOperand(1)));
- DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
+ << *InstResult << "\n");
} else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
InstResult = ConstantExpr::getCompare(CI->getPredicate(),
getVal(CI->getOperand(0)),
getVal(CI->getOperand(1)));
- DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
} else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
InstResult = ConstantExpr::getCast(CI->getOpcode(),
getVal(CI->getOperand(0)),
CI->getType());
- DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
} else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
getVal(SI->getOperand(1)),
getVal(SI->getOperand(2)));
- DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
} else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
InstResult = ConstantExpr::getExtractValue(
getVal(EVI->getAggregateOperand()), EVI->getIndices());
- DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
+ << *InstResult << "\n");
} else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
InstResult = ConstantExpr::getInsertValue(
getVal(IVI->getAggregateOperand()),
getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
- DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
+ << *InstResult << "\n");
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
Constant *P = getVal(GEP->getOperand(0));
SmallVector<Constant*, 8> GEPOps;
@@ -329,60 +413,63 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult =
ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
cast<GEPOperator>(GEP)->isInBounds());
- DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
if (!LI->isSimple()) {
- DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ LLVM_DEBUG(
+ dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(LI->getOperand(0));
if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
Ptr = FoldedPtr;
- DEBUG(dbgs() << "Found a constant pointer expression, constant "
- "folding: " << *Ptr << "\n");
+ LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: "
+ << *Ptr << "\n");
}
InstResult = ComputeLoadResult(Ptr);
if (!InstResult) {
- DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
- "\n");
+ LLVM_DEBUG(
+ dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
return false; // Could not evaluate load.
}
- DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
if (AI->isArrayAllocation()) {
- DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
return false; // Cannot handle array allocs.
}
Type *Ty = AI->getAllocatedType();
AllocaTmps.push_back(llvm::make_unique<GlobalVariable>(
Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
- AI->getName()));
+ AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
+ AI->getType()->getPointerAddressSpace()));
InstResult = AllocaTmps.back().get();
- DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
} else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
CallSite CS(&*CurInst);
// Debug info can safely be ignored here.
if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
- DEBUG(dbgs() << "Ignoring debug info.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
++CurInst;
continue;
}
// Cannot handle inline asm.
if (isa<InlineAsm>(CS.getCalledValue())) {
- DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
return false;
}
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
if (MSI->isVolatile()) {
- DEBUG(dbgs() << "Can not optimize a volatile memset " <<
- "intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
+ << "intrinsic.\n");
return false;
}
Constant *Ptr = getVal(MSI->getDest());
@@ -390,7 +477,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Constant *DestVal = ComputeLoadResult(getVal(Ptr));
if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
// This memset is a no-op.
- DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
++CurInst;
continue;
}
@@ -398,7 +485,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
II->getIntrinsicID() == Intrinsic::lifetime_end) {
- DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
++CurInst;
continue;
}
@@ -407,7 +494,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// We don't insert an entry into Values, as it doesn't have a
// meaningful return value.
if (!II->use_empty()) {
- DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n");
+ LLVM_DEBUG(dbgs()
+ << "Found unused invariant_start. Can't evaluate.\n");
return false;
}
ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
@@ -419,54 +507,54 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Size->getValue().getLimitedValue() >=
DL.getTypeStoreSize(ElemTy)) {
Invariants.insert(GV);
- DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
+ << *GV << "\n");
} else {
- DEBUG(dbgs() << "Found a global var, but can not treat it as an "
- "invariant.\n");
+ LLVM_DEBUG(dbgs()
+ << "Found a global var, but can not treat it as an "
+ "invariant.\n");
}
}
// Continue even if we do nothing.
++CurInst;
continue;
} else if (II->getIntrinsicID() == Intrinsic::assume) {
- DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
++CurInst;
continue;
} else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
- DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
+ LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
++CurInst;
continue;
}
- DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
return false;
}
// Resolve function pointers.
- Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+ SmallVector<Constant *, 8> Formals;
+ Function *Callee = getCalleeWithFormalArgs(CS, Formals);
if (!Callee || Callee->isInterposable()) {
- DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
return false; // Cannot resolve.
}
- SmallVector<Constant*, 8> Formals;
- for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
- Formals.push_back(getVal(*i));
-
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(CS, Callee, Formals, TLI)) {
- InstResult = C;
- DEBUG(dbgs() << "Constant folded function call. Result: " <<
- *InstResult << "\n");
+ InstResult = castCallResultIfNeeded(CS.getCalledValue(), C);
+ if (!InstResult)
+ return false;
+ LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
+ << *InstResult << "\n");
} else {
- DEBUG(dbgs() << "Can not constant fold function call.\n");
+ LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
return false;
}
} else {
if (Callee->getFunctionType()->isVarArg()) {
- DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
return false;
}
@@ -474,21 +562,24 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// Execute the call, if successful, use the return value.
ValueStack.emplace_back();
if (!EvaluateFunction(Callee, RetVal, Formals)) {
- DEBUG(dbgs() << "Failed to evaluate function.\n");
+ LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
return false;
}
ValueStack.pop_back();
- InstResult = RetVal;
+ InstResult = castCallResultIfNeeded(CS.getCalledValue(), RetVal);
+ if (RetVal && !InstResult)
+ return false;
if (InstResult) {
- DEBUG(dbgs() << "Successfully evaluated function. Result: "
- << *InstResult << "\n\n");
+ LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
} else {
- DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+ LLVM_DEBUG(dbgs()
+ << "Successfully evaluated function. Result: 0\n\n");
}
}
} else if (isa<TerminatorInst>(CurInst)) {
- DEBUG(dbgs() << "Found a terminator instruction.\n");
+ LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
if (BI->isUnconditional()) {
@@ -515,17 +606,18 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
NextBB = nullptr;
} else {
// invoke, unwind, resume, unreachable.
- DEBUG(dbgs() << "Can not handle terminator.");
+ LLVM_DEBUG(dbgs() << "Can not handle terminator.");
return false; // Cannot handle this terminator.
}
// We succeeded at evaluating this block!
- DEBUG(dbgs() << "Successfully evaluated block.\n");
+ LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
return true;
} else {
// Did not know how to evaluate this!
- DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
- "\n");
+ LLVM_DEBUG(
+ dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
return false;
}
@@ -539,7 +631,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If we just processed an invoke, we finished evaluating the block.
if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
NextBB = II->getNormalDest();
- DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
return true;
}
@@ -578,7 +670,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
while (true) {
BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
- DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
if (!EvaluateBlock(CurInst, NextBB))
return false;
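
A note on the no-op memset test in the hunks above: the early-continue is sound because a zero store over memory already known to hold zero changes nothing, so the evaluator can skip the intrinsic instead of rejecting the whole block. A minimal standalone sketch of that test (plain C++, not the Evaluator's types):

#include <cassert>

static bool isNoOpMemset(bool ValIsZero, bool DestKnown, bool DestIsZero) {
  return ValIsZero && DestKnown && DestIsZero;
}

int main() {
  assert(isNoOpMemset(true, true, true));    // memset(p, 0, n) over zeros
  assert(!isNoOpMemset(true, false, false)); // unknown destination: keep it
  return 0;
}
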
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 5fdcc6d1d727..3c6c9c9a5df4 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,7 +25,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
using namespace llvm;
@@ -36,16 +36,16 @@ namespace {
class FlattenCFGOpt {
AliasAnalysis *AA;
- /// \brief Use parallel-and or parallel-or to generate conditions for
+ /// Use parallel-and or parallel-or to generate conditions for
/// conditional branches.
bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
- /// \brief If \param BB is the merge block of an if-region, attempt to merge
+ /// If \param BB is the merge block of an if-region, attempt to merge
/// the if-region with an adjacent if-region upstream if two if-regions
/// contain identical instructions.
bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
- /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which
+ /// Compare a pair of blocks: \p Block1 and \p Block2, which
/// are from two if-regions whose entry blocks are \p Head1 and \p
/// Head2. \returns true if \p Block1 and \p Block2 contain identical
/// instructions, and have no memory reference alias with \p Head2.
@@ -312,7 +312,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
new UnreachableInst(CB->getContext(), CB);
} while (Iteration);
- DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+ LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
return true;
}
@@ -469,7 +469,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
// Remove \param SecondEntryBlock
SecondEntryBlock->dropAllReferences();
SecondEntryBlock->eraseFromParent();
- DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+ LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
return true;
}
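
For reference, the source-level shape of the transform whose debug output changed above: FlattenParallelAndOr turns nested single-condition branches into one branch on a non-short-circuit combined condition. The C++ below only illustrates that shape; the pass itself operates on IR:

#include <cassert>

bool nested(bool a, bool b) {    // before: two branches, two cond blocks
  if (a)
    if (b)
      return true;
  return false;
}

bool flattened(bool a, bool b) { // after: one branch on a "parallel and"
  if (a & b)                     // non-short-circuit combination
    return true;
  return false;
}

int main() {
  for (int a = 0; a < 2; ++a)
    for (int b = 0; b < 2; ++b)
      assert(nested(a, b) == flattened(a, b));
  return 0;
}
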
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index bddcbd86e914..69203f9f2485 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -377,7 +376,7 @@ int FunctionComparator::cmpConstants(const Constant *L,
}
}
default: // Unknown constant, abort.
- DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
llvm_unreachable("Constant ValueID not recognized.");
return -1;
}
@@ -710,7 +709,7 @@ int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
return Res;
if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
return Res;
- llvm_unreachable("InlineAsm blocks were not uniqued.");
+ assert(L->getFunctionType() != R->getFunctionType());
return 0;
}
@@ -925,7 +924,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
H.add(F.arg_size());
SmallVector<const BasicBlock *, 8> BBs;
- SmallSet<const BasicBlock *, 16> VisitedBBs;
+ SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
// Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
// accumulating the hash of the function "structure." (BB and opcode sequence)
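
The SmallPtrSet swap above feeds the structural hash, which mixes the argument count with the opcode sequence of each block, visiting every block once. A standalone sketch of that scheme (the hash constants and traversal order here are illustrative, not the exact ones FunctionComparator uses):

#include <cstdint>
#include <unordered_set>
#include <vector>

struct Block {
  std::vector<int> Opcodes;
  std::vector<Block *> Succs;
};

uint64_t hashStructure(unsigned ArgCount, Block *Entry) {
  uint64_t H = 0x9e3779b97f4a7c15ull ^ ArgCount;
  std::vector<Block *> Work{Entry};
  std::unordered_set<Block *> Visited{Entry};
  while (!Work.empty()) {
    Block *B = Work.back();
    Work.pop_back();
    for (int Op : B->Opcodes)
      H = (H ^ uint64_t(Op)) * 1099511628211ull; // FNV-1a style mixing
    for (Block *S : B->Succs)
      if (Visited.insert(S).second) // the SmallPtrSet role in the real code
        Work.push_back(S);
  }
  return H;
}
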
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 6b5f593073b4..479816a339d0 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -206,15 +206,10 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
// definition.
if (GV.hasName()) {
ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
- if (VI) {
- // Need to check all summaries are local in case of hash collisions.
- bool IsLocal = VI.getSummaryList().size() &&
- llvm::all_of(VI.getSummaryList(),
- [](const std::unique_ptr<GlobalValueSummary> &Summary) {
- return Summary->isDSOLocal();
- });
- if (IsLocal)
- GV.setDSOLocal(true);
+ if (VI && VI.isDSOLocal()) {
+ GV.setDSOLocal(true);
+ if (GV.hasDLLImportStorageClass())
+ GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
}
}
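
The VI.isDSOLocal() call above replaces the open-coded check that every summary for the GUID is dso_local (hash collisions can map one GUID to several summaries). A minimal sketch of that predicate, with illustrative stand-in types rather than the ThinLTO summary classes:

#include <algorithm>
#include <memory>
#include <vector>

struct Summary {
  bool DSOLocal = false;
  bool isDSOLocal() const { return DSOLocal; }
};

bool allSummariesDSOLocal(const std::vector<std::unique_ptr<Summary>> &List) {
  return !List.empty() &&
         std::all_of(List.begin(), List.end(),
                     [](const std::unique_ptr<Summary> &S) {
                       return S->isDSOLocal();
                     });
}
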
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
index 245fefb38ee8..ff6970db47da 100644
--- a/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -60,7 +60,7 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) {
}
static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
- SmallPtrSetImpl<const PHINode *> &PhiUsers) {
+ SmallPtrSetImpl<const Value *> &VisitedUsers) {
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
if (GV->isExternallyInitialized())
GS.StoredType = GlobalStatus::StoredOnce;
@@ -75,7 +75,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
if (!isa<PointerType>(CE->getType()))
return true;
- if (analyzeGlobalAux(CE, GS, PhiUsers))
+ // FIXME: Do we need to add constexpr selects to VisitedUsers?
+ if (analyzeGlobalAux(CE, GS, VisitedUsers))
return true;
} else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
if (!GS.HasMultipleAccessingFunctions) {
@@ -137,20 +138,18 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
GS.StoredType = GlobalStatus::Stored;
}
}
- } else if (isa<BitCastInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ // Skip over bitcasts and GEPs; we don't care about the type or offset
+ // of the pointer.
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
return true;
- } else if (isa<GetElementPtrInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
- return true;
- } else if (isa<SelectInst>(I)) {
- if (analyzeGlobalAux(I, GS, PhiUsers))
- return true;
- } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
- // PHI nodes we can check just like select or GEP instructions, but we
- // have to be careful about infinite recursion.
- if (PhiUsers.insert(PN).second) // Not already visited.
- if (analyzeGlobalAux(I, GS, PhiUsers))
+ } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
+ // Look through selects and PHIs to find if the pointer is
+ // conditionally accessed. Make sure we only visit an instruction
+ // once; otherwise, we can get infinite recursion or exponential
+ // compile time.
+ if (VisitedUsers.insert(I).second)
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
return true;
} else if (isa<CmpInst>(I)) {
GS.IsCompared = true;
@@ -191,6 +190,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
GlobalStatus::GlobalStatus() = default;
bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
- SmallPtrSet<const PHINode *, 16> PhiUsers;
- return analyzeGlobalAux(V, GS, PhiUsers);
+ SmallPtrSet<const Value *, 16> VisitedUsers;
+ return analyzeGlobalAux(V, GS, VisitedUsers);
}
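
The VisitedUsers set above generalizes the old PHI-only guard to selects as well. A standalone sketch of the pattern with invented Node types: recurse through users, but enter any join node only once, so cyclic use graphs terminate and shared subgraphs are not re-walked exponentially:

#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Users;
  bool IsJoin = false; // PHI/select-like
};

// Returns true to mean "give up", mirroring analyzeGlobalAux.
bool walkUsers(const Node *N, std::unordered_set<const Node *> &Visited) {
  for (const Node *U : N->Users) {
    if (U->IsJoin && !Visited.insert(U).second)
      continue; // already visited: breaks cycles between PHIs/selects
    if (walkUsers(U, Visited))
      return true;
  }
  return false;
}
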
diff --git a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
index b8c12ad5ea84..8382220fc9e1 100644
--- a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
+++ b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -161,7 +161,7 @@ void ImportedFunctionsInliningStatistics::dump(const bool Verbose) {
void ImportedFunctionsInliningStatistics::calculateRealInlines() {
// Removing duplicated Callers.
- std::sort(NonImportedCallers.begin(), NonImportedCallers.end());
+ llvm::sort(NonImportedCallers.begin(), NonImportedCallers.end());
NonImportedCallers.erase(
std::unique(NonImportedCallers.begin(), NonImportedCallers.end()),
NonImportedCallers.end());
@@ -190,13 +190,14 @@ ImportedFunctionsInliningStatistics::getSortedNodes() {
for (const NodesMapTy::value_type& Node : NodesMap)
SortedNodes.push_back(&Node);
- std::sort(
+ llvm::sort(
SortedNodes.begin(), SortedNodes.end(),
[&](const SortedNodesTy::value_type &Lhs,
const SortedNodesTy::value_type &Rhs) {
if (Lhs->second->NumberOfInlines != Rhs->second->NumberOfInlines)
return Lhs->second->NumberOfInlines > Rhs->second->NumberOfInlines;
- if (Lhs->second->NumberOfRealInlines != Rhs->second->NumberOfRealInlines)
+ if (Lhs->second->NumberOfRealInlines !=
+ Rhs->second->NumberOfRealInlines)
return Lhs->second->NumberOfRealInlines >
Rhs->second->NumberOfRealInlines;
return Lhs->first() < Rhs->first();
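
The llvm::sort switch above matters because llvm::sort is meant to flush out comparators that leave ties nondeterministic (in checking builds it can shuffle its input before sorting). A small standalone demo of why the final tie-break on Lhs->first() keeps the output stable:

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<int, std::string>> V = {{1, "b"}, {1, "a"}};
  std::sort(V.begin(), V.end(), [](const auto &L, const auto &R) {
    if (L.first != R.first)
      return L.first > R.first; // primary key: descending count
    return L.second < R.second; // tie-break keeps output deterministic
  });
  assert(V[0].second == "a" && V[1].second == "b");
  return 0;
}
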
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index fedf6e100d6c..0315aac1cf84 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -60,7 +61,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -72,6 +72,7 @@
#include <vector>
using namespace llvm;
+using ProfileCount = Function::ProfileCount;
static cl::opt<bool>
EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
@@ -1247,7 +1248,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
// Always generate a memcpy of alignment 1 here because we don't know
// the alignment of the src pointer. Other optimizations can infer
// better alignment.
- Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
+ Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size);
}
/// When inlining a call site that has a byval argument,
@@ -1431,29 +1432,29 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
- const Optional<uint64_t> &CalleeEntryCount,
+ const ProfileCount &CalleeEntryCount,
const Instruction *TheCall,
ProfileSummaryInfo *PSI,
BlockFrequencyInfo *CallerBFI) {
- if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
+ CalleeEntryCount.getCount() < 1)
return;
- Optional<uint64_t> CallSiteCount =
- PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
- CalleeEntryCount.getValue());
+ CalleeEntryCount.getCount());
for (auto const &Entry : VMap)
if (isa<CallInst>(Entry.first))
if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
- CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+ CI->updateProfWeight(CallCount, CalleeEntryCount.getCount());
for (BasicBlock &BB : *Callee)
// No need to update the callsite if it is pruned during inlining.
if (VMap.count(&BB))
for (Instruction &I : BB)
if (CallInst *CI = dyn_cast<CallInst>(&I))
- CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
- CalleeEntryCount.getValue());
+ CI->updateProfWeight(CalleeEntryCount.getCount() - CallCount,
+ CalleeEntryCount.getCount());
}
/// Update the entry count of callee after inlining.
@@ -1467,18 +1468,19 @@ static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
- Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+ auto CalleeCount = Callee->getEntryCount();
if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
+ auto CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
// count and has to be set to 0.
- if (CallCount.getValue() > CalleeCount.getValue())
- Callee->setEntryCount(0);
+ if (CallCount.getValue() > CalleeCount.getCount())
+ CalleeCount.setCount(0);
else
- Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
+ CalleeCount.setCount(CalleeCount.getCount() - CallCount.getValue());
+ Callee->setEntryCount(CalleeCount);
}
/// This function inlines the called function into the basic block of the
@@ -1500,10 +1502,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
IFI.reset();
Function *CalledFunc = CS.getCalledFunction();
- if (!CalledFunc || // Can't inline external function or indirect
- CalledFunc->isDeclaration() ||
- (!ForwardVarArgsTo && CalledFunc->isVarArg())) // call, or call to a vararg function!
- return false;
+ if (!CalledFunc || // Can't inline external function or indirect
+ CalledFunc->isDeclaration()) // call!
+ return false;
// The inliner does not know how to inline through calls with operand bundles
// in general ...
@@ -1568,7 +1569,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
Instruction *CallSiteEHPad = nullptr;
if (CallerPersonality) {
EHPersonality Personality = classifyEHPersonality(CallerPersonality);
- if (isFuncletEHPersonality(Personality)) {
+ if (isScopedEHPersonality(Personality)) {
Optional<OperandBundleUse> ParentFunclet =
CS.getOperandBundle(LLVMContext::OB_funclet);
if (ParentFunclet)
@@ -1630,9 +1631,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
auto &DL = Caller->getParent()->getDataLayout();
- assert((CalledFunc->arg_size() == CS.arg_size() || ForwardVarArgsTo) &&
- "Varargs calls can only be inlined if the Varargs are forwarded!");
-
// Calculate the vector of arguments to pass into the function cloner, which
// matches up the formal to the actual argument values.
CallSite::arg_iterator AI = CS.arg_begin();
@@ -1815,9 +1813,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
SmallVector<Value*,4> VarArgsToForward;
+ SmallVector<AttributeSet, 4> VarArgsAttrs;
for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CS.getNumArgOperands(); i++)
+ i < CS.getNumArgOperands(); i++) {
VarArgsToForward.push_back(CS.getArgOperand(i));
+ VarArgsAttrs.push_back(CS.getAttributes().getParamAttributes(i));
+ }
bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
if (InlinedFunctionInfo.ContainsCalls) {
@@ -1825,6 +1826,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (CallInst *CI = dyn_cast<CallInst>(TheCall))
CallSiteTailKind = CI->getTailCallKind();
+ // For inlining purposes, the "notail" marker is the same as no marker.
+ if (CallSiteTailKind == CallInst::TCK_NoTail)
+ CallSiteTailKind = CallInst::TCK_None;
+
for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
++BB) {
for (auto II = BB->begin(); II != BB->end();) {
@@ -1833,6 +1838,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (!CI)
continue;
+ // Forward varargs from inlined call site to calls to the
+ // ForwardVarArgsTo function, if requested, and to musttail calls.
+ if (!VarArgsToForward.empty() &&
+ ((ForwardVarArgsTo &&
+ CI->getCalledFunction() == ForwardVarArgsTo) ||
+ CI->isMustTailCall())) {
+ // Collect attributes for non-vararg parameters.
+ AttributeList Attrs = CI->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
+ for (unsigned ArgNo = 0;
+ ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
+ ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ }
+
+ // Add VarArg attributes.
+ ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
+ Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), ArgAttrs);
+ // Add VarArgs to existing parameters.
+ SmallVector<Value *, 6> Params(CI->arg_operands());
+ Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
+ CallInst *NewCI =
+ CallInst::Create(CI->getCalledFunction() ? CI->getCalledFunction()
+ : CI->getCalledValue(),
+ Params, "", CI);
+ NewCI->setDebugLoc(CI->getDebugLoc());
+ NewCI->setAttributes(Attrs);
+ NewCI->setCallingConv(CI->getCallingConv());
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ CI = NewCI;
+ }
+
if (Function *F = CI->getCalledFunction())
InlinedDeoptimizeCalls |=
F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
@@ -1850,6 +1889,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// f -> musttail g -> tail f ==> f -> tail f
// f -> g -> musttail f ==> f -> f
// f -> g -> tail f ==> f -> f
+ //
+ // Inlined notail calls should remain notail calls.
CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
if (ChildTCK != CallInst::TCK_NoTail)
ChildTCK = std::min(CallSiteTailKind, ChildTCK);
@@ -1860,16 +1901,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// 'nounwind'.
if (MarkNoUnwind)
CI->setDoesNotThrow();
-
- if (ForwardVarArgsTo && !VarArgsToForward.empty() &&
- CI->getCalledFunction() == ForwardVarArgsTo) {
- SmallVector<Value*, 6> Params(CI->arg_operands());
- Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
- CallInst *Call = CallInst::Create(CI->getCalledFunction(), Params, "", CI);
- Call->setDebugLoc(CI->getDebugLoc());
- CI->replaceAllUsesWith(Call);
- CI->eraseFromParent();
- }
}
}
}
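
The tail-marker handling added above follows one merging rule. A standalone sketch of it, with an enum mirroring CallInst::TailCallKind's ordering (None < Tail < MustTail < NoTail): "notail" at the call site counts as no marker, while "notail" on the inlined call is sticky.

#include <algorithm>
#include <cassert>

enum TailCallKind { TCK_None, TCK_Tail, TCK_MustTail, TCK_NoTail };

TailCallKind mergeTailKind(TailCallKind CallSite, TailCallKind Child) {
  if (CallSite == TCK_NoTail)
    CallSite = TCK_None;               // notail at the site == no marker
  if (Child != TCK_NoTail)
    Child = std::min(CallSite, Child); // keep the weaker guarantee
  return Child;
}

int main() {
  assert(mergeTailKind(TCK_MustTail, TCK_Tail) == TCK_Tail);
  assert(mergeTailKind(TCK_None, TCK_MustTail) == TCK_None);
  assert(mergeTailKind(TCK_Tail, TCK_NoTail) == TCK_NoTail);
  return 0;
}
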
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 23ec45edb3ef..003721f2b939 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -17,7 +17,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
namespace {
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
index 5a90dcb033b2..3fbb3487884b 100644
--- a/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -372,7 +372,7 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
/// information about the operands is known. Implements both 32bit and 64bit
/// scalar division.
///
-/// @brief Replace Rem with generated code.
+/// Replace Rem with generated code.
bool llvm::expandRemainder(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -430,7 +430,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
/// when more information about the operands is known. Implements both
/// 32bit and 64bit scalar division.
///
-/// @brief Replace Div with generated code.
+/// Replace Div with generated code.
bool llvm::expandDivision(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
@@ -482,7 +482,7 @@ bool llvm::expandDivision(BinaryOperator *Div) {
/// that have no or very little support for smaller than 32 bit integer
/// arithmetic.
///
-/// @brief Replace Rem with emulation code.
+/// Replace Rem with emulation code.
bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -531,7 +531,7 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
/// 64 bits. Uses the above routines and extends the inputs/truncates the
/// outputs to operate in 64 bits.
///
-/// @brief Replace Rem with emulation code.
+/// Replace Rem with emulation code.
bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
assert((Rem->getOpcode() == Instruction::SRem ||
Rem->getOpcode() == Instruction::URem) &&
@@ -580,7 +580,7 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
/// in 32 bits; that is, these routines are good for targets that have no
/// or very little support for smaller than 32 bit integer arithmetic.
///
-/// @brief Replace Div with emulation code.
+/// Replace Div with emulation code.
bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
@@ -628,7 +628,7 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
/// above routines and extends the inputs/truncates the outputs to operate
/// in 64 bits.
///
-/// @brief Replace Div with emulation code.
+/// Replace Div with emulation code.
bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
assert((Div->getOpcode() == Instruction::SDiv ||
Div->getOpcode() == Instruction::UDiv) &&
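
A hedged usage sketch for these entry points, assuming the LLVM 7-era API in llvm/Transforms/Utils/IntegerDivision.h: since expansion mutates the IR, collect the divisions first, then expand each one.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
using namespace llvm;

static bool expandAllDivisions(Function &F) {
  SmallVector<BinaryOperator *, 8> Work;
  for (Instruction &I : instructions(F))
    if (auto *BO = dyn_cast<BinaryOperator>(&I))
      if (BO->getOpcode() == Instruction::SDiv ||
          BO->getOpcode() == Instruction::UDiv)
        Work.push_back(BO);
  bool Changed = false;
  for (BinaryOperator *Div : Work)
    Changed |= expandDivision(Div); // replaces Div with generated code
  return Changed;
}
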
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index ae0e2bb6c280..956d0387c7a8 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -36,13 +36,14 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -214,18 +215,27 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
Worklist.push_back(PostProcessPN);
// Keep track of PHI nodes that we want to remove because they did not have
- // any uses rewritten.
+ // any uses rewritten. If the new PHI is used, store it so that we can
+ // try to propagate dbg.value intrinsics to it.
+ SmallVector<PHINode *, 2> NeedDbgValues;
for (PHINode *PN : AddedPHIs)
if (PN->use_empty())
PHIsToRemove.insert(PN);
-
+ else
+ NeedDbgValues.push_back(PN);
+ insertDebugValuesForPHIs(InstBB, NeedDbgValues);
Changed = true;
}
- // Remove PHI nodes that did not have any uses rewritten.
- for (PHINode *PN : PHIsToRemove) {
- assert (PN->use_empty() && "Trying to remove a phi with uses.");
- PN->eraseFromParent();
- }
+ // Remove PHI nodes that did not have any uses rewritten. We need to redo the
+ // use_empty() check here, because even if the PHI node wasn't used when added
+ // to PHIsToRemove, later added PHI nodes can be using it. This cleanup is
+ // not guaranteed to handle trees/cycles of PHI nodes that only are used by
+ // each other. Such situations has only been noticed when the input IR
+ // contains unreachable code, and leaving some extra redundant PHI nodes in
+ // such situations is considered a minor problem.
+ for (PHINode *PN : PHIsToRemove)
+ if (PN->use_empty())
+ PN->eraseFromParent();
return Changed;
}
diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 42aca757c2af..9832a6f24e1f 100644
--- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -79,11 +79,11 @@ public:
bool perform() {
bool Changed = false;
for (auto &CI : WorkList) {
- DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
if (perform(CI)) {
Changed = true;
- DEBUG(dbgs() << "Transformed\n");
+ LLVM_DEBUG(dbgs() << "Transformed\n");
}
}
return Changed;
@@ -421,7 +421,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
const LibFunc &Func) {
// FIXME: LibFunc_powf and powl TBD.
if (Func != LibFunc_pow) {
- DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
return nullptr;
}
@@ -433,7 +433,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
double D = CF->getValueAPF().convertToDouble();
if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
- DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
return nullptr;
}
@@ -447,7 +447,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
// If the Base value comes from an integer type.
Instruction *I = dyn_cast<Instruction>(Base);
if (!I) {
- DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
return nullptr;
}
unsigned Opcode = I->getOpcode();
@@ -461,7 +461,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
else if (BW == 32)
UpperV = 32.0f;
else {
- DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
return nullptr;
}
@@ -477,7 +477,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
return BBBuilder.CreateOr(Cond0, Cond);
}
- DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
return nullptr;
}
@@ -496,9 +496,9 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
SuccBB->setName("cdce.end");
CI->removeFromParent();
CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
- DEBUG(dbgs() << "== Basic Block After ==");
- DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
- << *CallBB->getSingleSuccessor() << "\n");
+ LLVM_DEBUG(dbgs() << "== Basic Block After ==");
+ LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
}
// Perform the transformation to a single candidate.
@@ -529,10 +529,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
bool Changed = CCDCE.perform();
// Verify the dominator after we've updated it locally.
-#ifndef NDEBUG
- if (DT)
- DT->verifyDomTree();
-#endif
+ assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
return Changed;
}
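
For context, the shape shrinkWrapCI() produces at the source level: these libcalls survive only for their errno side effect (their results are unused), so the pass guards each one with the condition under which it could actually set errno. Plain C++ illustration, not the pass's literal output:

#include <cmath>

void before(double x) {
  (void)std::sqrt(x);   // unconditional libcall, result unused
}

void after(double x) {
  if (x < 0.0)          // cdce branch: sqrt only sets errno when x < 0
    (void)std::sqrt(x);
}
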
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index a1961eecb391..ae3cb077a3af 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -73,6 +73,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <climits>
@@ -100,26 +101,23 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ DeferredDominance *DDT) {
TerminatorInst *T = BB->getTerminator();
IRBuilder<> Builder(T);
// Branch - See if we are conditional jumping on constant
- if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ if (auto *BI = dyn_cast<BranchInst>(T)) {
if (BI->isUnconditional()) return false; // Can't optimize uncond branch
BasicBlock *Dest1 = BI->getSuccessor(0);
BasicBlock *Dest2 = BI->getSuccessor(1);
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+ if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
// Are we branching on constant?
// YES. Change to unconditional branch...
BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1;
- //cerr << "Function: " << T->getParent()->getParent()
- // << "\nRemoving branch from " << T->getParent()
- // << "\n\nTo: " << OldDest << endl;
-
// Let the basic block know that we are letting go of it. Based on this,
// it will adjust it's PHI nodes.
OldDest->removePredecessor(BB);
@@ -127,6 +125,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Replace the conditional branch with an unconditional one.
Builder.CreateBr(Destination);
BI->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, OldDest);
return true;
}
@@ -150,10 +150,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
return false;
}
- if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+ if (auto *SI = dyn_cast<SwitchInst>(T)) {
// If we are switching on a constant, we can convert the switch to an
// unconditional branch.
- ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
BasicBlock *DefaultDest = SI->getDefaultDest();
BasicBlock *TheOnlyDest = DefaultDest;
@@ -197,9 +197,12 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
createBranchWeights(Weights));
}
// Remove this entry.
- DefaultDest->removePredecessor(SI->getParent());
+ BasicBlock *ParentBB = SI->getParent();
+ DefaultDest->removePredecessor(ParentBB);
i = SI->removeCase(i);
e = SI->case_end();
+ if (DDT)
+ DDT->deleteEdge(ParentBB, DefaultDest);
continue;
}
@@ -225,14 +228,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Insert the new branch.
Builder.CreateBr(TheOnlyDest);
BasicBlock *BB = SI->getParent();
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT)
+ Updates.reserve(SI->getNumSuccessors() - 1);
// Remove entries from PHI nodes which we no longer branch to...
for (BasicBlock *Succ : SI->successors()) {
// Found case matching a constant operand?
- if (Succ == TheOnlyDest)
+ if (Succ == TheOnlyDest) {
TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest
- else
+ } else {
Succ->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
}
// Delete the old switch.
@@ -240,6 +249,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
SI->eraseFromParent();
if (DeleteDeadConditions)
RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
@@ -280,19 +291,28 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
return false;
}
- if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(T)) {
+ if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {
// indirectbr blockaddress(@F, @BB) -> br label @BB
- if (BlockAddress *BA =
+ if (auto *BA =
dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
BasicBlock *TheOnlyDest = BA->getBasicBlock();
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT)
+ Updates.reserve(IBI->getNumDestinations() - 1);
+
// Insert the new branch.
Builder.CreateBr(TheOnlyDest);
for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- if (IBI->getDestination(i) == TheOnlyDest)
+ if (IBI->getDestination(i) == TheOnlyDest) {
TheOnlyDest = nullptr;
- else
- IBI->getDestination(i)->removePredecessor(IBI->getParent());
+ } else {
+ BasicBlock *ParentBB = IBI->getParent();
+ BasicBlock *DestBB = IBI->getDestination(i);
+ DestBB->removePredecessor(ParentBB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, ParentBB, DestBB});
+ }
}
Value *Address = IBI->getAddress();
IBI->eraseFromParent();
@@ -307,6 +327,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
new UnreachableInst(BB->getContext(), BB);
}
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
}
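
The DDT plumbing above batches CFG edge changes instead of updating the dominator tree eagerly. A hedged sketch of the same idiom written directly against DominatorTree::applyUpdates, an API this tree already has; DeferredDominance adds deferral of the flush on top of it:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include <vector>
using namespace llvm;

static void removeDeadSuccessors(BasicBlock *BB, ArrayRef<BasicBlock *> Dead,
                                 DominatorTree &DT) {
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(Dead.size());
  for (BasicBlock *Succ : Dead) {
    Succ->removePredecessor(BB); // IR change first
    Updates.push_back({DominatorTree::Delete, BB, Succ});
  }
  DT.applyUpdates(Updates); // one batched, incremental update
}
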
@@ -350,6 +372,11 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
return false;
return true;
}
+ if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
+ if (DLI->getLabel())
+ return false;
+ return true;
+ }
if (!I->mayHaveSideEffects())
return true;
@@ -357,8 +384,9 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
// Special case intrinsics that "may have side effects" but can be deleted
// when dead.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Safe to delete llvm.stacksave if dead.
- if (II->getIntrinsicID() == Intrinsic::stacksave)
+ // Safe to delete llvm.stacksave and launder.invariant.group if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave ||
+ II->getIntrinsicID() == Intrinsic::launder_invariant_group)
return true;
// Lifetime intrinsics are dead when their right-hand is undef.
@@ -406,17 +434,31 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
SmallVector<Instruction*, 16> DeadInsts;
DeadInsts.push_back(I);
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
- do {
- I = DeadInsts.pop_back_val();
+ return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI) {
+ // Process the dead instruction list until empty.
+ while (!DeadInsts.empty()) {
+ Instruction &I = *DeadInsts.pop_back_val();
+ assert(I.use_empty() && "Instructions with uses are not dead.");
+ assert(isInstructionTriviallyDead(&I, TLI) &&
+ "Live instruction found in dead worklist!");
+
+ // Don't lose the debug info while deleting the instructions.
+ salvageDebugInfo(I);
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
+ for (Use &OpU : I.operands()) {
+ Value *OpV = OpU.get();
+ OpU.set(nullptr);
- if (!OpV->use_empty()) continue;
+ if (!OpV->use_empty())
+ continue;
// If the operand is an instruction that became dead as we nulled out the
// operand, and if it is 'trivially' dead, delete it in a future loop
@@ -426,10 +468,8 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
DeadInsts.push_back(OpI);
}
- I->eraseFromParent();
- } while (!DeadInsts.empty());
-
- return true;
+ I.eraseFromParent();
+ }
}
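
The loop above is the classic worklist formulation of recursive dead-instruction deletion. A standalone sketch with invented Node types: pop a dead node, drop its operand edges, and push any operand whose last use just went away, so entire dead expression trees die without deep recursion:

#include <vector>

struct Node {
  std::vector<Node *> Operands;
  int UseCount = 0;
};

void deleteDeadNodes(std::vector<Node *> &Worklist) {
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : N->Operands)
      if (--Op->UseCount == 0)  // operand just became trivially dead
        Worklist.push_back(Op);
    N->Operands.clear();        // stands in for eraseFromParent()
    delete N;
  }
}
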
/// areAllUsesEqual - Check whether the uses of a value are all the same.
@@ -481,6 +521,8 @@ simplifyAndDCEInstruction(Instruction *I,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
@@ -583,7 +625,8 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
///
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the and to 0.
-void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+ DeferredDominance *DDT) {
// This only adjusts blocks with PHI nodes.
if (!isa<PHINode>(BB->begin()))
return;
@@ -606,13 +649,18 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
// of the block.
if (PhiIt != OldPhiIt) PhiIt = &BB->front();
}
+ if (DDT)
+ DDT->deleteEdge(Pred, BB);
}
/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
/// predecessor is known to have one successor (DestBB!). Eliminate the edge
/// between them, moving the instructions in the predecessor into DestBB and
/// deleting the predecessor block.
-void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT,
+ DeferredDominance *DDT) {
+ assert(!(DT && DDT) && "Cannot call with both DT and DDT.");
+
// If BB has single-entry PHI nodes, fold them.
while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
Value *NewVal = PN->getIncomingValue(0);
@@ -625,6 +673,24 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
BasicBlock *PredBB = DestBB->getSinglePredecessor();
assert(PredBB && "Block doesn't have a single predecessor!");
+ bool ReplaceEntryBB = false;
+ if (PredBB == &DestBB->getParent()->getEntryBlock())
+ ReplaceEntryBB = true;
+
+ // Deferred DT update: Collect all the edges that enter PredBB. These
+ // dominator edges will be redirected to DestBB.
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT && !ReplaceEntryBB) {
+ Updates.reserve(1 + (2 * pred_size(PredBB)));
+ Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
+ for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, *I, PredBB});
+ // This predecessor of PredBB may already have DestBB as a successor.
+ if (llvm::find(successors(*I), DestBB) == succ_end(*I))
+ Updates.push_back({DominatorTree::Insert, *I, DestBB});
+ }
+ }
+
// Zap anything that took the address of DestBB. Not doing this will give the
// address an invalid value.
if (DestBB->hasAddressTaken()) {
@@ -645,7 +711,7 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
// If the PredBB is the entry block of the function, move DestBB up to
// become the entry block after we erase PredBB.
- if (PredBB == &DestBB->getParent()->getEntryBlock())
+ if (ReplaceEntryBB)
DestBB->moveAfter(PredBB);
if (DT) {
@@ -657,8 +723,19 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
DT->eraseNode(PredBB);
}
}
- // Nuke BB.
- PredBB->eraseFromParent();
+
+ if (DDT) {
+ DDT->deleteBB(PredBB); // Deferred deletion of BB.
+ if (ReplaceEntryBB)
+ // The entry block was removed and there is no external interface for the
+ // dominator tree to be notified of this change. In this corner-case we
+ // recalculate the entire tree.
+ DDT->recalculate(*(DestBB->getParent()));
+ else
+ DDT->applyUpdates(Updates);
+ } else {
+ PredBB->eraseFromParent(); // Nuke BB.
+ }
}
/// CanMergeValues - Return true if we can choose one of these values to use
@@ -675,8 +752,8 @@ static bool CanMergeValues(Value *First, Value *Second) {
static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
- DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
- << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
// Shortcut, if there is only a single predecessor it must be BB and merging
// is always safe
if (Succ->getSinglePredecessor()) return true;
@@ -699,10 +776,11 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
if (BBPreds.count(IBB) &&
!CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
PN->getIncomingValue(PI))) {
- DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with "
- << BBPN->getName() << " with regard to common predecessor "
- << IBB->getName() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << IBB->getName() << "\n");
return false;
}
}
@@ -715,9 +793,10 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
BasicBlock *IBB = PN->getIncomingBlock(PI);
if (BBPreds.count(IBB) &&
!CanMergeValues(Val, PN->getIncomingValue(PI))) {
- DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with regard to common "
- << "predecessor " << IBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
+ << " in " << Succ->getName()
+ << " is conflicting with regard to common "
+ << "predecessor " << IBB->getName() << "\n");
return false;
}
}
@@ -730,7 +809,7 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
using PredBlockVector = SmallVector<BasicBlock *, 16>;
using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
-/// \brief Determines the value to use as the phi node input for a block.
+/// Determines the value to use as the phi node input for a block.
///
/// Select between \p OldVal any value that we know flows from \p BB
/// to a particular phi on the basis of which one (if either) is not
@@ -759,7 +838,7 @@ static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
return OldVal;
}
-/// \brief Create a map from block to value for the operands of a
+/// Create a map from block to value for the operands of a
/// given phi.
///
/// Create a map from block to value for each non-undef value flowing
@@ -778,7 +857,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN,
}
}
-/// \brief Replace the incoming undef values to a phi with the values
+/// Replace the incoming undef values to a phi with the values
/// from a block-to-value map.
///
/// \param PN The phi we are replacing the undefs in.
@@ -798,7 +877,7 @@ static void replaceUndefValuesInPhi(PHINode *PN,
}
}
-/// \brief Replace a value flowing from a block to a phi with
+/// Replace a value flowing from a block to a phi with
/// potentially multiple instances of that value flowing from the
/// block's predecessors to the phi.
///
@@ -865,7 +944,8 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
/// potential side-effect free intrinsics and the branch. If possible,
/// eliminate BB by rewriting all the predecessors to branch to the successor
/// block and return true. If we can't transform, return false.
-bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ DeferredDominance *DDT) {
assert(BB != &BB->getParent()->getEntryBlock() &&
"TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
@@ -904,7 +984,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
}
}
- DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+ LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DDT) {
+ Updates.reserve(1 + (2 * pred_size(BB)));
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ // All predecessors of BB will be moved to Succ.
+ for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ Updates.push_back({DominatorTree::Delete, *I, BB});
+ // This predecessor of BB may already have Succ as a successor.
+ if (llvm::find(successors(*I), Succ) == succ_end(*I))
+ Updates.push_back({DominatorTree::Insert, *I, Succ});
+ }
+ }
if (isa<PHINode>(Succ->begin())) {
// If there is more than one pred of succ, and there are PHI nodes in
@@ -950,7 +1043,13 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
// Everything that jumped to BB now goes to Succ.
BB->replaceAllUsesWith(Succ);
if (!Succ->hasName()) Succ->takeName(BB);
- BB->eraseFromParent(); // Delete the old basic block.
+
+ if (DDT) {
+ DDT->deleteBB(BB); // Deferred deletion of the old basic block.
+ DDT->applyUpdates(Updates);
+ } else {
+ BB->eraseFromParent(); // Delete the old basic block.
+ }
return true;
}
@@ -1129,6 +1228,31 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
return false;
}
+/// Check if the alloc size of \p ValTy is large enough to cover the variable
+/// (or fragment of the variable) described by \p DII.
+///
+/// This is primarily intended as a helper for the different
+/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
+/// converted describes an alloca'd variable, so we need to use the
+/// alloc size of the value when doing the comparison. E.g. an i1 value will be
+/// identified as covering an n-bit fragment, if the store size of i1 is at
+/// least n bits.
+static bool valueCoversEntireFragment(Type *ValTy, DbgInfoIntrinsic *DII) {
+ const DataLayout &DL = DII->getModule()->getDataLayout();
+ uint64_t ValueSize = DL.getTypeAllocSizeInBits(ValTy);
+ if (auto FragmentSize = DII->getFragmentSizeInBits())
+ return ValueSize >= *FragmentSize;
+ // We can't always calculate the size of the DI variable (e.g. if it is a
+ // VLA). Try to use the size of the alloca that the dbg intrinsic describes
+ // instead.
+ if (DII->isAddressOfVariable())
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
+ if (auto FragmentSize = AI->getAllocationSizeInBits(DL))
+ return ValueSize >= *FragmentSize;
+ // Could not determine size of variable. Conservatively return false.
+ return false;
+}
+
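+ 
A standalone sketch of the coverage test just added (plain C++ stand-ins for the DataLayout/DIExpression queries): a value fully describes a fragment only when its allocated size in bits is at least the fragment's size, and an unknown size is treated conservatively.

#include <cassert>
#include <cstdint>
#include <optional>

bool valueCoversFragment(uint64_t ValueSizeInBits,
                         std::optional<uint64_t> FragmentSizeInBits) {
  if (FragmentSizeInBits)
    return ValueSizeInBits >= *FragmentSizeInBits;
  return false; // size unknown (e.g. a VLA): assume it does not cover
}

int main() {
  assert(valueCoversFragment(8, 1));   // i1 stored with an 8-bit alloc size
  assert(!valueCoversFragment(8, 32)); // i8 cannot cover a 32-bit fragment
  return 0;
}
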
/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
@@ -1139,6 +1263,21 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
auto *DIExpr = DII->getExpression();
Value *DV = SI->getOperand(0);
+ if (!valueCoversEntireFragment(SI->getValueOperand()->getType(), DII)) {
+ // FIXME: If storing to a part of the variable described by the dbg.declare,
+ // then we want to insert a dbg.value for the corresponding fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ // For now, when there is a store to parts of the variable (but we do not
+ // know which part) we insert a dbg.value intrinsic to indicate that we
+ // know nothing about the variable's content.
+ DV = UndefValue::get(DV->getType());
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, DII->getDebugLoc(),
+ SI);
+ return;
+ }
+
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
Argument *ExtendedArg = nullptr;
@@ -1182,6 +1321,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
if (LdStHasDebugValue(DIVar, DIExpr, LI))
return;
+ if (!valueCoversEntireFragment(LI->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
// We are now tracking the loaded value instead of the address. In the
// future if multi-location support is added to the IR, it might be
// preferable to keep tracking both the loaded value and the original
@@ -1202,6 +1350,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,
if (PhiHasDebugValue(DIVar, DIExpr, APN))
return;
+ if (!valueCoversEntireFragment(APN->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
BasicBlock *BB = APN->getParent();
auto InsertionPt = BB->getFirstInsertionPt();
@@ -1241,33 +1398,91 @@ bool llvm::LowerDbgDeclare(Function &F) {
// stored on the stack, while the dbg.declare can only describe
// the stack slot (and at a lexical-scope granularity). Later
// passes will attempt to elide the stack slot.
- if (AI && !isArray(AI)) {
- for (auto &AIUse : AI->uses()) {
- User *U = AIUse.getUser();
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (AIUse.getOperandNo() == 1)
- ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
- // This is a call by-value or some other instruction that
- // takes a pointer to the variable. Insert a *value*
- // intrinsic that describes the alloca.
- DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(),
- DDI->getExpression(), DDI->getDebugLoc(),
- CI);
- }
+ if (!AI || isArray(AI))
+ continue;
+
+ // A volatile load/store means that the alloca can't be elided anyway.
+ if (llvm::any_of(AI->users(), [](User *U) -> bool {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U))
+ return LI->isVolatile();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ return SI->isVolatile();
+ return false;
+ }))
+ continue;
+
+ for (auto &AIUse : AI->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ // This is a call by-value or some other instruction that takes a
+ // pointer to the variable. Insert a *value* intrinsic that describes
+ // the variable by dereferencing the alloca.
+ auto *DerefExpr =
+ DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
+ DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
+ DDI->getDebugLoc(), CI);
}
- DDI->eraseFromParent();
}
+ DDI->eraseFromParent();
}
return true;
}
+/// Propagate dbg.value intrinsics through the newly inserted PHIs.
+void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
+ SmallVectorImpl<PHINode *> &InsertedPHIs) {
+ assert(BB && "No BasicBlock to clone dbg.value(s) from.");
+ if (InsertedPHIs.size() == 0)
+ return;
+
+ // Map existing PHI nodes to their dbg.values.
+ ValueToValueMapTy DbgValueMap;
+ for (auto &I : *BB) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+ if (DbgValueMap.size() == 0)
+ return;
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = BB->getContext();
+ for (auto PHI : InsertedPHIs) {
+ BasicBlock *Parent = PHI->getParent();
+ // Avoid inserting an intrinsic into an EH block.
+ if (Parent->getFirstNonPHI()->isEHPad())
+ continue;
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ NewDbgII->setOperand(0, PhiMAV);
+ auto InsertionPt = Parent->getFirstInsertionPt();
+ assert(InsertionPt != Parent->end() && "Ill-formed basic block");
+ NewDbgII->insertBefore(&*InsertionPt);
+ }
+ }
+ }
+}
+
/// Finds all intrinsics declaring local variables as living in the memory that
/// 'V' points to. This may include a mix of dbg.declare and
/// dbg.addr intrinsics.
TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return {};
auto *L = LocalAsMetadata::getIfExists(V);
if (!L)
return {};
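
The isUsedByMetadata() guard above (the same guard is added to findDbgValues and findDbgUsers below) is a classic hot-path trick: a cheap bit on the value answers the common "no metadata users" case before any map lookup. A standalone sketch with invented types:

#include <unordered_map>
#include <vector>

struct Val {
  bool UsedByMetadata = false;
};

using UserTable = std::unordered_map<const Val *, std::vector<int>>;

const std::vector<int> *findMetadataUsers(const Val *V, const UserTable &T) {
  if (!V->UsedByMetadata)
    return nullptr; // common case: no hash lookup at all
  auto It = T.find(V);
  return It == T.end() ? nullptr : &It->second;
}
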
@@ -1286,6 +1501,10 @@ TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
}
void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1293,8 +1512,12 @@ void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
DbgValues.push_back(DVI);
}
-static void findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers,
- Value *V) {
+void llvm::findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers,
+ Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1312,11 +1535,11 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIExpr = DII->getExpression();
assert(DIVar && "Missing variable");
DIExpr = DIExpression::prepend(DIExpr, DerefBefore, Offset, DerefAfter);
- // Insert llvm.dbg.declare immediately after InsertBefore, and remove old
+ // Insert llvm.dbg.declare immediately before InsertBefore, and remove old
// llvm.dbg.declare.
Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
if (DII == InsertBefore)
- InsertBefore = &*std::next(InsertBefore->getIterator());
+ InsertBefore = InsertBefore->getNextNode();
DII->eraseFromParent();
}
return !DbgAddrs.empty();
@@ -1368,66 +1591,293 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
}
}
-void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgValueInst *, 1> DbgValues;
+/// Wrap \p V in a ValueAsMetadata instance.
+static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
+ return MetadataAsValue::get(C, ValueAsMetadata::get(V));
+}
+
+bool llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ if (DbgUsers.empty())
+ return false;
+
auto &M = *I.getModule();
+ auto &DL = M.getDataLayout();
+ auto &Ctx = I.getContext();
+ auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
- auto wrapMD = [&](Value *V) {
- return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+ auto doSalvage = [&](DbgInfoIntrinsic *DII, SmallVectorImpl<uint64_t> &Ops) {
+ auto *DIExpr = DII->getExpression();
+ if (!Ops.empty()) {
+ // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
+ // are implicitly pointing out the value as a DWARF memory location
+ // description.
+ bool WithStackValue = isa<DbgValueInst>(DII);
+ DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
+ }
+ DII->setOperand(0, wrapMD(I.getOperand(0)));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
};
- auto applyOffset = [&](DbgValueInst *DVI, uint64_t Offset) {
- auto *DIExpr = DVI->getExpression();
- DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
- DIExpression::NoDeref,
- DIExpression::WithStackValue);
- DVI->setOperand(0, wrapMD(I.getOperand(0)));
- DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
- DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ auto applyOffset = [&](DbgInfoIntrinsic *DII, uint64_t Offset) {
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ doSalvage(DII, Ops);
};
- if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) {
- // Bitcasts are entirely irrelevant for debug info. Rewrite dbg.value,
- // dbg.addr, and dbg.declare to use the cast's source.
- SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
+ auto applyOps = [&](DbgInfoIntrinsic *DII,
+ std::initializer_list<uint64_t> Opcodes) {
+ SmallVector<uint64_t, 8> Ops(Opcodes);
+ doSalvage(DII, Ops);
+ };
+
+ if (auto *CI = dyn_cast<CastInst>(&I)) {
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ // No-op casts are irrelevant for debug info.
+ MetadataAsValue *CastSrc = wrapMD(I.getOperand(0));
for (auto *DII : DbgUsers) {
- DII->setOperand(0, wrapMD(I.getOperand(0)));
- DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ DII->setOperand(0, CastSrc);
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
}
+ return true;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues) {
- unsigned BitWidth =
- M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
- APInt Offset(BitWidth, 0);
- // Rewrite a constant GEP into a DIExpression. Since we are performing
- // arithmetic to compute the variable's *value* in the DIExpression, we
- // need to mark the expression with a DW_OP_stack_value.
- if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
- // GEP offsets are i32 and thus always fit into an int64_t.
- applyOffset(DVI, Offset.getSExtValue());
- }
+ unsigned BitWidth =
+ M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
+ // Rewrite a constant GEP into a DIExpression. Since we are performing
+ // arithmetic to compute the variable's *value* in the DIExpression, we
+ // need to mark the expression with a DW_OP_stack_value.
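+    // For example, a dbg.value use of `%p = getelementptr i8, i8* %q, i64 8`
+    // would be rewritten to refer to %q, with `DW_OP_plus_uconst 8` and
+    // `DW_OP_stack_value` folded into its expression.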
+ APInt Offset(BitWidth, 0);
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
+ for (auto *DII : DbgUsers)
+ applyOffset(DII, Offset.getSExtValue());
+ return true;
} else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
- if (BI->getOpcode() == Instruction::Add)
- if (auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1)))
- if (ConstInt->getBitWidth() <= 64) {
- APInt Offset = ConstInt->getValue();
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues)
- applyOffset(DVI, Offset.getSExtValue());
- }
+ // Rewrite binary operations with constant integer operands.
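+    // For example, salvaging `%a = add i64 %b, 4` rewrites a user
+    // dbg.value(%a, ...) to point at %b, with `DW_OP_plus_uconst 4` and
+    // `DW_OP_stack_value` folded into its expression.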
+ auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!ConstInt || ConstInt->getBitWidth() > 64)
+ return false;
+
+ uint64_t Val = ConstInt->getSExtValue();
+ for (auto *DII : DbgUsers) {
+ switch (BI->getOpcode()) {
+ case Instruction::Add:
+ applyOffset(DII, Val);
+ break;
+ case Instruction::Sub:
+ applyOffset(DII, -int64_t(Val));
+ break;
+ case Instruction::Mul:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
+ break;
+ case Instruction::SDiv:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
+ break;
+ case Instruction::SRem:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
+ break;
+ case Instruction::Or:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
+ break;
+ case Instruction::And:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
+ break;
+ case Instruction::Xor:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
+ break;
+ case Instruction::Shl:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
+ break;
+ case Instruction::LShr:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
+ break;
+ case Instruction::AShr:
+ applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
+ break;
+ default:
+ // TODO: Salvage constants from each kind of binop we know about.
+ return false;
+ }
+ }
+ return true;
} else if (isa<LoadInst>(&I)) {
- findDbgValues(DbgValues, &I);
- for (auto *DVI : DbgValues) {
+ MetadataAsValue *AddrMD = wrapMD(I.getOperand(0));
+ for (auto *DII : DbgUsers) {
// Rewrite the load into DW_OP_deref.
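+      // For example, a dbg.value use of `%v = load i32, i32* %p` becomes a
+      // dbg.value of %p with DW_OP_deref prepended to its expression.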
- auto *DIExpr = DVI->getExpression();
+ auto *DIExpr = DII->getExpression();
DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
- DVI->setOperand(0, wrapMD(I.getOperand(0)));
- DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
- DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ DII->setOperand(0, AddrMD);
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ }
+ return true;
+ }
+ return false;
+}
+
+/// A replacement for a dbg.value expression.
+using DbgValReplacement = Optional<DIExpression *>;
+
+/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
+/// possibly moving/deleting users to prevent use-before-def. Returns true if
+/// changes are made.
+static bool rewriteDebugUsers(
+ Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
+ function_ref<DbgValReplacement(DbgInfoIntrinsic &DII)> RewriteExpr) {
+ // Find debug users of From.
+ SmallVector<DbgInfoIntrinsic *, 1> Users;
+ findDbgUsers(Users, &From);
+ if (Users.empty())
+ return false;
+
+ // Prevent use-before-def of To.
+ bool Changed = false;
+ SmallPtrSet<DbgInfoIntrinsic *, 1> DeleteOrSalvage;
+ if (isa<Instruction>(&To)) {
+ bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
+
+ for (auto *DII : Users) {
+ // It's common to see a debug user between From and DomPoint. Move it
+ // after DomPoint to preserve the variable update without any reordering.
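+      // For example, with From = %f and DomPoint = %d:
+      //   %f = add ...; dbg.value(%f, ...); %d = zext %f ...
+      // becomes:
+      //   %f = add ...; %d = zext %f ...; dbg.value(%f, ...)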
+ if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
+ LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
+ DII->moveAfter(&DomPoint);
+ Changed = true;
+
+ // Users which otherwise aren't dominated by the replacement value must
+ // be salvaged or deleted.
+ } else if (!DT.dominates(&DomPoint, DII)) {
+ DeleteOrSalvage.insert(DII);
+ }
}
}
+
+ // Update debug users without use-before-def risk.
+ for (auto *DII : Users) {
+ if (DeleteOrSalvage.count(DII))
+ continue;
+
+ LLVMContext &Ctx = DII->getContext();
+ DbgValReplacement DVR = RewriteExpr(*DII);
+ if (!DVR)
+ continue;
+
+ DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
+ LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
+ Changed = true;
+ }
+
+ if (!DeleteOrSalvage.empty()) {
+ // Try to salvage the remaining debug users.
+ Changed |= salvageDebugInfo(From);
+
+ // Delete the debug users which weren't salvaged.
+ for (auto *DII : DeleteOrSalvage) {
+ if (DII->getVariableLocation() == &From) {
+ LLVM_DEBUG(dbgs() << "Erased UseBeforeDef: " << *DII << '\n');
+ DII->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// Check if a bitcast between a value of type \p FromTy to type \p ToTy would
+/// losslessly preserve the bits and semantics of the value. This predicate is
+/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
+///
+/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
+/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
+/// and also does not allow lossless pointer <-> integer conversions.
+static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
+ Type *ToTy) {
+ // Trivially compatible types.
+ if (FromTy == ToTy)
+ return true;
+
+ // Handle compatible pointer <-> integer conversions.
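+  // For example, i8* <-> i64 is lossless on a target with 64-bit integral
+  // pointers, whereas i8* <-> i32 there is not, and non-integral pointers
+  // never are.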
+ if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
+ bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
+ bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
+ !DL.isNonIntegralPointerType(ToTy);
+ return SameSize && LosslessConversion;
+ }
+
+ // TODO: This is not exhaustive.
+ return false;
+}
+
+bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
+ Instruction &DomPoint, DominatorTree &DT) {
+ // Exit early if From has no debug users.
+ if (!From.isUsedByMetadata())
+ return false;
+
+ assert(&From != &To && "Can't replace something with itself");
+
+ Type *FromTy = From.getType();
+ Type *ToTy = To.getType();
+
+ auto Identity = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement {
+ return DII.getExpression();
+ };
+
+ // Handle no-op conversions.
+ Module &M = *From.getModule();
+ const DataLayout &DL = M.getDataLayout();
+ if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // Handle integer-to-integer widening and narrowing.
+ // FIXME: Use DW_OP_convert when it's available everywhere.
+ if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
+ uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
+ uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
+ assert(FromBits != ToBits && "Unexpected no-op conversion");
+
+ // When the width of the result grows, assume that a debugger will only
+ // access the low `FromBits` bits when inspecting the source variable.
+ if (FromBits < ToBits)
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // The width of the result has shrunk. Use sign/zero extension to describe
+ // the source variable's high bits.
+ auto SignOrZeroExt = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement {
+ DILocalVariable *Var = DII.getVariable();
+
+ // Without knowing signedness, sign/zero extension isn't possible.
+ auto Signedness = Var->getSignedness();
+ if (!Signedness)
+ return None;
+
+ bool Signed = *Signedness == DIBasicType::Signedness::Signed;
+
+ if (!Signed) {
+ // In the unsigned case, assume that a debugger will initialize the
+ // high bits to 0 and do a no-op conversion.
+ return Identity(DII);
+ } else {
+ // In the signed case, the high bits are given by sign extension, i.e:
+ // (To >> (ToBits - 1)) * ((2 ^ FromBits) - 1)
+ // Calculate the high bits and OR them together with the low bits.
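+        // Step by step on the DWARF expression stack, with the value V on
+        // top:
+        //   DW_OP_dup                     push a second copy of V
+        //   DW_OP_constu (ToBits - 1),
+        //   DW_OP_shr                     isolate the sign bit of V
+        //   DW_OP_lit0, DW_OP_not         push an all-ones mask (~0)
+        //   DW_OP_mul                     ~0 if the sign bit was set, else 0
+        //   DW_OP_or                      merge the high bits into V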
+ SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_dup, dwarf::DW_OP_constu,
+ (ToBits - 1), dwarf::DW_OP_shr,
+ dwarf::DW_OP_lit0, dwarf::DW_OP_not,
+ dwarf::DW_OP_mul, dwarf::DW_OP_or});
+ return DIExpression::appendToStack(DII.getExpression(), Ops);
+ }
+ };
+ return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
+ }
+
+ // TODO: Floating-point conversions, vectors.
+ return false;
}
unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
@@ -1452,13 +1902,19 @@ unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
}
unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
- bool PreserveLCSSA) {
+ bool PreserveLCSSA, DeferredDominance *DDT) {
BasicBlock *BB = I->getParent();
+  std::vector<DominatorTree::UpdateType> Updates;
+
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
- for (BasicBlock *Successor : successors(BB))
+ if (DDT)
+ Updates.reserve(BB->getTerminator()->getNumSuccessors());
+ for (BasicBlock *Successor : successors(BB)) {
Successor->removePredecessor(BB, PreserveLCSSA);
-
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Successor});
+ }
// Insert a call to llvm.trap right before this. This turns the undefined
// behavior into a hard fail instead of falling through into random code.
if (UseLLVMTrap) {
@@ -1478,11 +1934,13 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
BB->getInstList().erase(BBI++);
++NumInstrsRemoved;
}
+ if (DDT)
+ DDT->applyUpdates(Updates);
return NumInstrsRemoved;
}
/// changeToCall - Convert the specified invoke into a normal call.
-static void changeToCall(InvokeInst *II) {
+static void changeToCall(InvokeInst *II, DeferredDominance *DDT = nullptr) {
SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end());
SmallVector<OperandBundleDef, 1> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
@@ -1495,11 +1953,16 @@ static void changeToCall(InvokeInst *II) {
II->replaceAllUsesWith(NewCall);
// Follow the call by a branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II);
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BranchInst::Create(NormalDestBB, II);
// Update PHI nodes in the unwind destination
- II->getUnwindDest()->removePredecessor(II->getParent());
+ BasicBlock *BB = II->getParent();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ UnwindDestBB->removePredecessor(BB);
II->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDestBB);
}
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
@@ -1540,7 +2003,8 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
}
static bool markAliveBlocks(Function &F,
- SmallPtrSetImpl<BasicBlock*> &Reachable) {
+ SmallPtrSetImpl<BasicBlock*> &Reachable,
+ DeferredDominance *DDT = nullptr) {
SmallVector<BasicBlock*, 128> Worklist;
BasicBlock *BB = &F.front();
Worklist.push_back(BB);
@@ -1553,41 +2017,44 @@ static bool markAliveBlocks(Function &F,
// instructions into LLVM unreachable insts. The instruction combining pass
// canonicalizes unreachable insts into stores to null or undef.
for (Instruction &I : *BB) {
- // Assumptions that are known to be false are equivalent to unreachable.
- // Also, if the condition is undefined, then we make the choice most
- // beneficial to the optimizer, and choose that to also be unreachable.
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- if (II->getIntrinsicID() == Intrinsic::assume) {
- if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(II, false);
- Changed = true;
- break;
- }
- }
-
- if (II->getIntrinsicID() == Intrinsic::experimental_guard) {
- // A call to the guard intrinsic bails out of the current compilation
- // unit if the predicate passed to it is false. If the predicate is a
- // constant false, then we know the guard will bail out of the current
- // compile unconditionally, so all code following it is dead.
- //
- // Note: unlike in llvm.assume, it is not "obviously profitable" for
- // guards to treat `undef` as `false` since a guard on `undef` can
- // still be useful for widening.
- if (match(II->getArgOperand(0), m_Zero()))
- if (!isa<UnreachableInst>(II->getNextNode())) {
- changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false);
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledValue();
+ // Handle intrinsic calls.
+ if (Function *F = dyn_cast<Function>(Callee)) {
+ auto IntrinsicID = F->getIntrinsicID();
+ // Assumptions that are known to be false are equivalent to
+ // unreachable. Also, if the condition is undefined, then we make the
+ // choice most beneficial to the optimizer, and choose that to also be
+ // unreachable.
+ if (IntrinsicID == Intrinsic::assume) {
+ if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI, false, false, DDT);
Changed = true;
break;
}
- }
- }
-
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- Value *Callee = CI->getCalledValue();
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- changeToUnreachable(CI, /*UseLLVMTrap=*/false);
+ } else if (IntrinsicID == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current
+ // compilation unit if the predicate passed to it is false. If the
+ // predicate is a constant false, then we know the guard will bail
+ // out of the current compile unconditionally, so all code following
+ // it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(CI->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
+ false, DDT);
+ Changed = true;
+ break;
+ }
+ }
+ } else if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CI->getFunction())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DDT);
Changed = true;
break;
}
@@ -1597,17 +2064,16 @@ static bool markAliveBlocks(Function &F,
// though.
if (!isa<UnreachableInst>(CI->getNextNode())) {
// Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI->getNextNode(), false);
+ changeToUnreachable(CI->getNextNode(), false, false, DDT);
Changed = true;
}
break;
}
- }
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ // Store to undef and store to null are undefined and used to signal
+ // that they should be changed to unreachable by passes that can't
+ // modify the CFG.
- // Store to undef and store to null are undefined and used to signal that
- // they should be changed to unreachable by passes that can't modify the
- // CFG.
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
// Don't touch volatile stores.
if (SI->isVolatile()) continue;
@@ -1615,8 +2081,9 @@ static bool markAliveBlocks(Function &F,
if (isa<UndefValue>(Ptr) ||
(isa<ConstantPointerNull>(Ptr) &&
- SI->getPointerAddressSpace() == 0)) {
- changeToUnreachable(SI, true);
+ !NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace()))) {
+ changeToUnreachable(SI, true, false, DDT);
Changed = true;
break;
}
@@ -1627,17 +2094,23 @@ static bool markAliveBlocks(Function &F,
if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
// Turn invokes that call 'nounwind' functions into ordinary calls.
Value *Callee = II->getCalledValue();
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- changeToUnreachable(II, true);
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(BB->getParent())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true, false, DDT);
Changed = true;
} else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
if (II->use_empty() && II->onlyReadsMemory()) {
// jump to the normal destination branch.
- BranchInst::Create(II->getNormalDest(), II);
- II->getUnwindDest()->removePredecessor(II->getParent());
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ BranchInst::Create(NormalDestBB, II);
+ UnwindDestBB->removePredecessor(II->getParent());
II->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDestBB);
} else
- changeToCall(II);
+ changeToCall(II, DDT);
Changed = true;
}
} else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
@@ -1683,7 +2156,7 @@ static bool markAliveBlocks(Function &F,
}
}
- Changed |= ConstantFoldTerminator(BB, true);
+ Changed |= ConstantFoldTerminator(BB, true, nullptr, DDT);
for (BasicBlock *Successor : successors(BB))
if (Reachable.insert(Successor).second)
Worklist.push_back(Successor);
@@ -1691,11 +2164,11 @@ static bool markAliveBlocks(Function &F,
return Changed;
}
-void llvm::removeUnwindEdge(BasicBlock *BB) {
+void llvm::removeUnwindEdge(BasicBlock *BB, DeferredDominance *DDT) {
TerminatorInst *TI = BB->getTerminator();
if (auto *II = dyn_cast<InvokeInst>(TI)) {
- changeToCall(II);
+ changeToCall(II, DDT);
return;
}
@@ -1723,15 +2196,18 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {
UnwindDest->removePredecessor(BB);
TI->replaceAllUsesWith(NewTI);
TI->eraseFromParent();
+ if (DDT)
+ DDT->deleteEdge(BB, UnwindDest);
}
/// removeUnreachableBlocks - Remove blocks that are not reachable, even
/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo
/// after modifying the CFG.
-bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
+bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI,
+ DeferredDominance *DDT) {
SmallPtrSet<BasicBlock*, 16> Reachable;
- bool Changed = markAliveBlocks(F, Reachable);
+ bool Changed = markAliveBlocks(F, Reachable, DDT);
// If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
@@ -1741,25 +2217,39 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
NumRemoved += F.size()-Reachable.size();
// Loop over all of the basic blocks that are not reachable, dropping all of
- // their internal references...
- for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
- if (Reachable.count(&*BB))
+ // their internal references. Update DDT and LVI if available.
+  std::vector<DominatorTree::UpdateType> Updates;
+ for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) {
+ auto *BB = &*I;
+ if (Reachable.count(BB))
continue;
-
- for (BasicBlock *Successor : successors(&*BB))
+ for (BasicBlock *Successor : successors(BB)) {
if (Reachable.count(Successor))
- Successor->removePredecessor(&*BB);
+ Successor->removePredecessor(BB);
+ if (DDT)
+ Updates.push_back({DominatorTree::Delete, BB, Successor});
+ }
if (LVI)
- LVI->eraseBlock(&*BB);
+ LVI->eraseBlock(BB);
BB->dropAllReferences();
}
- for (Function::iterator I = ++F.begin(); I != F.end();)
- if (!Reachable.count(&*I))
- I = F.getBasicBlockList().erase(I);
- else
+ for (Function::iterator I = ++F.begin(); I != F.end();) {
+ auto *BB = &*I;
+ if (Reachable.count(BB)) {
++I;
+ continue;
+ }
+ if (DDT) {
+ DDT->deleteBB(BB); // deferred deletion of BB.
+ ++I;
+ } else {
+ I = F.getBasicBlockList().erase(I);
+ }
+ }
+ if (DDT)
+ DDT->applyUpdates(Updates);
return true;
}
@@ -1852,8 +2342,8 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To,
if (!Dominates(Root, U))
continue;
U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
+ LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
+ << "' as " << *To << " in " << *U << "\n");
++Count;
}
return Count;
@@ -1957,7 +2447,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
if (!NewTy->isPointerTy())
return;
- unsigned BitWidth = DL.getTypeSizeInBits(NewTy);
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy);
if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
MDNode *NN = MDNode::get(OldLI.getContext(), None);
NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
@@ -2269,7 +2759,7 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
// Static allocas (constant size in the entry block) are handled by
// prologue/epilogue insertion so they're free anyway. We definitely don't
// want to make them non-constant.
- return !dyn_cast<AllocaInst>(I)->isStaticAlloca();
+ return !cast<AllocaInst>(I)->isStaticAlloca();
case Instruction::GetElementPtr:
if (OpIdx == 0)
return true;
diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp
new file mode 100644
index 000000000000..6e92e679f999
--- /dev/null
+++ b/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -0,0 +1,645 @@
+//===----------------- LoopRotationUtils.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utilities to convert a loop into a loop with bottom test.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+STATISTIC(NumRotated, "Number of loops rotated");
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
+ bool RotationOnly;
+ bool IsUtilMode;
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ,
+ bool RotationOnly, bool IsUtilMode)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode) {}
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI
+/// to merge the two values. Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+    // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
+ }
+ }
+}
+
+// Look for a phi which is only used outside the loop (via an LCSSA phi)
+// in the exit from the header. This means that rotating the loop can
+// remove the phi.
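+// For example, if %p is a header phi whose only users are LCSSA phis in the
+// exit reached from the header, rotation can eliminate %p entirely.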
+static bool shouldRotateLoopExitingLatch(Loop *L) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0);
+ if (L->contains(HeaderExit))
+ HeaderExit = Header->getTerminator()->getSuccessor(1);
+
+ for (auto &Phi : Header->phis()) {
+ // Look for uses of this phi in the loop/via exits other than the header.
+ if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
+ return cast<Instruction>(U)->getParent() != HeaderExit;
+ }))
+ continue;
+ return true;
+ }
+
+ return false;
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
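+///
+/// Roughly, rotation turns a top-test loop (block names illustrative):
+///
+///   header:  %c = <exit test>; br %c, %body, %exit
+///   body:    ...; br %header
+///
+/// into a bottom-test loop, with a clone of the test guarding entry:
+///
+///   preheader: %c0 = <cloned exit test>; br %c0, %newheader, %exit
+///   newheader: ...; %c = <exit test>; br %c, %newheader, %exit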
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return false;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return false;
+
+  // Rotate if the loop latch does *not* exit the loop, if the loop latch was
+  // just simplified, or if we think rotation will be profitable.
+  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && !IsUtilMode &&
+ !shouldRotateLoopExitingLatch(L))
+ return false;
+
+  // Check the size of the original header and reject the loop if it is very
+  // big or we can't duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.convergent) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize)
+ return false;
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return false;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+  // Find the new loop header. NewHeader is the header's one and only
+  // successor that is inside the loop; the header's other successor is
+  // outside the loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifyable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ ValueMap[Inst] = V;
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ ValueMap[Inst] = C;
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ TerminatorInst *TI = OrigHeader->getTerminator();
+ for (BasicBlock *SuccBB : TI->successors())
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+ // the DT about the removed edge to the OrigHeader (that got removed).
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+ DT->applyUpdates(Updates);
+ }
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit))
+ continue;
+ if (isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+ return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+  bool MultiExitLoop = !L->getExitingBlock();
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
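+///
+/// Roughly, given (block names illustrative):
+///   exiting: ...; br %c, %latch, %exit
+///   latch:   %i.next = add %i, 1; br %header
+/// the latch instructions are speculated into the exiting block, which then
+/// becomes the new latch:
+///   exiting: ...; %i.next = add %i, 1; br %c, %header, %exit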
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ // Hoist the instructions from Latch into LastExit.
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+ Latch->begin(), Jmp->getIterator());
+
+ unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+ BasicBlock *Header = Jmp->getSuccessor(0);
+ assert(Header == L->getHeader() && "expected a backward branch");
+
+ // Remove Latch from the CFG so that LastExit becomes the new Latch.
+ BI->setSuccessor(FallThruPath, Header);
+ Latch->replaceSuccessorsPhiUsesWith(LastExit);
+ Jmp->eraseFromParent();
+
+ // Nuke the Latch block.
+ assert(Latch->empty() && "unable to evacuate Latch");
+ LI->removeBlock(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
+ Latch->eraseFromParent();
+ return true;
+}
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ bool SimplifiedLatch = false;
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ if (!RotationOnly)
+ SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange || SimplifiedLatch;
+}
+
+/// The utility to convert a loop into a loop with bottom test.
+bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
+ AssumptionCache *AC, DominatorTree *DT,
+ ScalarEvolution *SE, const SimplifyQuery &SQ,
+ bool RotationOnly = true,
+ unsigned Threshold = unsigned(-1),
+ bool IsUtilMode = true) {
+ LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, SQ, RotationOnly, IsUtilMode);
+
+ return LR.processLoop(L);
+}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index f43af9772771..970494eb4704 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -52,6 +52,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -64,9 +65,8 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -141,8 +141,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
if (!PreheaderBB)
return nullptr;
- DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
- << PreheaderBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+ << PreheaderBB->getName() << "\n");
// Make sure that NewBB is put someplace intelligent, which doesn't mess up
// code layout too horribly.
@@ -170,7 +170,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
} while (!Worklist.empty());
}
-/// \brief The first part of loop-nestification is to find a PHI node that tells
+/// The first part of loop-nestification is to find a PHI node that tells
/// us how to partition the loops.
static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
AssumptionCache *AC) {
@@ -195,7 +195,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
return nullptr;
}
-/// \brief If this loop has multiple backedges, try to pull one of them out into
+/// If this loop has multiple backedges, try to pull one of them out into
/// a nested loop.
///
/// This is important for code that looks like
@@ -242,7 +242,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
OuterLoopPreds.push_back(PN->getIncomingBlock(i));
}
}
- DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
// If ScalarEvolution is around and knows anything about values in
// this loop, tell it to forget them, because we're about to
@@ -332,7 +332,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
return NewOuter;
}
-/// \brief This method is called when the specified loop has more than one
+/// This method is called when the specified loop has more than one
/// backedge in it.
///
/// If this occurs, revector all of these backedges to target a new basic block
@@ -371,8 +371,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
- DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
- << BEBlock->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+ << BEBlock->getName() << "\n");
// Move the new backedge block to right after the last backedge block.
Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
@@ -457,7 +457,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
return BEBlock;
}
-/// \brief Simplify one loop and queue further loops for simplification.
+/// Simplify one loop and queue further loops for simplification.
static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
@@ -484,8 +484,8 @@ ReprocessLoop:
// Delete each unique out-of-loop (and thus dead) predecessor.
for (BasicBlock *P : BadPreds) {
- DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
- << P->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+ << P->getName() << "\n");
// Zap the dead pred's terminator and replace it with unreachable.
TerminatorInst *TI = P->getTerminator();
@@ -504,16 +504,13 @@ ReprocessLoop:
if (BI->isConditional()) {
if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
- DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << ExitingBlock->getName() << "\n");
+ LLVM_DEBUG(dbgs()
+ << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
BI->setCondition(ConstantInt::get(Cond->getType(),
!L->contains(BI->getSuccessor(0))));
- // This may make the loop analyzable, force SCEV recomputation.
- if (SE)
- SE->forgetLoop(L);
-
Changed = true;
}
}
@@ -617,11 +614,8 @@ ReprocessLoop:
// comparison and the branch.
bool AllInvariant = true;
bool AnyInvariant = false;
- for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) {
+ for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
Instruction *Inst = &*I++;
- // Skip debug info intrinsics.
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
if (Inst == CI)
continue;
if (!L->makeLoopInvariant(Inst, AnyInvariant,
@@ -648,15 +642,8 @@ ReprocessLoop:
// Success. The block is now dead, so remove it from the loop,
// update the dominator tree and delete it.
- DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
- << ExitingBlock->getName() << "\n");
-
- // Notify ScalarEvolution before deleting this block. Currently assume the
- // parent loop doesn't change (spliting edges doesn't count). If blocks,
- // CFG edges, or other values in the parent loop change, then we need call
- // to forgetLoop() for the parent instead.
- if (SE)
- SE->forgetLoop(L);
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+ << ExitingBlock->getName() << "\n");
assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
Changed = true;
@@ -679,6 +666,12 @@ ReprocessLoop:
}
}
+ // Changing exit conditions for blocks may affect exit counts of this loop and
+  // any of its parents, so we must invalidate the entire subtree if we've made
+ // any changes.
+ if (Changed && SE)
+ SE->forgetTopmostLoop(L);
+
return Changed;
}
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index dc98a39adcc5..04b8c1417e0a 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -33,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -63,8 +63,7 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
-static inline void remapInstruction(Instruction *I,
- ValueToValueMapTy &VMap) {
+void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
Value *Op = I->getOperand(op);
@@ -97,16 +96,10 @@ static inline void remapInstruction(Instruction *I,
/// Folds a basic block into its predecessor if it only has one predecessor, and
/// that predecessor only has one successor.
-/// The LoopInfo Analysis that is passed will be kept consistent. If folding is
-/// successful references to the containing loop must be removed from
-/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
-/// references to the eliminated BB. The argument ForgottenLoops contains a set
-/// of loops that have already been forgotten to prevent redundant, expensive
-/// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
-static BasicBlock *
-foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
- SmallPtrSetImpl<Loop *> &ForgottenLoops,
- DominatorTree *DT) {
+/// The LoopInfo Analysis that is passed will be kept consistent.
+BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
+ ScalarEvolution *SE,
+ DominatorTree *DT) {
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@@ -116,7 +109,8 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
return nullptr;
- DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
+ LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
+ << OnlyPred->getName() << "\n");
// Resolve any PHI nodes at the start of the block. They are all
// guaranteed to have exactly one entry if they exist, unless there are
@@ -149,13 +143,6 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
DT->eraseNode(BB);
}
- // ScalarEvolution holds references to loop exit blocks.
- if (SE) {
- if (Loop *L = LI->getLoopFor(BB)) {
- if (ForgottenLoops.insert(L).second)
- SE->forgetLoop(L);
- }
- }
LI->removeBlock(BB);
// Inherit predecessor's name if it exists...
@@ -258,16 +245,55 @@ static bool isEpilogProfitable(Loop *L) {
BasicBlock *PreHeader = L->getLoopPreheader();
BasicBlock *Header = L->getHeader();
assert(PreHeader && Header);
- for (Instruction &BBI : *Header) {
- PHINode *PN = dyn_cast<PHINode>(&BBI);
- if (!PN)
- break;
- if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
+ for (const PHINode &PN : Header->phis()) {
+ if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
return true;
}
return false;
}
+/// Perform some cleanup and simplifications on loops after unrolling. It is
+/// useful to simplify the IVs in the new loop, as well as to do a quick
+/// simplify/DCE pass over the instructions.
+void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC) {
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && SimplifyIVs) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ const std::vector<BasicBlock *> &NewLoopBlocks = L->getBlocks();
+ for (BasicBlock *BB : NewLoopBlocks) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
+ Inst->replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
+ }
+ }
+
+  // TODO: after peeling or unrolling, previously loop-variant conditions are
+  // likely to fold to constants; eagerly propagating those here would require
+  // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
+  // appropriate.
+}
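+
+// Usage sketch (illustrative, not part of the original change): callers run
+// this combined cleanup right after duplicating loop bodies, e.g.
+//   simplifyLoopAfterUnroll(L, /*SimplifyIVs*/ true, LI, SE, DT, AC);
+// IV simplification only pays off while a loop structure survives, which is
+// why the callers below pass SimplifyIVs = false for completely unrolled
+// loops.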
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
@@ -313,19 +339,19 @@ LoopUnrollResult llvm::UnrollLoop(
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
- DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
BasicBlock *LatchBlock = L->getLoopLatch();
if (!LatchBlock) {
- DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
// Loops with indirectbr cannot be cloned.
if (!L->isSafeToClone()) {
- DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
+ LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
return LoopUnrollResult::Unmodified;
}
@@ -338,8 +364,9 @@ LoopUnrollResult llvm::UnrollLoop(
if (!BI || BI->isUnconditional()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
- DEBUG(dbgs() <<
- " Can't unroll; loop not terminated by a conditional branch.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Can't unroll; loop not terminated by a conditional branch.\n");
return LoopUnrollResult::Unmodified;
}
@@ -348,22 +375,22 @@ LoopUnrollResult llvm::UnrollLoop(
};
if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
- DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
- " exiting the loop can be unrolled\n");
+ LLVM_DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
+ " exiting the loop can be unrolled\n");
return LoopUnrollResult::Unmodified;
}
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
- DEBUG(dbgs() <<
- " Won't unroll loop: address of header block is taken.\n");
+ LLVM_DEBUG(
+ dbgs() << " Won't unroll loop: address of header block is taken.\n");
return LoopUnrollResult::Unmodified;
}
if (TripCount != 0)
- DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
+ LLVM_DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
if (TripMultiple != 1)
- DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
+ LLVM_DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
// Effectively "DCE" unrolled iterations that are beyond the tripcount
// and will never be executed.
@@ -372,7 +399,7 @@ LoopUnrollResult llvm::UnrollLoop(
// Don't enter the unroll code if there is nothing to do.
if (TripCount == 0 && Count < 2 && PeelCount == 0) {
- DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
return LoopUnrollResult::Unmodified;
}
@@ -406,8 +433,9 @@ LoopUnrollResult llvm::UnrollLoop(
"Did not expect runtime trip-count unrolling "
"and peeling for the same loop");
+ bool Peeled = false;
if (PeelCount) {
- bool Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
+ Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
// Successful peeling may result in a change in the loop preheader/trip
// counts. If we later unroll the loop, we want these to be updated.
@@ -422,7 +450,7 @@ LoopUnrollResult llvm::UnrollLoop(
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
- DEBUG(
+ LLVM_DEBUG(
{
bool HasConvergent = false;
for (auto &BB : L->blocks())
@@ -445,18 +473,12 @@ LoopUnrollResult llvm::UnrollLoop(
if (Force)
RuntimeTripCount = false;
else {
- DEBUG(
- dbgs() << "Wont unroll; remainder loop could not be generated"
- "when assuming runtime trip count\n");
+ LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
}
}
- // Notify ScalarEvolution that the loop will be substantially changed,
- // if not outright eliminated.
- if (SE)
- SE->forgetLoop(L);
-
// If we know the trip count, we know the multiple...
unsigned BreakoutTrip = 0;
if (TripCount != 0) {
@@ -471,8 +493,8 @@ LoopUnrollResult llvm::UnrollLoop(
using namespace ore;
// Report the unrolling decision.
if (CompletelyUnroll) {
- DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
- << " with trip count " << TripCount << "!\n");
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << TripCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
@@ -481,8 +503,8 @@ LoopUnrollResult llvm::UnrollLoop(
<< NV("UnrollCount", TripCount) << " iterations";
});
} else if (PeelCount) {
- DEBUG(dbgs() << "PEELING loop %" << Header->getName()
- << " with iteration count " << PeelCount << "!\n");
+ LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << PeelCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
@@ -498,31 +520,42 @@ LoopUnrollResult llvm::UnrollLoop(
<< NV("UnrollCount", Count);
};
- DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
- << " by " << Count);
+ LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
+ << Count);
if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
- DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+ LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
if (ORE)
ORE->emit([&]() {
return DiagBuilder() << " with a breakout at trip "
<< NV("BreakoutTrip", BreakoutTrip);
});
} else if (TripMultiple != 1) {
- DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
if (ORE)
ORE->emit([&]() {
return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
<< " trips per branch";
});
} else if (RuntimeTripCount) {
- DEBUG(dbgs() << " with run-time trip count");
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
if (ORE)
ORE->emit(
[&]() { return DiagBuilder() << " with run-time trip count"; });
}
- DEBUG(dbgs() << "!\n");
+ LLVM_DEBUG(dbgs() << "!\n");
}
+  // We are going to make changes to this loop. SCEV may be keeping cached info
+  // about it, in particular about the backedge-taken count. The changes we
+  // make are guaranteed to invalidate this information for our loop. It is
+  // tempting to only invalidate the loop being unrolled, but that is incorrect
+  // because exiting branches from inner loops have an impact on the outer
+  // loops, and if something changes inside them then any of the outer loops
+  // may also change. When we forget the outermost loop, we also forget all
+  // contained loops, and this is what we need here.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
@@ -580,14 +613,9 @@ LoopUnrollResult llvm::UnrollLoop(
"Header should not be in a sub-loop");
// Tell LI about New.
const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop) {
+ if (OldLoop)
LoopsToSimplify.insert(NewLoops[OldLoop]);
- // Forget the old loop, since its inputs may have changed.
- if (SE)
- SE->forgetLoop(OldLoop);
- }
-
if (*BB == Header)
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
@@ -611,13 +639,12 @@ LoopUnrollResult llvm::UnrollLoop(
for (BasicBlock *Succ : successors(*BB)) {
if (L->contains(Succ))
continue;
- for (BasicBlock::iterator BBI = Succ->begin();
- PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
- Value *Incoming = phi->getIncomingValueForBlock(*BB);
+ for (PHINode &PHI : Succ->phis()) {
+ Value *Incoming = PHI.getIncomingValueForBlock(*BB);
ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
if (It != LastValueMap.end())
Incoming = It->second;
- phi->addIncoming(Incoming, New);
+ PHI.addIncoming(Incoming, New);
}
}
// Keep track of new headers and latches as we create them, so that
@@ -721,10 +748,8 @@ LoopUnrollResult llvm::UnrollLoop(
for (BasicBlock *Succ: successors(BB)) {
if (Succ == Headers[i])
continue;
- for (BasicBlock::iterator BBI = Succ->begin();
- PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
- Phi->removeIncomingValue(BB, false);
- }
+ for (PHINode &Phi : Succ->phis())
+ Phi.removeIncomingValue(BB, false);
}
}
// Replace the conditional branch with an unconditional one.
@@ -775,17 +800,15 @@ LoopUnrollResult llvm::UnrollLoop(
}
}
- if (DT && UnrollVerifyDomtree)
- DT->verifyDomTree();
+ assert(!DT || !UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
// Merge adjacent basic blocks, if possible.
- SmallPtrSet<Loop *, 4> ForgottenLoops;
for (BasicBlock *Latch : Latches) {
BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
if (Term->isUnconditional()) {
BasicBlock *Dest = Term->getSuccessor(0);
- if (BasicBlock *Fold =
- foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
+ if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
// Dest has been folded into Fold. Update our worklists accordingly.
std::replace(Latches.begin(), Latches.end(), Dest, Fold);
UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
@@ -795,40 +818,10 @@ LoopUnrollResult llvm::UnrollLoop(
}
}
- // Simplify any new induction variables in the partially unrolled loop.
- if (SE && !CompletelyUnroll && Count > 1) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
-
- // Aggressively clean up dead instructions that simplifyLoopIVs already
- // identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty())
- if (Instruction *Inst =
- dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
- }
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- const DataLayout &DL = Header->getModule()->getDataLayout();
- const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
- for (BasicBlock *BB : NewLoopBlocks) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = &*I++;
-
- if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
- if (LI->replacementPreservesLCSSAForm(Inst, V))
- Inst->replaceAllUsesWith(V);
- if (isInstructionTriviallyDead(Inst))
- BB->getInstList().erase(Inst);
- }
- }
-
- // TODO: after peeling or unrolling, previously loop variant conditions are
- // likely to fold to constants, eagerly propagating those here will require
- // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
- // appropriate.
+ // At this point, the code is well formed. We now simplify the unrolled loop,
+ // doing constant propagation and dead code elimination as we go.
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && (Count > 1 || Peeled), LI, SE,
+ DT, AC);
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
new file mode 100644
index 000000000000..b919f73c3817
--- /dev/null
+++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -0,0 +1,785 @@
+//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop unroll and jam as a routine, much like
+// LoopUnroll.cpp implements loop unroll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
+STATISTIC(NumCompletelyUnrolledAndJammed,
+          "Number of loops fully unroll and jammed");
+
+typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
+
+// Partition blocks in an outer/inner loop pair into blocks before and after
+// the inner loop.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+
+ for (BasicBlock *BB : L->blocks()) {
+ if (!SubLoop->contains(BB)) {
+ if (DT->dominates(SubLoopLatch, BB))
+ AftBlocks.insert(BB);
+ else
+ ForeBlocks.insert(BB);
+ }
+ }
+
+ // Check that all blocks in ForeBlocks together dominate the subloop
+  // TODO: This might ideally be done better with dominators/postdominators.
+ BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
+ for (BasicBlock *BB : ForeBlocks) {
+ if (BB == SubLoopPreHeader)
+ continue;
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!ForeBlocks.count(TI->getSuccessor(i)))
+ return false;
+ }
+
+ return true;
+}
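+
+// For example (illustrative): in an outer loop of the shape
+//   Header -> SubLoopPreHeader -> [subloop] -> SubLoopExit -> Latch
+// Header and SubLoopPreHeader end up in ForeBlocks, while SubLoopExit and
+// Latch are dominated by the subloop latch and so end up in AftBlocks.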
+
+// Looks at the phi nodes in Header for values coming from Latch. For each such
+// instruction and all of its operands, calls Visit on them, continuing through
+// any operands defined in AftBlocks. Returns false if Visit returns false,
+// otherwise returns true. This is used to process the instructions in the
+// Aft blocks that need to be moved before the subloop. It is used in two
+// places: once to check that the required set of instructions can be moved
+// before the loop, and then to collect the instructions to actually move in
+// moveHeaderPhiOperandsToForeBlocks.
+template <typename T>
+static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
+ BasicBlockSet &AftBlocks, T Visit) {
+ SmallVector<Instruction *, 8> Worklist;
+ for (auto &Phi : Header->phis()) {
+ Value *V = Phi.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+ }
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+ if (!Visit(I))
+ return false;
+
+ if (AftBlocks.count(I->getParent()))
+ for (auto &U : I->operands())
+ if (Instruction *II = dyn_cast<Instruction>(U))
+ Worklist.push_back(II);
+ }
+
+ return true;
+}
+
+// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
+static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
+ BasicBlock *Latch,
+ Instruction *InsertLoc,
+ BasicBlockSet &AftBlocks) {
+ // We need to ensure we move the instructions in the correct order,
+ // starting with the earliest required instruction and moving forward.
+ std::vector<Instruction *> Visited;
+ processHeaderPhiOperands(Header, Latch, AftBlocks,
+ [&Visited, &AftBlocks](Instruction *I) {
+ if (AftBlocks.count(I->getParent()))
+ Visited.push_back(I);
+ return true;
+ });
+
+ // Move all instructions in program order to before the InsertLoc
+ BasicBlock *InsertLocBB = InsertLoc->getParent();
+ for (Instruction *I : reverse(Visited)) {
+ if (I->getParent() != InsertLocBB)
+ I->moveBefore(InsertLoc);
+ }
+}
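+
+// Illustrative example (not from the original change): if the header phi's
+// latch value is %b, with
+//   %a = add i32 %iv, 1   ; in an Aft block
+//   %b = mul i32 %a, 2    ; in an Aft block
+// the worklist visits %b before %a, so Visited = [%b, %a]; iterating it in
+// reverse moves %a ahead of %b, keeping defs before uses at InsertLoc.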
+
+/*
+ This method performs Unroll and Jam. For a simple loop like:
+ for (i = ..)
+ Fore(i)
+ for (j = ..)
+ SubLoop(i, j)
+ Aft(i)
+
+ Instead of doing normal inner or outer unrolling, we do:
+ for (i = .., i+=2)
+ Fore(i)
+ Fore(i+1)
+ for (j = ..)
+ SubLoop(i, j)
+ SubLoop(i+1, j)
+ Aft(i)
+ Aft(i+1)
+
+  So the outer loop is essentially unrolled and then the inner loops are fused
+ ("jammed") together into a single loop. This can increase speed when there
+ are loads in SubLoop that are invariant to i, as they become shared between
+ the now jammed inner loops.
+
+  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
+ Fore blocks are those before the inner loop, Aft are those after. Normal
+ Unroll code is used to copy each of these sets of blocks and the results are
+ combined together into the final form above.
+
+ isSafeToUnrollAndJam should be used prior to calling this to make sure the
+  unrolling will be valid. Checking profitability is also advisable.
+*/
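+/* Concrete example (illustrative only): with
+     for (i = 0; i < n; i++)
+       for (j = 0; j < m; j++)
+         sum[i] += A[j] * B[i];   // A[j] is invariant to i
+   unroll and jam by 2 lets both jammed inner iterations share each A[j] load:
+     for (i = 0; i < n; i += 2)
+       for (j = 0; j < m; j++) {
+         sum[i]     += A[j] * B[i];
+         sum[i + 1] += A[j] * B[i + 1];
+       }
+*/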
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
+
+ // When we enter here we should have already checked that it is safe
+ BasicBlock *Header = L->getHeader();
+ assert(L->getSubLoops().size() == 1);
+ Loop *SubLoop = *L->begin();
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2) {
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = (Count == TripCount);
+
+  // We use the runtime remainder in cases where we don't know the trip
+  // multiple.
+ if (TripMultiple == 1 || TripMultiple % Count != 0) {
+ if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+ /*UseEpilogRemainder*/ true,
+ UnrollRemainder, LI, SE, DT, AC, true)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // Notify ScalarEvolution that the loop will be substantially changed,
+ // if not outright eliminated.
+ if (SE) {
+ SE->forgetLoop(L);
+ SE->forgetLoop(SubLoop);
+ }
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
+ << Header->getName() << " with trip count " << TripCount
+ << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unroll and jammed loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unroll and jammed loop by a factor of "
+ << NV("UnrollCount", Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
+ << " by " << Count);
+ if (TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ ORE->emit([&]() {
+ return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+ assert(Preheader && LatchBlock && Header);
+ assert(BI && !BI->isUnconditional());
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+ bool SubLoopContinueOnTrue = SubLoop->contains(
+ SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
+
+  // Partition blocks in an outer/inner loop pair into blocks before and after
+  // the inner loop.
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
+ DT);
+
+ // We keep track of the entering/first and exiting/last block of each of
+ // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
+ // blocks easier.
+ std::vector<BasicBlock *> ForeBlocksFirst;
+ std::vector<BasicBlock *> ForeBlocksLast;
+ std::vector<BasicBlock *> SubLoopBlocksFirst;
+ std::vector<BasicBlock *> SubLoopBlocksLast;
+ std::vector<BasicBlock *> AftBlocksFirst;
+ std::vector<BasicBlock *> AftBlocksLast;
+ ForeBlocksFirst.push_back(Header);
+ ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
+ SubLoopBlocksFirst.push_back(SubLoop->getHeader());
+ SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
+ AftBlocksFirst.push_back(SubLoop->getExitBlock());
+ AftBlocksLast.push_back(L->getExitingBlock());
+ // Maps Blocks[0] -> Blocks[It]
+ ValueToValueMapTy LastValueMap;
+
+  // Move any instructions that the fore phi operands depend on out of the Aft
+  // blocks and into Fore.
+ moveHeaderPhiOperandsToForeBlocks(
+ Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
+ AftBlocks);
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc())
+ I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
+ // Copy all blocks
+ for (unsigned It = 1; It != Count; ++It) {
+ std::vector<BasicBlock *> NewBlocks;
+ // Maps Blocks[It] -> Blocks[It-1]
+ DenseMap<Value *, Value *> PrevItValueMap;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ if (ForeBlocks.count(*BB)) {
+ L->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == ForeBlocksFirst[0])
+ ForeBlocksFirst.push_back(New);
+ if (*BB == ForeBlocksLast[0])
+ ForeBlocksLast.push_back(New);
+ } else if (SubLoopBlocks.count(*BB)) {
+ SubLoop->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == SubLoopBlocksFirst[0])
+ SubLoopBlocksFirst.push_back(New);
+ if (*BB == SubLoopBlocksLast[0])
+ SubLoopBlocksLast.push_back(New);
+ } else if (AftBlocks.count(*BB)) {
+ L->addBasicBlockToLoop(New, *LI);
+
+ if (*BB == AftBlocksFirst[0])
+ AftBlocksFirst.push_back(New);
+ if (*BB == AftBlocksLast[0])
+ AftBlocksLast.push_back(New);
+ } else {
+ llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
+ }
+
+ // Update our running maps of newest clones
+ PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI) {
+ PrevItValueMap[VI->second] =
+ const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
+ LastValueMap[VI->first] = VI->second;
+ }
+
+ NewBlocks.push_back(New);
+
+ // Update DomTree:
+ if (*BB == ForeBlocksFirst[0])
+ DT->addNewBlock(New, ForeBlocksLast[It - 1]);
+ else if (*BB == SubLoopBlocksFirst[0])
+ DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
+ else if (*BB == AftBlocksFirst[0])
+ DT->addNewBlock(New, AftBlocksLast[It - 1]);
+ else {
+ // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
+ // structure.
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ assert(OriginalBBIDom);
+ assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ ::remapInstruction(&I, LastValueMap);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+    // Alter the ForeBlocks phis, pointing them at the latest version of the
+    // value from the previous iteration's phis.
+ for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
+ Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
+ assert(OldValue && "should have incoming edge from Aft[It]");
+ Value *NewValue = OldValue;
+ if (Value *PrevValue = PrevItValueMap[OldValue])
+ NewValue = PrevValue;
+
+ assert(Phi.getNumOperands() == 2);
+ Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
+ Phi.setIncomingValue(0, NewValue);
+ Phi.removeIncomingValue(1);
+ }
+ }
+
+  // Now that all the basic blocks for the unrolled iterations are in place,
+  // finish up connecting the blocks and phi nodes. At this point LastValueMap
+  // holds the values from the last unrolled iteration.
+
+ // Update Phis in BB from OldBB to point to NewBB
+ auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB) {
+ for (PHINode &Phi : BB->phis()) {
+ int I = Phi.getBasicBlockIndex(OldBB);
+ Phi.setIncomingBlock(I, NewBB);
+ }
+ };
+ // Update Phis in BB from OldBB to point to NewBB and use the latest value
+ // from LastValueMap
+ auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ ValueToValueMapTy &LastValueMap) {
+ for (PHINode &Phi : BB->phis()) {
+ for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
+ if (Phi.getIncomingBlock(b) == OldBB) {
+ Value *OldValue = Phi.getIncomingValue(b);
+ if (Value *LastValue = LastValueMap[OldValue])
+ Phi.setIncomingValue(b, LastValue);
+ Phi.setIncomingBlock(b, NewBB);
+ break;
+ }
+ }
+ }
+ };
+ // Move all the phis from Src into Dest
+ auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
+ Instruction *insertPoint = Dest->getFirstNonPHI();
+ while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
+ Phi->moveBefore(insertPoint);
+ };
+
+ // Update the PHI values outside the loop to point to the last block
+ updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
+ LastValueMap);
+
+ // Update ForeBlocks successors and phi nodes
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
+ BasicBlock *Dest = SubLoopBlocksFirst[0];
+ ForeTerm->setSuccessor(0, Dest);
+
+ if (CompletelyUnroll) {
+ while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
+ Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
+ Phi->getParent()->getInstList().erase(Phi);
+ }
+ } else {
+ // Update the PHI values to point to the last aft block
+ updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
+ AftBlocksLast.back(), LastValueMap);
+ }
+
+ for (unsigned It = 1; It != Count; It++) {
+    // Remap ForeBlock successors from the previous iteration to this one.
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
+ BasicBlock *Dest = ForeBlocksFirst[It];
+ ForeTerm->setSuccessor(0, Dest);
+ }
+
+ // Subloop successors and phis
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+ SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
+ SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
+ updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
+ ForeBlocksLast.back());
+ updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
+ SubLoopBlocksLast.back());
+
+ for (unsigned It = 1; It != Count; It++) {
+    // Replace the conditional branch of the previous iteration's subloop with
+    // an unconditional branch to this iteration's subloop.
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+ SubTerm->eraseFromParent();
+
+ updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
+ ForeBlocksLast.back());
+ updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
+ SubLoopBlocksLast.back());
+ movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
+ }
+
+ // Aft blocks successors and phis
+ BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ if (CompletelyUnroll) {
+ BranchInst::Create(LoopExit, Term);
+ Term->eraseFromParent();
+ } else {
+ Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ }
+ updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
+ SubLoopBlocksLast.back());
+
+ for (unsigned It = 1; It != Count; It++) {
+    // Replace the conditional branch of the previous iteration's aft blocks
+    // with an unconditional branch to this iteration's aft blocks.
+ BranchInst *AftTerm =
+ cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(AftBlocksFirst[It], AftTerm);
+ AftTerm->eraseFromParent();
+
+ updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
+ SubLoopBlocksLast.back());
+ movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+ }
+
+ // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+ // new ones required.
+ if (Count != 1) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+ SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+ SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+ DT->applyUpdates(DTUpdates);
+ }
+
+ // Merge adjacent basic blocks, if possible.
+ SmallPtrSet<BasicBlock *, 16> MergeBlocks;
+ MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
+ MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
+ MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
+ while (!MergeBlocks.empty()) {
+ BasicBlock *BB = *MergeBlocks.begin();
+ BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
+ BasicBlock *Dest = Term->getSuccessor(0);
+ if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
+ // Don't remove BB and add Fold as they are the same BB
+ assert(Fold == BB);
+ (void)Fold;
+ MergeBlocks.erase(Dest);
+ } else
+ MergeBlocks.erase(BB);
+ } else
+ MergeBlocks.erase(BB);
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+
+ NumCompletelyUnrolledAndJammed += CompletelyUnroll;
+ ++NumUnrolledAndJammed;
+
+#ifndef NDEBUG
+ // We shouldn't have done anything to break loop simplify form or LCSSA.
+ Loop *OuterL = L->getParentLoop();
+ Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
+ assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
+ if (!CompletelyUnroll)
+ assert(L->isLoopSimplifyForm());
+ assert(SubLoop->isLoopSimplifyForm());
+ assert(DT->verify());
+#endif
+
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(BasicBlockSet &Blocks,
+ SmallVector<Value *, 4> &MemInstr) {
+ // Scan the BBs and collect legal loads and stores.
+ // Returns false if non-simple loads/stores are found.
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (I.mayReadOrWriteMemory()) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
+ SmallVector<Value *, 4> &Later,
+ unsigned LoopDepth, bool InnerLoop,
+ DependenceInfo &DI) {
+ // Use DA to check for dependencies between loads and stores that make unroll
+ // and jam invalid
+ for (Value *I : Earlier) {
+ for (Value *J : Later) {
+ Instruction *Src = cast<Instruction>(I);
+ Instruction *Dst = cast<Instruction>(J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+
+      // Track dependencies, and if we find them take a conservative approach
+      // by allowing only = or < (not >), although some > would be safe
+      // (depending upon unroll width).
+      // For the inner loop, we need to disallow any (> <) dependencies.
+      // FIXME: Allow > so long as distance is less than unroll width.
+ if (auto D = DI.depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused())
+ return false;
+ if (!InnerLoop) {
+ if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
+ return false;
+ } else {
+ assert(LoopDepth + 1 <= D->getLevels());
+ if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
+ D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
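+
+// Illustrative example (not from the original change): if Fore contains
+//   t = X[i];       // iteration i reads X[i]
+// and Aft contains
+//   X[i + 1] = u;   // iteration i writes X[i + 1]
+// then the Fore load of iteration i + 1 reads the value stored by the Aft of
+// iteration i, a '>' direction at the outer level for the (Fore, Aft) pair.
+// Unroll and jam would run the second Fore before the first Aft, so the check
+// above conservatively rejects such loops.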
+
+static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks, DependenceInfo &DI) {
+  // Get all loads and stores for each set of blocks.
+ SmallVector<Value *, 4> ForeMemInstr;
+ SmallVector<Value *, 4> SubLoopMemInstr;
+ SmallVector<Value *, 4> AftMemInstr;
+ if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
+ !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
+ !getLoadsAndStores(AftBlocks, AftMemInstr))
+ return false;
+
+ // Check for dependencies between any blocks that may change order
+ unsigned LoopDepth = L->getLoopDepth();
+ return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
+ DI) &&
+ checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
+ checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
+ DI) &&
+ checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
+ DI);
+}
+
+bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+ DependenceInfo &DI) {
+ /* We currently handle outer loops like this:
+ |
+ ForeFirst <----\ }
+ Blocks | } ForeBlocks
+ ForeLast | }
+ | |
+ SubLoopFirst <\ | }
+ Blocks | | } SubLoopBlocks
+ SubLoopLast -/ | }
+ | |
+ AftFirst | }
+ Blocks | } AftBlocks
+ AftLast ------/ }
+ |
+
+ There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+  and AftBlocks, provided that there is one edge from Fores to SubLoops,
+ one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+ In practice we currently limit Aft blocks to a single block, and limit
+  things further in the profitability checks of the unroll and jam pass.
+
+ Because of the way we rearrange basic blocks, we also require that
+ the Fore blocks on all unrolled iterations are safe to move before the
+ SubLoop blocks of all iterations. So we require that the phi node looping
+ operands of ForeHeader can be moved to at least the end of ForeEnd, so that
+  we can arrange cloned Fore Blocks before the subloop and match up Phis
+ correctly.
+
+  i.e. the old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
+  It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+
+ There are then a number of checks along the lines of no calls, no
+ exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+ runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+ UnrollAndJamLoop if the trip count cannot be easily calculated.
+ */
+
+ if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+ return false;
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!SubLoop->isLoopSimplifyForm())
+ return false;
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getExitingBlock();
+ BasicBlock *SubLoopHeader = SubLoop->getHeader();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+ if (Latch != Exit)
+ return false;
+ if (SubLoopLatch != SubLoopExit)
+ return false;
+
+ if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
+ return false;
+
+ // Split blocks into Fore/SubLoop/Aft based on dominators
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
+ AftBlocks, &DT))
+ return false;
+
+ // Aft blocks may need to move instructions to fore blocks, which becomes more
+ // difficult if there are multiple (potentially conditionally executed)
+ // blocks. For now we just exclude loops with multiple aft blocks.
+ if (AftBlocks.size() != 1)
+ return false;
+
+ // Check inner loop IV is consistent between all iterations
+ const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
+ if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
+ !SubLoopBECountSC->getType()->isIntegerTy())
+ return false;
+ ScalarEvolution::LoopDisposition LD =
+ SE.getLoopDisposition(SubLoopBECountSC, L);
+ if (LD != ScalarEvolution::LoopInvariant)
+ return false;
+
+ // Check the loop safety info for exceptions.
+ LoopSafetyInfo LSI;
+ computeLoopSafetyInfo(&LSI, L);
+ if (LSI.MayThrow)
+ return false;
+
+  // We've ruled out the easy stuff and now need to check that there are no
+  // interdependencies which may prevent us from moving:
+  //   ForeBlocks before Subloop and AftBlocks.
+  //   Subloop before AftBlocks.
+  //   ForeBlock phi operands before the subloop.
+
+ // Make sure we can move all instructions we need to before the subloop
+ if (!processHeaderPhiOperands(
+ Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
+ if (SubLoop->contains(I->getParent()))
+ return false;
+ if (AftBlocks.count(I->getParent())) {
+ // If we hit a phi node in afts we know we are done (probably
+ // LCSSA)
+ if (isa<PHINode>(I))
+ return false;
+ // Can't move instructions with side effects or memory
+ // reads/writes
+ if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
+ return false;
+ }
+ // Keep going
+ return true;
+ }))
+ return false;
+
+ // Check for memory dependencies which prohibit the unrolling we are doing.
+ // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
+ // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
+ if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
+ return false;
+
+ return true;
+}
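+
+// Usage sketch (illustrative; the actual driver is the unroll-and-jam pass,
+// which also performs profitability checks):
+//   if (isSafeToUnrollAndJam(L, SE, DT, DI))
+//     UnrollAndJamLoop(L, Count, TripCount, TripMultiple,
+//                      /*UnrollRemainder*/ false, LI, &SE, &DT, &AC, &ORE);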
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index c84ae7d693d7..13794c53f24b 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
@@ -30,6 +31,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -46,6 +48,7 @@
#include <limits>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "loop-unroll"
@@ -66,7 +69,7 @@ static const unsigned InfiniteIterationsToInvariance =
std::numeric_limits<unsigned>::max();
// Check whether we are capable of peeling this loop.
-static bool canPeel(Loop *L) {
+bool llvm::canPeel(Loop *L) {
// Make sure the loop is in simplified form
if (!L->isLoopSimplifyForm())
return false;
@@ -136,11 +139,109 @@ static unsigned calculateIterationsToInvariance(
return ToInvariance;
}
+// Return the number of iterations to peel off that make conditions in the
+// body true/false. For example, if we peel 2 iterations off the loop below,
+// the condition i < 2 can be evaluated at compile time.
+//   for (i = 0; i < n; i++) {
+//     if (i < 2)
+//       ..
+//     else
+//       ..
+//   }
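+// With MaxPeelCount >= 2, the scan below proceeds roughly as follows
+// (illustrative trace):
+//   IterVal = 0: 0 < 2 known true  -> NewPeelCount = 1
+//   IterVal = 1: 1 < 2 known true  -> NewPeelCount = 2
+//   IterVal = 2: 2 < 2 known false -> stop; peeling 2 iterations makes i < 2
+//   statically false in the remaining loop body.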
+static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
+ ScalarEvolution &SE) {
+ assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
+ unsigned DesiredPeelCount = 0;
+
+ for (auto *BB : L.blocks()) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ continue;
+
+ // Ignore loop exit condition.
+ if (L.getLoopLatch() == BB)
+ continue;
+
+ Value *Condition = BI->getCondition();
+ Value *LeftVal, *RightVal;
+ CmpInst::Predicate Pred;
+ if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
+ continue;
+
+ const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
+ const SCEV *RightSCEV = SE.getSCEV(RightVal);
+
+ // Do not consider predicates that are known to be true or false
+ // independently of the loop iteration.
+ if (SE.isKnownPredicate(Pred, LeftSCEV, RightSCEV) ||
+ SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), LeftSCEV,
+ RightSCEV))
+ continue;
+
+ // Check if we have a condition with one AddRec and one non AddRec
+ // expression. Normalize LeftSCEV to be the AddRec.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else
+ continue;
+ }
+
+ const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);
+
+    // Avoid huge SCEV computations in the loop below; make sure we only
+    // consider AddRecs of the loop we are trying to peel, and avoid
+    // non-monotonic predicates, as we will not be able to simplify the loop
+    // body.
+ // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can
+ // simplify the loop, if we peel 1 additional iteration, if there
+ // is no wrapping.
+ bool Increasing;
+ if (!LeftAR->isAffine() || LeftAR->getLoop() != &L ||
+ !SE.isMonotonicPredicate(LeftAR, Pred, Increasing))
+ continue;
+ (void)Increasing;
+
+ // Check if extending the current DesiredPeelCount lets us evaluate Pred
+ // or !Pred in the loop body statically.
+ unsigned NewPeelCount = DesiredPeelCount;
+
+ const SCEV *IterVal = LeftAR->evaluateAtIteration(
+ SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
+
+ // If the original condition is not known, get the negated predicate
+ // (which holds on the else branch) and check if it is known. This allows
+  // us to peel off iterations that make the original condition false.
+ if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
+ Pred = ICmpInst::getInversePredicate(Pred);
+
+ const SCEV *Step = LeftAR->getStepRecurrence(SE);
+ while (NewPeelCount < MaxPeelCount &&
+ SE.isKnownPredicate(Pred, IterVal, RightSCEV)) {
+ IterVal = SE.getAddExpr(IterVal, Step);
+ NewPeelCount++;
+ }
+
+ // Only peel the loop if the monotonic predicate !Pred becomes known in the
+ // first iteration of the loop body after peeling.
+ if (NewPeelCount > DesiredPeelCount &&
+ SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
+ RightSCEV))
+ DesiredPeelCount = NewPeelCount;
+ }
+
+ return DesiredPeelCount;
+}
+
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
- unsigned &TripCount) {
+ unsigned &TripCount, ScalarEvolution &SE) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
+ // Save the UP.PeelCount value set by the target in
+ // TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ unsigned TargetPeelCount = UP.PeelCount;
UP.PeelCount = 0;
if (!canPeel(L))
return;
@@ -149,6 +250,19 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (!L->empty())
return;
+ // If the user provided a peel count, use that.
+ bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
+ if (UserPeelCount) {
+ LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
+ << " iterations.\n");
+ UP.PeelCount = UnrollForcePeelCount;
+ return;
+ }
+
+ // Skip peeling if it's disabled.
+ if (!UP.AllowPeeling)
+ return;
+
// Here we try to get rid of Phis which become invariants after 1, 2, ..., N
// iterations of the loop. For this we compute the number for iterations after
// which every Phi is guaranteed to become an invariant, and try to peel the
@@ -160,7 +274,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
// Now go through all Phis to calculate the number of iterations they
// need to become invariants.
- unsigned DesiredPeelCount = 0;
+ // Start the max computation with the UP.PeelCount value set by the target
+ // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ unsigned DesiredPeelCount = TargetPeelCount;
BasicBlock *BackEdge = L->getLoopLatch();
assert(BackEdge && "Loop is not in simplified form?");
for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
@@ -170,15 +286,21 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (ToInvariance != InfiniteIterationsToInvariance)
DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
}
+
+ // Pay respect to limitations implied by loop size and the max peel count.
+ unsigned MaxPeelCount = UnrollPeelMaxCount;
+ MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
+
+ DesiredPeelCount = std::max(DesiredPeelCount,
+ countToEliminateCompares(*L, MaxPeelCount, SE));
+
if (DesiredPeelCount > 0) {
- // Pay respect to limitations implied by loop size and the max peel count.
- unsigned MaxPeelCount = UnrollPeelMaxCount;
- MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
// Consider max peel count limitation.
assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
- DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn"
- << " some Phis into invariants.\n");
+ LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+ << " iteration(s) to turn"
+ << " some Phis into invariants.\n");
UP.PeelCount = DesiredPeelCount;
return;
}
@@ -189,44 +311,37 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (TripCount)
return;
- // If the user provided a peel count, use that.
- bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
- if (UserPeelCount) {
- DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
- << " iterations.\n");
- UP.PeelCount = UnrollForcePeelCount;
- return;
- }
-
// If we don't know the trip count, but have reason to believe the average
// trip count is low, peeling should be beneficial, since we will usually
// hit the peeled section.
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
- if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) {
+ if (L->getHeader()->getParent()->hasProfileData()) {
Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
if (!PeelCount)
return;
- DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
- << "\n");
+ LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
+ << "\n");
if (*PeelCount) {
if ((*PeelCount <= UnrollPeelMaxCount) &&
(LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
- DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
+ LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
+ << " iterations.\n");
UP.PeelCount = *PeelCount;
return;
}
- DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
- DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
- DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
- DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
+ LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+ LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
}
}
}
-/// \brief Update the branch weights of the latch of a peeled-off loop
+/// Update the branch weights of the latch of a peeled-off loop
/// iteration.
/// This sets the branch weights for the latch of the recently peeled off loop
/// iteration correctly.
@@ -267,12 +382,12 @@ static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
}
}
-/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
+/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
/// peeled off.
/// \param Exit The exit block of the original loop.
-/// \param[out] NewBlocks A list of the the blocks in the newly created clone
+/// \param[out] NewBlocks A list of the blocks in the newly created clone
/// \param[out] VMap The value map between the loop and the new clone.
/// \param LoopBlocks A helper for DFS-traversal of the loop.
/// \param LVMap A value-map that maps instructions from the original loop to
@@ -376,7 +491,7 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
LVMap[KV.first] = KV.second;
}
-/// \brief Peel off the first \p PeelCount iterations of loop \p L.
+/// Peel off the first \p PeelCount iterations of loop \p L.
///
/// Note that this does not peel them off as a single straight-line block.
/// Rather, each iteration is peeled off separately, and needs to check the
@@ -388,8 +503,8 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC, bool PreserveLCSSA) {
- if (!canPeel(L))
- return false;
+ assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
+ assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
LoopBlocksDFS LoopBlocks(L);
LoopBlocks.perform(LI);
@@ -500,10 +615,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
// the original loop body.
if (Iter == 0)
DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch]));
-#ifndef NDEBUG
- if (VerifyDomInfo)
- DT->verifyDomTree();
-#endif
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index e00541d3c812..0057b4ba7ce1 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -21,8 +21,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -33,7 +33,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -80,25 +80,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
// The new PHI node value is added as an operand of a PHI node in either
// the loop header or the loop exit block.
for (BasicBlock *Succ : successors(Latch)) {
- for (Instruction &BBI : *Succ) {
- PHINode *PN = dyn_cast<PHINode>(&BBI);
- // Exit when we passed all PHI nodes.
- if (!PN)
- break;
+ for (PHINode &PN : Succ->phis()) {
// Add a new PHI node to the prolog end block and add the
// appropriate incoming values.
- PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
PrologExit->getFirstNonPHI());
// Adding a value to the new PHI node from the original loop preheader.
// This is the value that skips all the prolog code.
- if (L->contains(PN)) {
- NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
+ if (L->contains(&PN)) {
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
PreHeader);
} else {
- NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+ NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
}
- Value *V = PN->getIncomingValueForBlock(Latch);
+ Value *V = PN.getIncomingValueForBlock(Latch);
if (Instruction *I = dyn_cast<Instruction>(V)) {
if (L->contains(I)) {
V = VMap.lookup(I);
@@ -111,10 +107,10 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
// Update the existing PHI node operand with the value from the
// new PHI node. How this is done depends on if the existing
// PHI node is in the original loop block, or the exit block.
- if (L->contains(PN)) {
- PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
+ if (L->contains(&PN)) {
+ PN.setIncomingValue(PN.getBasicBlockIndex(NewPreHeader), NewPN);
} else {
- PN->addIncoming(NewPN, PrologExit);
+ PN.addIncoming(NewPN, PrologExit);
}
}
}
@@ -191,11 +187,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// Exit (EpilogPN)
// Update PHI nodes at NewExit and Exit.
- for (Instruction &BBI : *NewExit) {
- PHINode *PN = dyn_cast<PHINode>(&BBI);
- // Exit when we passed all PHI nodes.
- if (!PN)
- break;
+ for (PHINode &PN : NewExit->phis()) {
// PN should be used in another PHI located in Exit block as
// Exit was split by SplitBlockPredecessors into Exit and NewExit
// Basically it should look like:
@@ -207,14 +199,14 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
//
// There is EpilogPreHeader incoming block instead of NewExit as
// NewExit was split 1 more time to get EpilogPreHeader.
- assert(PN->hasOneUse() && "The phi should have 1 use");
- PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
+ assert(PN.hasOneUse() && "The phi should have 1 use");
+ PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
// Add incoming PreHeader from branch around the Loop
- PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+ PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
- Value *V = PN->getIncomingValueForBlock(Latch);
+ Value *V = PN.getIncomingValueForBlock(Latch);
Instruction *I = dyn_cast<Instruction>(V);
if (I && L->contains(I))
// If value comes from an instruction in the loop add VMap value.
@@ -242,23 +234,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// Skip this as we already updated phis in exit blocks.
if (!L->contains(Succ))
continue;
- for (Instruction &BBI : *Succ) {
- PHINode *PN = dyn_cast<PHINode>(&BBI);
- // Exit when we passed all PHI nodes.
- if (!PN)
- break;
+ for (PHINode &PN : Succ->phis()) {
// Add new PHI nodes to the loop exit block and update epilog
// PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
NewExit->getFirstNonPHI());
// Adding a value to the new PHI node from the unrolling loop preheader.
- NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
// Adding a value to the new PHI node from the unrolling loop latch.
- NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
+ NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
// Update the existing PHI node operand with the value from the new PHI
// node. Corresponding instruction in epilog loop should be PHI.
- PHINode *VPN = cast<PHINode>(VMap[&BBI]);
+ PHINode *VPN = cast<PHINode>(VMap[&PN]);
VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
}
}
@@ -430,8 +418,9 @@ canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
// UnrollRuntimeMultiExit is true. This will require updating the logic in
// connectEpilog/connectProlog.
if (!LatchExit->getSinglePredecessor()) {
- DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
- "predecessor.\n");
+ LLVM_DEBUG(
+ dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
return false;
}
// FIXME: We bail out of multi-exit unrolling when epilog loop is generated
@@ -540,14 +529,14 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
bool PreserveLCSSA) {
- DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
- DEBUG(L->dump());
- DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" :
- dbgs() << "Using prolog remainder.\n");
+ LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ LLVM_DEBUG(L->dump());
+ LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
+ : dbgs() << "Using prolog remainder.\n");
// Make sure the loop is in canonical form.
if (!L->isLoopSimplifyForm()) {
- DEBUG(dbgs() << "Not in simplify form!\n");
+ LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
return false;
}
@@ -573,7 +562,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Support only a single exit and exiting block unless multi-exit loop unrolling is enabled.
if (!isMultiExitUnrollingEnabled &&
(!L->getExitingBlock() || OtherExits.size())) {
- DEBUG(
+ LLVM_DEBUG(
dbgs()
<< "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
"enabled!\n");
@@ -593,7 +582,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
const SCEV *BECountSC = SE->getExitCount(L, Latch);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
!BECountSC->getType()->isIntegerTy()) {
- DEBUG(dbgs() << "Could not compute exit block SCEV\n");
+ LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
return false;
}
@@ -603,7 +592,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
const SCEV *TripCountSC =
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
if (isa<SCEVCouldNotCompute>(TripCountSC)) {
- DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
+ LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
return false;
}
@@ -613,15 +602,16 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
SCEVExpander Expander(*SE, DL, "loop-unroll");
if (!AllowExpensiveTripCount &&
Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
- DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
+ LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
return false;
}
// This constraint lets us deal with an overflowing trip count easily; see the
// comment on ModVal below.
if (Log2_32(Count) > BEWidth) {
- DEBUG(dbgs()
- << "Count failed constraint on overflow trip count calculation.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
return false;
}
@@ -775,7 +765,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// values from the cloned region. Also update the dominator info for
// OtherExits and their immediate successors, since we have new edges into
// OtherExits.
- SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
+ SmallPtrSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
for (auto *BB : OtherExits) {
for (auto &II : *BB) {
@@ -890,10 +880,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
NewPreHeader, VMap, DT, LI, PreserveLCSSA);
}
- // If this loop is nested, then the loop unroller changes the code in the
- // parent loop, so the Scalar Evolution pass needs to be run again.
- if (Loop *ParentLoop = L->getParentLoop())
- SE->forgetLoop(ParentLoop);
+ // If this loop is nested, then the loop unroller changes the code in any
+ // of its parent loops, so the Scalar Evolution pass needs to be run again.
+ SE->forgetTopmostLoop(L);
// Canonicalize to LoopSimplifyForm both original and remainder loops. We
// cannot rely on the LoopUnrollPass to do this because it only does
@@ -909,7 +898,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
}
if (remainderLoop && UnrollRemainder) {
- DEBUG(dbgs() << "Unrolling remainder loop\n");
+ LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1,
/*Force*/ false, /*AllowRuntime*/ false,
/*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index fe106e33bca1..46af120a428b 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -16,13 +16,16 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -30,6 +33,7 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -77,10 +81,13 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) {
return false;
}
-Instruction *
-RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
- SmallPtrSetImpl<Instruction *> &Visited,
- SmallPtrSetImpl<Instruction *> &CI) {
+/// Determines if Phi may have been type-promoted. If Phi has a single user
+/// that ANDs the Phi with a type mask, return the user. RT is updated to
+/// account for the narrower bit width represented by the mask, and the AND
+/// instruction is added to CI.
+static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT,
+ SmallPtrSetImpl<Instruction *> &Visited,
+ SmallPtrSetImpl<Instruction *> &CI) {
if (!Phi->hasOneUse())
return Phi;
@@ -101,70 +108,92 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
return Phi;
}
-bool RecurrenceDescriptor::getSourceExtensionKind(
- Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned,
- SmallPtrSetImpl<Instruction *> &Visited,
- SmallPtrSetImpl<Instruction *> &CI) {
+/// Compute the minimal bit width needed to represent a reduction whose exit
+/// instruction is given by Exit.
+static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
+ DemandedBits *DB,
+ AssumptionCache *AC,
+ DominatorTree *DT) {
+ bool IsSigned = false;
+ const DataLayout &DL = Exit->getModule()->getDataLayout();
+ uint64_t MaxBitWidth = DL.getTypeSizeInBits(Exit->getType());
+
+ if (DB) {
+ // Use the demanded bits analysis to determine the bits that are live out
+ // of the exit instruction, rounding up to the nearest power of two. If the
+ // use of demanded bits results in a smaller bit width, we know the value
+ // must be positive (i.e., IsSigned = false), because if this were not the
+ // case, the sign bit would have been demanded.
+ auto Mask = DB->getDemandedBits(Exit);
+ MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();
+ }
+
+ if (MaxBitWidth == DL.getTypeSizeInBits(Exit->getType()) && AC && DT) {
+ // If demanded bits wasn't able to limit the bit width, we can try to use
+ // value tracking instead. This can be the case, for example, if the value
+ // may be negative.
+ auto NumSignBits = ComputeNumSignBits(Exit, DL, 0, AC, nullptr, DT);
+ auto NumTypeBits = DL.getTypeSizeInBits(Exit->getType());
+ MaxBitWidth = NumTypeBits - NumSignBits;
+ KnownBits Bits = computeKnownBits(Exit, DL);
+ if (!Bits.isNonNegative()) {
+ // If the value is not known to be non-negative, we set IsSigned to true,
+ // meaning that we will use sext instructions instead of zext
+ // instructions to restore the original type.
+ IsSigned = true;
+ if (!Bits.isNegative())
+ // If the value is not known to be negative, we don't know what the
+ // upper bit is, and therefore, we don't know what kind of extend we
+ // will need. In this case, just increase the bit width by one bit and
+ // use sext.
+ ++MaxBitWidth;
+ }
+ }
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ return std::make_pair(Type::getIntNTy(Exit->getContext(), MaxBitWidth),
+ IsSigned);
+}
+
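
As a worked sketch of the width rounding at the tail of computeRecurrenceType above, assuming a demanded-bits result of 12 live bits:

  // Sketch: the MathExtras helpers used by the function above.
  uint64_t MaxBitWidth = 12;                 // assumed demanded-bits result
  if (!isPowerOf2_64(MaxBitWidth))
    MaxBitWidth = NextPowerOf2(MaxBitWidth); // 12 -> 16
  // Type::getIntNTy(Ctx, MaxBitWidth) would then yield i16.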
+/// Collect cast instructions that can be ignored in the vectorizer's cost
+/// model, given a reduction exit value and the minimal type in which the
+/// reduction can be represented.
+static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
+ Type *RecurrenceType,
+ SmallPtrSetImpl<Instruction *> &Casts) {
SmallVector<Instruction *, 8> Worklist;
- bool FoundOneOperand = false;
- unsigned DstSize = RT->getPrimitiveSizeInBits();
+ SmallPtrSet<Instruction *, 8> Visited;
Worklist.push_back(Exit);
- // Traverse the instructions in the reduction expression, beginning with the
- // exit value.
while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
- for (Use &U : I->operands()) {
-
- // Terminate the traversal if the operand is not an instruction, or we
- // reach the starting value.
- Instruction *J = dyn_cast<Instruction>(U.get());
- if (!J || J == Start)
- continue;
-
- // Otherwise, investigate the operation if it is also in the expression.
- if (Visited.count(J)) {
- Worklist.push_back(J);
+ Instruction *Val = Worklist.pop_back_val();
+ Visited.insert(Val);
+ if (auto *Cast = dyn_cast<CastInst>(Val))
+ if (Cast->getSrcTy() == RecurrenceType) {
+ // If the source type of a cast instruction is equal to the recurrence
+ // type, it will be eliminated, and should be ignored in the vectorizer
+ // cost model.
+ Casts.insert(Cast);
continue;
}
- // If the operand is not in Visited, it is not a reduction operation, but
- // it does feed into one. Make sure it is either a single-use sign- or
- // zero-extend instruction.
- CastInst *Cast = dyn_cast<CastInst>(J);
- bool IsSExtInst = isa<SExtInst>(J);
- if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst))
- return false;
-
- // Ensure the source type of the extend is no larger than the reduction
- // type. It is not necessary for the types to be identical.
- unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
- if (SrcSize > DstSize)
- return false;
-
- // Furthermore, ensure that all such extends are of the same kind.
- if (FoundOneOperand) {
- if (IsSigned != IsSExtInst)
- return false;
- } else {
- FoundOneOperand = true;
- IsSigned = IsSExtInst;
- }
-
- // Lastly, if the source type of the extend matches the reduction type,
- // add the extend to CI so that we can avoid accounting for it in the
- // cost model.
- if (SrcSize == DstSize)
- CI.insert(Cast);
- }
+ // Add all operands to the work list if they are loop-varying values that
+ // we haven't yet visited.
+ for (Value *O : cast<User>(Val)->operands())
+ if (auto *I = dyn_cast<Instruction>(O))
+ if (TheLoop->contains(I) && !Visited.count(I))
+ Worklist.push_back(I);
}
- return true;
}
bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
Loop *TheLoop, bool HasFunNoNaNAttr,
- RecurrenceDescriptor &RedDes) {
+ RecurrenceDescriptor &RedDes,
+ DemandedBits *DB,
+ AssumptionCache *AC,
+ DominatorTree *DT) {
if (Phi->getNumIncomingValues() != 2)
return false;
@@ -353,14 +382,49 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
- // If we think Phi may have been type-promoted, we also need to ensure that
- // all source operands of the reduction are either SExtInsts or ZEstInsts. If
- // so, we will be able to evaluate the reduction in the narrower bit width.
- if (Start != Phi)
- if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType,
- IsSigned, VisitedInsts, CastInsts))
+ if (Start != Phi) {
+ // If the starting value is not the same as the phi node, we speculatively
+ // looked through an 'and' instruction when evaluating a potential
+ // arithmetic reduction to determine if it may have been type-promoted.
+ //
+ // We now compute the minimal bit width that is required to represent the
+ // reduction. If this is the same width that was indicated by the 'and', we
+ // can represent the reduction in the smaller type. The 'and' instruction
+ // will be eliminated since it will essentially be a cast instruction that
+ // can be ignored in the cost model. If we compute a different type than we
+ // did when evaluating the 'and', the 'and' will not be eliminated, and we
+ // will end up with different kinds of operations in the recurrence
+ // expression (e.g., RK_IntegerAND, RK_IntegerADD). We give up if this is
+ // the case.
+ //
+ // The vectorizer relies on InstCombine to perform the actual
+ // type-shrinking. It does this by inserting instructions to truncate the
+ // exit value of the reduction to the width indicated by RecurrenceType and
+ // then extend this value back to the original width. If IsSigned is false,
+ // a 'zext' instruction will be generated; otherwise, a 'sext' will be
+ // used.
+ //
+ // TODO: We should not rely on InstCombine to rewrite the reduction in the
+ // smaller type. We should just generate a correctly typed expression
+ // to begin with.
+ Type *ComputedType;
+ std::tie(ComputedType, IsSigned) =
+ computeRecurrenceType(ExitInstruction, DB, AC, DT);
+ if (ComputedType != RecurrenceType)
return false;
+ // The recurrence expression will be represented in a narrower type. If
+ // there are any cast instructions that will be unnecessary, collect them
+ // in CastInsts. Note that the 'and' instruction was already included in
+ // this list.
+ //
+ // TODO: A better way to represent this may be to tag in some way all the
+ // instructions that are a part of the reduction. The vectorizer cost
+ // model could then apply the recurrence type to these instructions,
+ // without needing a white list of instructions to ignore.
+ collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
+ }
+
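The shrink-then-extend rewrite the comment describes can be sketched with IRBuilder; B, ExitV, RecurrenceType, and IsSigned are all assumed names here, not part of the patch:

  // Sketch: truncate the reduction's exit value to the narrow type, then
  // extend back to the original width; InstCombine later folds these away.
  Value *Trunc = B.CreateTrunc(ExitV, RecurrenceType);
  Value *Ext = IsSigned ? B.CreateSExt(Trunc, ExitV->getType())
                        : B.CreateZExt(Trunc, ExitV->getType());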
// We found a reduction var if we have reached the original phi node and we
// only have a single instruction with out-of-loop users.
@@ -480,48 +544,59 @@ bool RecurrenceDescriptor::hasMultipleUsesOf(
return false;
}
bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
- RecurrenceDescriptor &RedDes) {
+ RecurrenceDescriptor &RedDes,
+ DemandedBits *DB, AssumptionCache *AC,
+ DominatorTree *DT) {
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
bool HasFunNoNaNAttr =
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
- if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr,
- RedDes)) {
- DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes,
+ DB, AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) {
- DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes, DB,
+ AC, DT)) {
+ LLVM_DEBUG(dbgs() << "Found a float MINMAX reduction PHI." << *Phi
+ << "\n");
return true;
}
// Not a reduction of known type.
@@ -849,13 +924,13 @@ bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
}
/// This function is called when we suspect that the update-chain of a phi node
-/// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts,
-/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime
-/// predicate P under which the SCEV expression for the phi can be the
-/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the
-/// cast instructions that are involved in the update-chain of this induction.
-/// A caller that adds the required runtime predicate can be free to drop these
-/// cast instructions, and compute the phi using \p AR (instead of some scev
+/// (whose symbolic SCEV expression is in \p PhiScev) contains redundant casts,
+/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime
+/// predicate P under which the SCEV expression for the phi can be the
+/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the
+/// cast instructions that are involved in the update-chain of this induction.
+/// A caller that adds the required runtime predicate can be free to drop these
+/// cast instructions, and compute the phi using \p AR (instead of some scev
/// expression with casts).
///
/// For example, without a predicate the scev expression can take the following
@@ -890,7 +965,7 @@ static bool getCastsForInductionPHI(PredicatedScalarEvolution &PSE,
assert(PSE.getSCEV(PN) == AR && "Unexpected phi node SCEV expression");
const Loop *L = AR->getLoop();
- // Find any cast instructions that participate in the def-use chain of
+ // Find any cast instructions that participate in the def-use chain of
// PhiScev in the loop.
// FORNOW/TODO: We currently expect the def-use chain to include only
// two-operand instructions, where one of the operands is an invariant.
@@ -978,7 +1053,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
AR = PSE.getAsAddRec(Phi);
if (!AR) {
- DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
@@ -1012,14 +1087,15 @@ bool InductionDescriptor::isInductionPHI(
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
- DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
if (AR->getLoop() != TheLoop) {
// FIXME: We should treat this as a uniform. Unfortunately, we
// don't currently know how to handle uniform PHIs.
- DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
return false;
}
@@ -1100,11 +1176,12 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);
if (!NewExitBB)
- DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
- << *L << "\n");
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
else
- DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewExitBB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
return true;
};
@@ -1127,7 +1204,7 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
return Changed;
}
-/// \brief Returns the instructions that use values defined in the loop.
+/// Returns the instructions that use values defined in the loop.
SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
SmallVector<Instruction *, 8> UsedOutside;
@@ -1204,7 +1281,7 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) {
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
}
-/// \brief Find string metadata for loop
+/// Find string metadata for loop
///
/// If it has a value (e.g. {"llvm.distribute", 1}) return the value as an
/// operand or null otherwise. If the string metadata is not found return
@@ -1321,13 +1398,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
// Rewrite phis in the exit block to get their inputs from the Preheader
// instead of the exiting block.
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ for (PHINode &P : ExitBlock->phis()) {
// Set the zero'th element of Phi to be from the preheader and remove all
// other incoming values. Given the loop has dedicated exits, all other
// incoming values must be from the exiting blocks.
int PredIndex = 0;
- P->setIncomingBlock(PredIndex, Preheader);
+ P.setIncomingBlock(PredIndex, Preheader);
// Removes all incoming values from all other exiting blocks (including
// duplicate values from an exiting block).
// Nuke all entries except the zero'th entry which is the preheader entry.
@@ -1335,13 +1411,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
// below, to keep the indices valid for deletion (removeIncomingValues
// updates getNumIncomingValues and shifts all values down into the operand
// being deleted).
- for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i)
- P->removeIncomingValue(e - i, false);
+ for (unsigned i = 0, e = P.getNumIncomingValues() - 1; i != e; ++i)
+ P.removeIncomingValue(e - i, false);
- assert((P->getNumIncomingValues() == 1 &&
- P->getIncomingBlock(PredIndex) == Preheader) &&
+ assert((P.getNumIncomingValues() == 1 &&
+ P.getIncomingBlock(PredIndex) == Preheader) &&
"Should have exactly one value and that's from the preheader!");
- ++BI;
}
// Disconnect the loop body by branching directly to its exit.
@@ -1358,6 +1433,32 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
DT->deleteEdge(Preheader, L->getHeader());
}
+ // Given that LCSSA form is satisfied, we should have no users of
+ // instructions within the dead loop outside of the loop. However, LCSSA
+ // doesn't take unreachable uses into account. We handle them here.
+ // We could do this after dropping all references (in which case all users
+ // inside the loop would already be eliminated, leaving us less work), but
+ // per the API doc of User::dropAllReferences, the only valid operation
+ // after dropping references is deletion. So first substitute all uses of
+ // loop instructions with an undef value of the corresponding type.
+ for (auto *Block : L->blocks())
+ for (Instruction &I : *Block) {
+ auto *Undef = UndefValue::get(I.getType());
+ for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI;
+ ++UI;
+ if (auto *Usr = dyn_cast<Instruction>(U.getUser()))
+ if (L->contains(Usr->getParent()))
+ continue;
+ // If we have a DT, we can assert that any use outside the loop is in an
+ // unreachable block.
+ if (DT)
+ assert(!DT->isReachableFromEntry(U) &&
+ "Unexpected user in reachable block");
+ U.set(Undef);
+ }
+ }
+
// Remove the block from the reference counting scheme, so that we can
// delete it freely later.
for (auto *Block : L->blocks())
@@ -1385,54 +1486,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
}
}
-/// Returns true if the instruction in a loop is guaranteed to execute at least
-/// once.
-bool llvm::isGuaranteedToExecute(const Instruction &Inst,
- const DominatorTree *DT, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo) {
- // We have to check to make sure that the instruction dominates all
- // of the exit blocks. If it doesn't, then there is a path out of the loop
- // which does not execute this instruction, so we can't hoist it.
-
- // If the instruction is in the header block for the loop (which is very
- // common), it is always guaranteed to dominate the exit blocks. Since this
- // is a common case, and can save some work, check it now.
- if (Inst.getParent() == CurLoop->getHeader())
- // If there's a throw in the header block, we can't guarantee we'll reach
- // Inst.
- return !SafetyInfo->HeaderMayThrow;
-
- // Somewhere in this loop there is an instruction which may throw and make us
- // exit the loop.
- if (SafetyInfo->MayThrow)
- return false;
-
- // Get the exit blocks for the current loop.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- CurLoop->getExitBlocks(ExitBlocks);
-
- // Verify that the block dominates each of the exit blocks of the loop.
- for (BasicBlock *ExitBlock : ExitBlocks)
- if (!DT->dominates(Inst.getParent(), ExitBlock))
- return false;
-
- // As a degenerate case, if the loop is statically infinite then we haven't
- // proven anything since there are no exit blocks.
- if (ExitBlocks.empty())
- return false;
-
- // FIXME: In general, we have to prove that the loop isn't an infinite loop.
- // See http::llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is
- // just a special case of this.)
- return true;
-}
-
Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
// Only support loops with a unique exiting block, and a latch.
if (!L->getExitingBlock())
return None;
- // Get the branch weights for the the loop's backedge.
+ // Get the branch weights for the loop's backedge.
BranchInst *LatchBR =
dyn_cast<BranchInst>(L->getLoopLatch()->getTerminator());
if (!LatchBR || LatchBR->getNumSuccessors() != 2)
@@ -1460,7 +1519,7 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
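
The rounded division above reads more easily with numbers plugged in; which weight belongs to the backedge depends on the latch branch direction, so the roles below are assumed:

  // Sketch: rounded integer division, as in getLoopEstimatedTripCount.
  uint64_t TrueVal = 10;    // assumed exit-edge weight
  uint64_t FalseVal = 1000; // assumed backedge weight
  uint64_t Est = (FalseVal + (TrueVal / 2)) / TrueVal; // (1000 + 5) / 10 = 100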
-/// \brief Adds a 'fast' flag to floating point operations.
+/// Adds a 'fast' flag to floating point operations.
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
@@ -1470,6 +1529,38 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+ unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+
+ // Extract and apply reduction ops in ascending order:
+ // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+ Value *Result = Acc;
+ for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+ Value *Ext =
+ Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+ "bin.rdx");
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+ Ext);
+ }
+
+ if (!RedOps.empty())
+ propagateIRFlags(Result, RedOps);
+ }
+
+ return Result;
+}
+
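A hedged usage sketch for the new helper; Builder, Acc (a scalar), and Src (a <4 x i32> vector) are assumed, and RedOps is left empty:

  // Sketch: serial add reduction, i.e. ((((Acc + V0) + V1) + V2) + V3).
  Value *Red = llvm::getOrderedReduction(
      Builder, Acc, Src, Instruction::Add,
      RecurrenceDescriptor::MRK_Invalid, /*RedOps=*/{});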
// Helper to generate a log2 shuffle reduction.
Value *
llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp
index 29756d9dab7f..abbcd5f9e3b8 100644
--- a/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/lib/Transforms/Utils/LoopVersioning.cpp
@@ -140,9 +140,12 @@ void LoopVersioning::addPHINodes(
if (!PN) {
PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
&PHIBlock->front());
- for (auto *User : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
- User->replaceUsesOfWith(Inst, PN);
+ SmallVector<User*, 8> UsersToUpdate;
+ for (User *U : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
+ UsersToUpdate.push_back(U);
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(Inst, PN);
PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
}
}
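
The rewrite snapshots the users before calling replaceUsesOfWith, because that call mutates the use-list being walked. The pattern in isolation, with Inst and Replacement assumed:

  // Sketch: collect users first; replaceUsesOfWith invalidates use iterators.
  SmallVector<User *, 8> Users(Inst->user_begin(), Inst->user_end());
  for (User *U : Users)
    U->replaceUsesOfWith(Inst, Replacement);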
@@ -248,7 +251,7 @@ void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
}
namespace {
-/// \brief Also expose this is a pass. Currently this is only used for
+/// Also expose this as a pass. Currently this is only used for
/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
/// array accesses from the loop.
class LoopVersioningPass : public FunctionPass {
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index ee84541e526d..c852d538b0d1 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -21,7 +21,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
#define DEBUG_TYPE "lowerinvoke"
@@ -48,10 +48,12 @@ static bool runImpl(Function &F) {
bool Changed = false;
for (BasicBlock &BB : F)
if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
- SmallVector<Value *, 16> CallArgs(II->op_begin(), II->op_end() - 3);
+ SmallVector<Value *, 16> CallArgs(II->arg_begin(), II->arg_end());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
// Insert a normal call instruction...
CallInst *NewCall =
- CallInst::Create(II->getCalledValue(), CallArgs, "", II);
+ CallInst::Create(II->getCalledValue(), CallArgs, OpBundles, "", II);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setAttributes(II->getAttributes());
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 57dc225e9dab..03006ef3a2d3 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -409,8 +409,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ CI,
- /* SrcAlign */ Memcpy->getAlignment(),
- /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcAlign */ Memcpy->getSourceAlignment(),
+ /* DestAlign */ Memcpy->getDestAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransformInfo */ TTI);
@@ -419,8 +419,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ Memcpy->getLength(),
- /* SrcAlign */ Memcpy->getAlignment(),
- /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcAlign */ Memcpy->getSourceAlignment(),
+ /* DestAlign */ Memcpy->getDestAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransformInfo */ TTI);
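
These accessors reflect that memory intrinsics in this LLVM version carry an independent alignment on each pointer operand rather than one shared value; a hedged sketch, with MC an assumed MemCpyInst *:

  // Sketch: per-operand alignments on a memory transfer intrinsic.
  unsigned DstAlign = MC->getDestAlignment();   // align on the dest pointer
  unsigned SrcAlign = MC->getSourceAlignment(); // align on the source pointer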
@@ -432,8 +432,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
/* SrcAddr */ Memmove->getRawSource(),
/* DstAddr */ Memmove->getRawDest(),
/* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getAlignment(),
- /* DestAlign */ Memmove->getAlignment(),
+ /* SrcAlign */ Memmove->getSourceAlignment(),
+ /* DestAlign */ Memmove->getDestAlignment(),
/* SrcIsVolatile */ Memmove->isVolatile(),
/* DstIsVolatile */ Memmove->isVolatile());
}
@@ -443,6 +443,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
/* DstAddr */ Memset->getRawDest(),
/* CopyLen */ Memset->getLength(),
/* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getAlignment(),
+ /* Alignment */ Memset->getDestAlignment(),
Memset->isVolatile());
}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 344cb35df986..e99ecfef19cd 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -29,7 +29,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
@@ -74,7 +74,7 @@ namespace {
LowerSwitch() : FunctionPass(ID) {
initializeLowerSwitchPass(*PassRegistry::getPassRegistry());
- }
+ }
bool runOnFunction(Function &F) override;
@@ -155,11 +155,8 @@ bool LowerSwitch::runOnFunction(Function &F) {
}
/// Used for debugging purposes.
-static raw_ostream& operator<<(raw_ostream &O,
- const LowerSwitch::CaseVector &C)
- LLVM_ATTRIBUTE_USED;
-
-static raw_ostream& operator<<(raw_ostream &O,
+LLVM_ATTRIBUTE_USED
+static raw_ostream &operator<<(raw_ostream &O,
const LowerSwitch::CaseVector &C) {
O << "[";
@@ -172,7 +169,7 @@ static raw_ostream& operator<<(raw_ostream &O,
return O << "]";
}
-/// \brief Update the first occurrence of the "switch statement" BB in the PHI
+/// Update the first occurrence of the "switch statement" BB in the PHI
/// node with the "new" BB. The other occurrences will:
///
/// 1) Be updated by subsequent calls to this function. Switch statements may
@@ -245,14 +242,13 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
unsigned Mid = Size / 2;
std::vector<CaseRange> LHS(Begin, Begin + Mid);
- DEBUG(dbgs() << "LHS: " << LHS << "\n");
+ LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
std::vector<CaseRange> RHS(Begin + Mid, End);
- DEBUG(dbgs() << "RHS: " << RHS << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
CaseRange &Pivot = *(Begin + Mid);
- DEBUG(dbgs() << "Pivot ==> "
- << Pivot.Low->getValue()
- << " -" << Pivot.High->getValue() << "\n");
+ LLVM_DEBUG(dbgs() << "Pivot ==> " << Pivot.Low->getValue() << " -"
+ << Pivot.High->getValue() << "\n");
// NewLowerBound here should never be the integer minimal value.
// This is because it is computed from a case range that is never
@@ -274,20 +270,14 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
NewUpperBound = LHS.back().High;
}
- DEBUG(dbgs() << "LHS Bounds ==> ";
- if (LowerBound) {
- dbgs() << LowerBound->getSExtValue();
- } else {
- dbgs() << "NONE";
- }
- dbgs() << " - " << NewUpperBound->getSExtValue() << "\n";
- dbgs() << "RHS Bounds ==> ";
- dbgs() << NewLowerBound->getSExtValue() << " - ";
- if (UpperBound) {
- dbgs() << UpperBound->getSExtValue() << "\n";
- } else {
- dbgs() << "NONE\n";
- });
+ LLVM_DEBUG(dbgs() << "LHS Bounds ==> "; if (LowerBound) {
+ dbgs() << LowerBound->getSExtValue();
+ } else { dbgs() << "NONE"; } dbgs() << " - "
+ << NewUpperBound->getSExtValue() << "\n";
+ dbgs() << "RHS Bounds ==> ";
+ dbgs() << NewLowerBound->getSExtValue() << " - "; if (UpperBound) {
+ dbgs() << UpperBound->getSExtValue() << "\n";
+ } else { dbgs() << "NONE\n"; });
// Create a new node that checks if the value is < pivot. Go to the
// left branch if it is and right branch if not.
@@ -337,7 +327,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
} else if (Leaf.Low->isZero()) {
// Val >= 0 && Val <= Hi --> Val <=u Hi
Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
- "SwitchLeaf");
+ "SwitchLeaf");
} else {
// Emit V-Lo <=u Hi-Lo
Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
@@ -364,7 +354,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
for (uint64_t j = 0; j < Range; ++j) {
PN->removeIncomingValue(OrigBlock);
}
-
+
int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
assert(BlockIdx != -1 && "Switch didn't go to this successor??");
PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
@@ -382,7 +372,7 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
Case.getCaseSuccessor()));
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
+ llvm::sort(Cases.begin(), Cases.end(), CaseCmp());
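This and the later std::sort to llvm::sort changes are behavior-neutral for valid comparators; as I understand it, llvm::sort additionally shuffles its input in expensive-checks builds to expose comparators that depend on input order or pointer values. It is interface-compatible:

  // Sketch: llvm::sort is a drop-in for std::sort.
  SmallVector<int, 4> V{3, 1, 2};
  llvm::sort(V.begin(), V.end()); // shuffled first under EXPENSIVE_CHECKS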
// Merge case into clusters
if (Cases.size() >= 2) {
@@ -443,9 +433,9 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
// Prepare cases vector.
CaseVector Cases;
unsigned numCmps = Clusterify(Cases, SI);
- DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
- << ". Total compares: " << numCmps << "\n");
- DEBUG(dbgs() << "Cases: " << Cases << "\n");
+ LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total compares: " << numCmps << "\n");
+ LLVM_DEBUG(dbgs() << "Cases: " << Cases << "\n");
(void)numCmps;
ConstantInt *LowerBound = nullptr;
@@ -505,6 +495,10 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
}
#endif
+ // As the default block in the switch is unreachable, update the PHI nodes
+ // (remove the entry to the default block) to reflect this.
+ Default->removePredecessor(OrigBlock);
+
// Use the most popular block as the new default, reducing the number of
// cases.
assert(MaxPop > 0 && PopSucc);
@@ -518,29 +512,33 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
if (Cases.empty()) {
BranchInst::Create(Default, CurBlock);
SI->eraseFromParent();
+ // As all the cases have been replaced with a single branch, only keep
+ // one entry in the PHI nodes.
+ for (unsigned I = 0; I < (MaxPop - 1); ++I)
+ PopSucc->removePredecessor(OrigBlock);
return;
}
}
+ unsigned NrOfDefaults = (SI->getDefaultDest() == Default) ? 1 : 0;
+ for (const auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == Default)
+ NrOfDefaults++;
+
// Create a new, empty default block so that the new hierarchy of
// if-then statements goes to this and the PHI nodes are happy.
BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
BranchInst::Create(Default, NewDefault);
- // If there is an entry in any PHI nodes for the default edge, make sure
- // to update them as well.
- for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
- assert(BlockIdx != -1 && "Switch didn't go to this successor??");
- PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
- }
-
BasicBlock *SwitchBlock =
switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+ // If there are entries in any PHI nodes for the default edge, make sure
+ // to update them as well.
+ fixPhis(Default, OrigBlock, NewDefault, NrOfDefaults);
+
// Branch to our shiny new if-then stuff...
BranchInst::Create(SwitchBlock, OrigBlock);
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index 29f289b62da0..23145e584751 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -22,7 +22,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index 0f7bd76c03ca..323f2552ca80 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -29,7 +29,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
diff --git a/lib/Transforms/Utils/OrderedInstructions.cpp b/lib/Transforms/Utils/OrderedInstructions.cpp
index dc780542ce68..6d0b96f6aa8a 100644
--- a/lib/Transforms/Utils/OrderedInstructions.cpp
+++ b/lib/Transforms/Utils/OrderedInstructions.cpp
@@ -14,19 +14,38 @@
#include "llvm/Transforms/Utils/OrderedInstructions.h"
using namespace llvm;
+bool OrderedInstructions::localDominates(const Instruction *InstA,
+ const Instruction *InstB) const {
+ assert(InstA->getParent() == InstB->getParent() &&
+ "Instructions must be in the same basic block");
+
+ const BasicBlock *IBB = InstA->getParent();
+ auto OBB = OBBMap.find(IBB);
+ if (OBB == OBBMap.end())
+ OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
+ return OBB->second->dominates(InstA, InstB);
+}
+
/// Given 2 instructions, use OrderedBasicBlock to check for dominance relation
/// if the instructions are in the same basic block. Otherwise, use dominator
/// tree.
bool OrderedInstructions::dominates(const Instruction *InstA,
const Instruction *InstB) const {
- const BasicBlock *IBB = InstA->getParent();
// Use ordered basic block to do dominance check in case the 2 instructions
// are in the same basic block.
- if (IBB == InstB->getParent()) {
- auto OBB = OBBMap.find(IBB);
- if (OBB == OBBMap.end())
- OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
- return OBB->second->dominates(InstA, InstB);
- }
+ if (InstA->getParent() == InstB->getParent())
+ return localDominates(InstA, InstB);
return DT->dominates(InstA->getParent(), InstB->getParent());
}
+
+bool OrderedInstructions::dfsBefore(const Instruction *InstA,
+ const Instruction *InstB) const {
+ // Use ordered basic block in case the 2 instructions are in the same basic
+ // block.
+ if (InstA->getParent() == InstB->getParent())
+ return localDominates(InstA, InstB);
+
+ DomTreeNode *DA = DT->getNode(InstA->getParent());
+ DomTreeNode *DB = DT->getNode(InstB->getParent());
+ return DA->getDFSNumIn() < DB->getDFSNumIn();
+}
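
A hedged usage sketch; DT, InstA, and InstB are assumed. dfsBefore is a cheaper ordering query than dominates(): across blocks it compares dominator-tree DFS-in numbers rather than answering true dominance.

  // Sketch: order two instructions without a full dominance query.
  OrderedInstructions OI(&DT);
  bool AFirst = OI.dfsBefore(InstA, InstB); // InstA precedes InstB in DFS order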
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
index d47be6ea566b..2923977b791a 100644
--- a/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -24,6 +25,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -32,7 +34,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/OrderedInstructions.h"
#include <algorithm>
#define DEBUG_TYPE "predicateinfo"
@@ -118,7 +120,7 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
return false;
if (ArgA && ArgB)
return ArgA->getArgNo() < ArgB->getArgNo();
- return OI.dominates(cast<Instruction>(A), cast<Instruction>(B));
+ return OI.dfsBefore(cast<Instruction>(A), cast<Instruction>(B));
}
// This compares ValueDFS structures, creating OrderedBasicBlocks where
@@ -479,6 +481,19 @@ void PredicateInfo::buildPredicateInfo() {
renameUses(OpsToRename);
}
+// Create a ssa_copy declaration with custom mangling, because
+// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
+// all unnamed types get mangled to the same string. We use the pointer
+// to the type as name here, as it guarantees unique names for different
+// types and we remove the declarations when destroying PredicateInfo.
+// It is a workaround for PR38117, because solving it in a fully general way is
+// tricky (FIXME).
+static Function *getCopyDeclaration(Module *M, Type *Ty) {
+ std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
+ return cast<Function>(M->getOrInsertFunction(
+ Name, getType(M->getContext(), Intrinsic::ssa_copy, Ty)));
+}
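
A hedged illustration of the collision being worked around; M, TyA, and TyB are assumed, with TyA and TyB distinct unnamed struct types:

  // Sketch: keying the declaration name on the Type* yields one function per
  // type, where the generic mangling would collapse both to the same name.
  Function *CopyA = getCopyDeclaration(M, TyA);
  Function *CopyB = getCopyDeclaration(M, TyB);
  assert(CopyA != CopyB && "one ssa_copy declaration per type");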
+
// Given the renaming stack, make all the operands currently on the stack real
// by inserting them into the IR. Return the last operation's value.
Value *PredicateInfo::materializeStack(unsigned int &Counter,
@@ -507,8 +522,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
// order in the case of multiple predicateinfo in the same block.
if (isa<PredicateWithEdge>(ValInfo)) {
IRBuilder<> B(getBranchTerminator(ValInfo));
- Function *IF = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->user_begin() == IF->user_end())
+ CreatedDeclarations.insert(IF);
CallInst *PIC =
B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
PredicateMap.insert({PIC, ValInfo});
@@ -518,8 +534,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
assert(PAssume &&
"Should not have gotten here without it being an assume");
IRBuilder<> B(PAssume->AssumeInst);
- Function *IF = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->user_begin() == IF->user_end())
+ CreatedDeclarations.insert(IF);
CallInst *PIC = B.CreateCall(IF, Op);
PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
@@ -553,10 +570,11 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
auto Comparator = [&](const Value *A, const Value *B) {
return valueComesBefore(OI, A, B);
};
- std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
+ llvm::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
ValueDFS_Compare Compare(OI);
// Compute liveness, and rename in O(uses) per Op.
for (auto *Op : OpsToRename) {
+ LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
unsigned Counter = 0;
SmallVector<ValueDFS, 16> OrderedUses;
const auto &ValueInfo = getValueInfo(Op);
@@ -625,15 +643,15 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
// we want to.
bool PossibleCopy = VD.PInfo != nullptr;
if (RenameStack.empty()) {
- DEBUG(dbgs() << "Rename Stack is empty\n");
+ LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
} else {
- DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
- << RenameStack.back().DFSIn << ","
- << RenameStack.back().DFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
}
- DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
- << VD.DFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
bool ShouldPush = (VD.Def || PossibleCopy);
bool OutOfScope = !stackIsInScope(RenameStack, VD);
@@ -652,7 +670,7 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
if (VD.Def || PossibleCopy)
continue;
if (!DebugCounter::shouldExecute(RenameCounter)) {
- DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
continue;
}
ValueDFS &Result = RenameStack.back();
@@ -663,8 +681,9 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
if (!Result.Def)
Result.Def = materializeStack(Counter, RenameStack, Op);
- DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
- << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+ LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser())
+ << "\n");
assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
"Predicateinfo def should have dominated this use");
VD.U->set(Result.Def);
@@ -702,7 +721,22 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
buildPredicateInfo();
}
-PredicateInfo::~PredicateInfo() {}
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+ // Collect function pointers in a set first, as SmallSet uses a SmallVector
+ // internally and we have to remove the asserting value handles first.
+ SmallPtrSet<Function *, 20> FunctionPtrs;
+ for (auto &F : CreatedDeclarations)
+ FunctionPtrs.insert(&*F);
+ CreatedDeclarations.clear();
+
+ for (Function *F : FunctionPtrs) {
+ assert(F->user_begin() == F->user_end() &&
+ "PredicateInfo consumer did not remove all SSA copies.");
+ F->eraseFromParent();
+ }
+}
void PredicateInfo::verifyPredicateInfo() const {}
@@ -720,6 +754,20 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();
}
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+ for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction *Inst = &*I++;
+ const auto *PI = PredInfo.getPredicateInfoFor(Inst);
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+
+ Inst->replaceAllUsesWith(II->getOperand(0));
+ Inst->eraseFromParent();
+ }
+}
+
bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -727,6 +775,8 @@ bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
PredInfo->print(dbgs());
if (VerifyPredicateInfo)
PredInfo->verifyPredicateInfo();
+
+ replaceCreatedSSACopys(*PredInfo, F);
return false;
}
@@ -735,12 +785,14 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
OS << "PredicateInfo for function: " << F.getName() << "\n";
- make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+ auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(OS);
+ replaceCreatedSSACopys(*PredInfo, F);
return PreservedAnalyses::all();
}
-/// \brief An assembly annotator class to print PredicateInfo information in
+/// An assembly annotator class to print PredicateInfo information in
/// comments.
class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
friend class PredicateInfo;
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index fcd3bd08482a..86e15bbd7f22 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -45,7 +46,6 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -164,26 +164,27 @@ struct AllocaInfo {
}
};
-// Data package used by RenamePass()
-class RenamePassData {
-public:
+/// Data package used by RenamePass().
+struct RenamePassData {
using ValVector = std::vector<Value *>;
+ using LocationVector = std::vector<DebugLoc>;
- RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V)
- : BB(B), Pred(P), Values(std::move(V)) {}
+ RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
+ : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
BasicBlock *BB;
BasicBlock *Pred;
ValVector Values;
+ LocationVector Locations;
};
-/// \brief This assigns and keeps a per-bb relative ordering of load/store
+/// This assigns and keeps a per-bb relative ordering of load/store
/// instructions in the block that directly load or store an alloca.
///
/// This functionality is important because it avoids scanning large basic
/// blocks multiple times when promoting many allocas in the same block.
class LargeBlockInfo {
- /// \brief For each instruction that we track, keep the index of the
+ /// For each instruction that we track, keep the index of the
/// instruction.
///
/// The index starts out as the number of the instruction from the start of
@@ -242,7 +243,7 @@ struct PromoteMem2Reg {
/// Reverse mapping of Allocas.
DenseMap<AllocaInst *, unsigned> AllocaLookup;
- /// \brief The PhiNodes we're adding.
+ /// The PhiNodes we're adding.
///
/// That map is used to simplify some Phi nodes as we iterate over it, so
/// it should have deterministic iterators. We could use a MapVector, but
@@ -294,7 +295,7 @@ private:
unsigned getNumPreds(const BasicBlock *BB) {
unsigned &NP = BBNumPreds[BB];
if (NP == 0)
- NP = std::distance(pred_begin(BB), pred_end(BB)) + 1;
+ NP = pred_size(BB) + 1;
return NP - 1;
}
@@ -303,6 +304,7 @@ private:
SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
void RenamePass(BasicBlock *BB, BasicBlock *Pred,
RenamePassData::ValVector &IncVals,
+ RenamePassData::LocationVector &IncLocs,
std::vector<RenamePassData> &Worklist);
bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
};
@@ -345,7 +347,7 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
}
}
-/// \brief Rewrite as many loads as possible given a single store.
+/// Rewrite as many loads as possible given a single store.
///
/// When there is only a single store, we can use the domtree to trivially
/// replace all of the dominated loads with the stored value. Do so, and return
@@ -475,7 +477,7 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
// Sort the stores by their index, making it efficient to do a lookup with a
// binary search.
- std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
+ llvm::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
// Walk all of the loads from this alloca, replacing them with the nearest
// store above them, if any.
@@ -509,6 +511,11 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
!isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
addAssumeNonNull(AC, LI);
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
LI->replaceAllUsesWith(ReplVal);
}
@@ -631,10 +638,10 @@ void PromoteMem2Reg::run() {
SmallVector<BasicBlock *, 32> PHIBlocks;
IDF.calculate(PHIBlocks);
if (PHIBlocks.size() > 1)
- std::sort(PHIBlocks.begin(), PHIBlocks.end(),
- [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.lookup(A) < BBNumbers.lookup(B);
- });
+ llvm::sort(PHIBlocks.begin(), PHIBlocks.end(),
+ [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+ });
unsigned CurrentVersion = 0;
for (BasicBlock *BB : PHIBlocks)
@@ -653,15 +660,20 @@ void PromoteMem2Reg::run() {
for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+ // When handling debug info, treat all incoming values as if they have unknown
+ // locations until proven otherwise.
+ RenamePassData::LocationVector Locations(Allocas.size());
+
// Walks all basic blocks in the function performing the SSA rename algorithm
// and inserting the phi nodes we marked as necessary
std::vector<RenamePassData> RenamePassWorkList;
- RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values));
+ RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+ std::move(Locations));
do {
RenamePassData RPD = std::move(RenamePassWorkList.back());
RenamePassWorkList.pop_back();
// RenamePass may add new worklist entries.
- RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
} while (!RenamePassWorkList.empty());
// The renamer uses the Visited set to avoid infinite loops. Clear it now.
@@ -740,7 +752,7 @@ void PromoteMem2Reg::run() {
// Ok, now we know that all of the PHI nodes are missing entries for some
// basic blocks. Start by sorting the incoming predecessors for efficient
// access.
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
// Now we loop through all BB's which have entries in SomePHI and remove
// them from the Preds list.
@@ -772,7 +784,7 @@ void PromoteMem2Reg::run() {
NewPhiNodes.clear();
}
-/// \brief Determine which blocks the value is live in.
+/// Determine which blocks the value is live in.
///
/// These are blocks which lead to uses. Knowing this allows us to avoid
/// inserting PHI nodes into blocks which don't lead to uses (thus, the
@@ -846,7 +858,7 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
}
}
-/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca.
+/// Queue a phi-node to be added to a basic-block for a specific Alloca.
///
/// Returns true if there wasn't already a phi-node for that variable
bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
@@ -868,13 +880,24 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
return true;
}
-/// \brief Recursively traverse the CFG of the function, renaming loads and
+/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
+/// create a merged location incorporating \p DL, or to set \p DL directly.
+static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
+ bool ApplyMergedLoc) {
+ if (ApplyMergedLoc)
+ PN->applyMergedLocation(PN->getDebugLoc(), DL);
+ else
+ PN->setDebugLoc(DL);
+}
+
+/// Recursively traverse the CFG of the function, renaming loads and
/// stores to the allocas which we are promoting.
///
/// IncomingVals indicates what value each Alloca contains on exit from the
/// predecessor block Pred.
void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
RenamePassData::ValVector &IncomingVals,
+ RenamePassData::LocationVector &IncomingLocs,
std::vector<RenamePassData> &Worklist) {
NextIteration:
// If we are inserting any phi nodes into this BB, they will already be in the
@@ -899,6 +922,10 @@ NextIteration:
do {
unsigned AllocaNo = PhiToAllocaMap[APN];
+ // Update the location of the phi node.
+ updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
+ APN->getNumIncomingValues() > 0);
+
// Add N incoming values to the PHI node.
for (unsigned i = 0; i != NumEdges; ++i)
APN->addIncoming(IncomingVals[AllocaNo], Pred);
@@ -960,8 +987,11 @@ NextIteration:
continue;
// what value were we writing?
- IncomingVals[ai->second] = SI->getOperand(0);
+ unsigned AllocaNo = ai->second;
+ IncomingVals[AllocaNo] = SI->getOperand(0);
+
// Record debuginfo for the store before removing it.
+ IncomingLocs[AllocaNo] = SI->getDebugLoc();
for (DbgInfoIntrinsic *DII : AllocaDbgDeclares[ai->second])
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
BB->getInstList().erase(SI);
@@ -984,7 +1014,7 @@ NextIteration:
for (; I != E; ++I)
if (VisitedSuccs.insert(*I).second)
- Worklist.emplace_back(*I, Pred, IncomingVals);
+ Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
goto NextIteration;
}
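
The LocationVector added above threads one DebugLoc per alloca through the rename worklist, captured from the store that last defined the value, so the phis mem2reg inserts end up with plausible source locations. A minimal sketch restating the policy of updateForIncomingValueLocation: the first incoming edge adopts the store's location verbatim, and every later edge merges, which typically degrades to an unknown location when the incoming locations disagree.

    #include "llvm/IR/DebugLoc.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static void propagateLocation(PHINode *PN, DebugLoc Incoming) {
      if (PN->getNumIncomingValues() > 0)
        PN->applyMergedLocation(PN->getDebugLoc(), Incoming); // merge
      else
        PN->setDebugLoc(Incoming); // first edge: take it as-is
    }
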
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index e4b20b0faa15..ca184ed7c4e3 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -147,11 +147,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
if (isa<PHINode>(BB->begin())) {
SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),
PredValues.end());
- PHINode *SomePHI;
- for (BasicBlock::iterator It = BB->begin();
- (SomePHI = dyn_cast<PHINode>(It)); ++It) {
- if (IsEquivalentPHI(SomePHI, ValueMapping))
- return SomePHI;
+ for (PHINode &SomePHI : BB->phis()) {
+ if (IsEquivalentPHI(&SomePHI, ValueMapping))
+ return &SomePHI;
}
}
@@ -180,7 +178,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
// If the client wants to know about all new instructions, tell it.
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
- DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
return InsertedPHI;
}
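
This hunk, like many below in SimplifyCFG.cpp, replaces the cast-until-failure walk over a block's leading phi nodes with the BasicBlock::phis() range. A small before/after sketch of the idiom, with a stand-in body:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    // Old idiom: dyn_cast each instruction until the first non-phi.
    static unsigned countPhisOld(BasicBlock *BB) {
      unsigned N = 0;
      PHINode *PN;
      for (BasicBlock::iterator It = BB->begin();
           (PN = dyn_cast<PHINode>(It)); ++It)
        ++N;
      return N;
    }

    // New idiom: the range covers exactly the leading phi nodes.
    static unsigned countPhisNew(BasicBlock *BB) {
      unsigned N = 0;
      for (PHINode &PN : BB->phis()) {
        (void)PN;
        ++N;
      }
      return N;
    }
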
diff --git a/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/lib/Transforms/Utils/SSAUpdaterBulk.cpp
new file mode 100644
index 000000000000..397bac2940a4
--- /dev/null
+++ b/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -0,0 +1,191 @@
+//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdaterBulk class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdaterbulk"
+
+/// Helper function for finding a block which should have a value for the given
+/// user. For PHI-nodes this block is the corresponding predecessor, for other
+/// instructions it's their parent block.
+static BasicBlock *getUserBB(Use *U) {
+ auto *User = cast<Instruction>(U->getUser());
+
+ if (auto *UserPN = dyn_cast<PHINode>(User))
+ return UserPN->getIncomingBlock(*U);
+ else
+ return User->getParent();
+}
+
+/// Add a new variable to the SSA rewriter. This needs to be called before
+/// AddAvailableValue or AddUse calls.
+unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
+ unsigned Var = Rewrites.size();
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
+ << *Ty << ", Name = " << Name << "\n");
+ RewriteInfo RI(Name, Ty);
+ Rewrites.push_back(RI);
+ return Var;
+}
+
+/// Indicate that a rewritten value is available in the specified block with the
+/// specified value.
+void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
+ << ": added new available value" << *V << " in "
+ << BB->getName() << "\n");
+ Rewrites[Var].Defines[BB] = V;
+}
+
+/// Record a use of the symbolic value. This use will be updated with a
+/// rewritten value when RewriteAllUses is called.
+void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
+ << " in " << getUserBB(U)->getName() << "\n");
+ Rewrites[Var].Uses.push_back(U);
+}
+
+/// Return true if the SSAUpdater already has a value for the specified variable
+/// in the specified block.
+bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
+ return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
+}
+
+// Compute the value at the given block BB. We should either already know it,
+// or be able to reach it recursively by walking up the dominator tree.
+Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
+ DominatorTree *DT) {
+ if (!R.Defines.count(BB)) {
+ if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
+ BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
+ Value *V = computeValueAt(IDom, R, DT);
+ R.Defines[BB] = V;
+ } else
+ R.Defines[BB] = UndefValue::get(R.Ty);
+ }
+ return R.Defines[BB];
+}
+
+/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
+/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
+static void
+ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
+ PredIteratorCache &PredCache) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
+ UsingBlocks.end());
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+  // live into it too. Add the preds to the worklist unless they are a
+ // defining block.
+ for (BasicBlock *P : PredCache.get(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Perform all the necessary updates, including new PHI-nodes insertion and the
+/// requested uses update.
+void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
+ SmallVectorImpl<PHINode *> *InsertedPHIs) {
+ for (auto &R : Rewrites) {
+ // Compute locations for new phi-nodes.
+ // For that we need to initialize DefBlocks from definitions in R.Defines,
+ // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
+ // this set for computing iterated dominance frontier (IDF).
+ // The IDF blocks are the blocks where we need to insert new phi-nodes.
+ ForwardIDFCalculator IDF(*DT);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
+ << " use(s)\n");
+
+ SmallPtrSet<BasicBlock *, 2> DefBlocks;
+ for (auto &Def : R.Defines)
+ DefBlocks.insert(Def.first);
+ IDF.setDefiningBlocks(DefBlocks);
+
+ SmallPtrSet<BasicBlock *, 2> UsingBlocks;
+ for (Use *U : R.Uses)
+ UsingBlocks.insert(getUserBB(U));
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
+ IDF.resetLiveInBlocks();
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.calculate(IDFBlocks);
+
+ // We've computed IDF, now insert new phi-nodes there.
+ SmallVector<PHINode *, 4> InsertedPHIsForVar;
+ for (auto *FrontierBB : IDFBlocks) {
+ IRBuilder<> B(FrontierBB, FrontierBB->begin());
+ PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
+ R.Defines[FrontierBB] = PN;
+ InsertedPHIsForVar.push_back(PN);
+ if (InsertedPHIs)
+ InsertedPHIs->push_back(PN);
+ }
+
+ // Fill in arguments of the inserted PHIs.
+ for (auto *PN : InsertedPHIsForVar) {
+ BasicBlock *PBB = PN->getParent();
+ for (BasicBlock *Pred : PredCache.get(PBB))
+ PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
+ }
+
+ // Rewrite actual uses with the inserted definitions.
+ SmallPtrSet<Use *, 4> ProcessedUses;
+ for (Use *U : R.Uses) {
+ if (!ProcessedUses.insert(U).second)
+ continue;
+ Value *V = computeValueAt(getUserBB(U), R, DT);
+ Value *OldVal = U->get();
+ assert(OldVal && "Invalid use!");
+      // Notify the users of the existing value that it is being replaced.
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
+ << "\n");
+ U->set(V);
+ }
+ }
+}
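
Taken together, the new API reads: register each variable, declare which blocks provide a definition, record the uses to rewrite (for a use in a phi the relevant block is the incoming predecessor, per getUserBB), then let RewriteAllUses place phis on the iterated dominance frontier in one shot. A hypothetical single-variable driver; the function and variable names are illustrative, not part of the patch:

    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
    using namespace llvm;

    // Make uses of OldDef observe NewVal on paths through NewBB; any phis
    // needed to merge the two definitions are inserted automatically.
    static void rewriteOneVariable(Instruction *OldDef, BasicBlock *NewBB,
                                   Value *NewVal, DominatorTree &DT) {
      SSAUpdaterBulk Updater;
      unsigned Var = Updater.AddVariable(OldDef->getName(), OldDef->getType());
      Updater.AddAvailableValue(Var, OldDef->getParent(), OldDef);
      Updater.AddAvailableValue(Var, NewBB, NewVal);
      for (Use &U : OldDef->uses())
        Updater.AddUse(Var, &U);
      Updater.RewriteAllUses(&DT, /*InsertedPHIs=*/nullptr);
    }
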
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index e7358dbcb624..c87b5c16ffce 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -28,6 +27,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -66,7 +66,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -283,12 +282,8 @@ isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2,
/// of Succ.
static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
BasicBlock *ExistPred) {
- if (!isa<PHINode>(Succ->begin()))
- return; // Quick exit if nothing to do
-
- PHINode *PN;
- for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I)
- PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+ for (PHINode &PN : Succ->phis())
+ PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);
}
/// Compute an abstract "cost" of speculating the given instruction,
@@ -692,9 +687,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
// Do not permit merging of large switch instructions into their
// predecessors unless there is only one predecessor.
- if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
- pred_end(SI->getParent())) <=
- 128)
+ if (SI->getNumSuccessors() * pred_size(SI->getParent()) <= 128)
CV = SI->getCondition();
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
if (BI->isConditional() && BI->getCondition()->hasOneUse())
@@ -851,9 +844,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
// Remove PHI node entries for the dead edge.
ThisCases[0].Dest->removePredecessor(TI->getParent());
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
@@ -865,8 +858,8 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
DeadCases.insert(PredCases[i].Value);
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI);
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI);
// Collect branch weights into a vector.
SmallVector<uint32_t, 8> Weights;
@@ -892,7 +885,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
if (HasWeight && Weights.size() >= 2)
setBranchWeights(SI, Weights);
- DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+ LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
}
@@ -933,9 +926,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
Instruction *NI = Builder.CreateBr(TheRealDest);
(void)NI;
- DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
@@ -1228,11 +1221,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
Instruction *I1, Instruction *I2) {
for (BasicBlock *Succ : successors(BB1)) {
- PHINode *PN;
- for (BasicBlock::iterator BBI = Succ->begin();
- (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
- Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ for (const PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
return false;
}
@@ -1282,34 +1273,58 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (isa<TerminatorInst>(I1))
goto HoistTerminator;
+ // If we're going to hoist a call, make sure that the two instructions we're
+ // commoning/hoisting are both marked with musttail, or neither of them is
+ // marked as such. Otherwise, we might end up in a situation where we hoist
+ // from a block where the terminator is a `ret` to a block where the terminator
+ // is a `br`, and `musttail` calls expect to be followed by a return.
+ auto *C1 = dyn_cast<CallInst>(I1);
+ auto *C2 = dyn_cast<CallInst>(I2);
+ if (C1 && C2)
+ if (C1->isMustTailCall() != C2->isMustTailCall())
+ return Changed;
+
if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
return Changed;
- // For a normal instruction, we just move one to right before the branch,
- // then replace all uses of the other with the first. Finally, we remove
- // the now redundant second instruction.
- BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1);
- if (!I2->use_empty())
- I2->replaceAllUsesWith(I1);
- I1->andIRFlags(I2);
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_mem_parallel_loop_access};
- combineMetadata(I1, I2, KnownIDs);
-
- // I1 and I2 are being combined into a single instruction. Its debug
- // location is the merged locations of the original instructions.
- I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- I2->eraseFromParent();
- Changed = true;
+ if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
+    assert(isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+ // The debug location is an integral part of a debug info intrinsic
+ // and can't be separated from it or replaced. Instead of attempting
+ // to merge locations, simply hoist both copies of the intrinsic.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB2->getInstList(), I2);
+ Changed = true;
+ } else {
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->andIRFlags(I2);
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access};
+ combineMetadata(I1, I2, KnownIDs);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ I2->eraseFromParent();
+ Changed = true;
+ }
I1 = &*BB1_Itr++;
I2 = &*BB2_Itr++;
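
The new musttail guard deserves a note: the verifier requires a musttail call to be followed by a return (modulo bitcasts), so hoisting one call of a mixed musttail/non-musttail pair into BIParent, whose terminator is the conditional branch, would yield invalid IR. The check restated as a predicate (the helper name is hypothetical):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static bool musttailCompatible(const Instruction *I1,
                                   const Instruction *I2) {
      const auto *C1 = dyn_cast<CallInst>(I1);
      const auto *C2 = dyn_cast<CallInst>(I2);
      if (C1 && C2)
        return C1->isMustTailCall() == C2->isMustTailCall();
      return true; // Not both calls: the musttail constraint cannot be broken.
    }
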
@@ -1332,18 +1347,16 @@ HoistTerminator:
return Changed;
for (BasicBlock *Succ : successors(BB1)) {
- PHINode *PN;
- for (BasicBlock::iterator BBI = Succ->begin();
- (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
- Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
if (BB1V == BB2V)
continue;
// Check for passingValueIsAlwaysUndefined here because we would rather
    // eliminate undefined control flow than convert it to a select.
- if (passingValueIsAlwaysUndefined(BB1V, PN) ||
- passingValueIsAlwaysUndefined(BB2V, PN))
+ if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
+ passingValueIsAlwaysUndefined(BB2V, &PN))
return Changed;
if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
@@ -1369,11 +1382,9 @@ HoistTerminator:
// nodes, so we insert select instruction to compute the final result.
std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
for (BasicBlock *Succ : successors(BB1)) {
- PHINode *PN;
- for (BasicBlock::iterator BBI = Succ->begin();
- (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
- Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
if (BB1V == BB2V)
continue;
@@ -1386,9 +1397,9 @@ HoistTerminator:
BB1V->getName() + "." + BB2V->getName(), BI));
// Make the PHI node use the select for all incoming values for BB1/BB2
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
- PN->setIncomingValue(i, SI);
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
+ PN.setIncomingValue(i, SI);
}
}
@@ -1727,7 +1738,8 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
LockstepReverseIterator LRI(UnconditionalPreds);
while (LRI.isValid() &&
canSinkInstructions(*LRI, PHIOperands)) {
- DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
+ << "\n");
InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
++ScanIdx;
--LRI;
@@ -1739,7 +1751,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
for (auto *V : PHIOperands[I])
if (InstructionsToSink.count(V) == 0)
++NumPHIdValues;
- DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
@@ -1767,7 +1779,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
if (!Profitable)
return false;
- DEBUG(dbgs() << "SINK: Splitting edge\n");
+ LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
// Insert a new block postdominating all blocks we're going to sink from.
if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split"))
@@ -1789,16 +1801,17 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
// and never actually sink it which means we produce more PHIs than intended.
// This is unlikely in practice though.
for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
- DEBUG(dbgs() << "SINK: Sink: "
- << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
- << "\n");
+ LLVM_DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
// Because we've sunk every instruction in turn, the current instruction to
// sink is always at index 0.
LRI.reset();
if (!ProfitableToSinkInstruction(LRI)) {
// Too many PHIs would be created.
- DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
break;
}
@@ -1810,7 +1823,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
return Changed;
}
-/// \brief Determine if we can hoist sink a sole store instruction out of a
+/// Determine if we can hoist or sink a sole store instruction out of a
/// conditional block.
///
/// We are looking for code like the following:
@@ -1850,12 +1863,9 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
// Look for a store to the same pointer in BrBB.
unsigned MaxNumInstToLookAt = 9;
- for (Instruction &CurI : reverse(*BrBB)) {
+ for (Instruction &CurI : reverse(BrBB->instructionsWithoutDebug())) {
if (!MaxNumInstToLookAt)
break;
- // Skip debug info.
- if (isa<DbgInfoIntrinsic>(CurI))
- continue;
--MaxNumInstToLookAt;
// Could be calling an instruction that affects memory like free().
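
Here and in several later hunks, explicit isa<DbgInfoIntrinsic> skips give way to BasicBlock::instructionsWithoutDebug(), a filtered instruction range; one practical effect is that debug intrinsics no longer consume scan budgets such as MaxNumInstToLookAt above, so builds with and without debug info see the same limits. A minimal sketch of the pattern:

    #include "llvm/IR/BasicBlock.h"
    using namespace llvm;

    // Debug intrinsics are filtered by the range itself, so no per-iteration
    // isa<> test is needed.
    static unsigned countRealInstructions(BasicBlock &BB) {
      unsigned N = 0;
      for (Instruction &I : BB.instructionsWithoutDebug()) {
        (void)I;
        ++N;
      }
      return N;
    }
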
@@ -1874,7 +1884,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
return nullptr;
}
-/// \brief Speculate a conditional basic block flattening the CFG.
+/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
/// instructions like this is most often not desirable. Instead, there is an MI
@@ -1999,10 +2009,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
// Check that the PHI nodes can be converted to selects.
bool HaveRewritablePHIs = false;
- for (BasicBlock::iterator I = EndBB->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- Value *OrigV = PN->getIncomingValueForBlock(BB);
- Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+ for (PHINode &PN : EndBB->phis()) {
+ Value *OrigV = PN.getIncomingValueForBlock(BB);
+ Value *ThenV = PN.getIncomingValueForBlock(ThenBB);
// FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.
// Skip PHIs which are trivial.
@@ -2010,8 +2019,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
continue;
// Don't convert to selects if we could remove undefined behavior instead.
- if (passingValueIsAlwaysUndefined(OrigV, PN) ||
- passingValueIsAlwaysUndefined(ThenV, PN))
+ if (passingValueIsAlwaysUndefined(OrigV, &PN) ||
+ passingValueIsAlwaysUndefined(ThenV, &PN))
return false;
HaveRewritablePHIs = true;
@@ -2045,7 +2054,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
return false;
// If we get here, we can hoist the instruction and if-convert.
- DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+ LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
// Insert a select of the value of the speculated store.
if (SpeculatedStoreValue) {
@@ -2072,12 +2081,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
// Insert selects and rewrite the PHI operands.
IRBuilder<NoFolder> Builder(BI);
- for (BasicBlock::iterator I = EndBB->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- unsigned OrigI = PN->getBasicBlockIndex(BB);
- unsigned ThenI = PN->getBasicBlockIndex(ThenBB);
- Value *OrigV = PN->getIncomingValue(OrigI);
- Value *ThenV = PN->getIncomingValue(ThenI);
+ for (PHINode &PN : EndBB->phis()) {
+ unsigned OrigI = PN.getBasicBlockIndex(BB);
+ unsigned ThenI = PN.getBasicBlockIndex(ThenBB);
+ Value *OrigV = PN.getIncomingValue(OrigI);
+ Value *ThenV = PN.getIncomingValue(ThenI);
// Skip PHIs which are trivial.
if (OrigV == ThenV)
@@ -2091,8 +2099,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
std::swap(TrueV, FalseV);
Value *V = Builder.CreateSelect(
BrCond, TrueV, FalseV, "spec.select", BI);
- PN->setIncomingValue(OrigI, V);
- PN->setIncomingValue(ThenI, V);
+ PN.setIncomingValue(OrigI, V);
+ PN.setIncomingValue(ThenI, V);
}
// Remove speculated dbg intrinsics.
@@ -2107,19 +2115,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
/// Return true if we can thread a branch across this block.
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
unsigned Size = 0;
- for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
- if (isa<DbgInfoIntrinsic>(BBI))
- continue;
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
if (Size > 10)
return false; // Don't clone large BB's.
++Size;
// We can only support instructions that do not define values that are
// live outside of the current basic block.
- for (User *U : BBI->users()) {
+ for (User *U : I.users()) {
Instruction *UI = cast<Instruction>(U);
if (UI->getParent() != BB || isa<PHINode>(UI))
return false;
@@ -2261,6 +2266,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// dependence information for this check, but simplifycfg can't keep it up
// to date, and this catches most of the cases we care about anyway.
BasicBlock *BB = PN->getParent();
+ const Function *Fn = BB->getParent();
+ if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
+ return false;
+
BasicBlock *IfTrue, *IfFalse;
Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
if (!IfCond ||
@@ -2351,8 +2360,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
}
}
- DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
- << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
+ << " T: " << IfTrue->getName()
+ << " F: " << IfFalse->getName() << "\n");
// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.
@@ -2476,9 +2486,9 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
(void)RI;
- DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
- << "\n " << *BI << "NewRet = " << *RI
- << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
+ LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: "
+ << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
EraseTerminatorInstAndDCECond(BI);
@@ -2487,7 +2497,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
/// Return true if the given instruction is available
/// in its predecessor block. If yes, the instruction will be removed.
-static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
+static bool tryCSEWithPredecessor(Instruction *Inst, BasicBlock *PB) {
if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
return false;
for (Instruction &I : *PB) {
@@ -2544,14 +2554,16 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
if (PBI->isConditional() &&
(BI->getSuccessor(0) == PBI->getSuccessor(0) ||
BI->getSuccessor(0) == PBI->getSuccessor(1))) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ for (auto I = BB->instructionsWithoutDebug().begin(),
+ E = BB->instructionsWithoutDebug().end();
+ I != E;) {
Instruction *Curr = &*I++;
if (isa<CmpInst>(Curr)) {
Cond = Curr;
break;
}
// Quit if we can't remove this instruction.
- if (!checkCSEInPredecessor(Curr, PB))
+ if (!tryCSEWithPredecessor(Curr, PB))
return false;
}
}
@@ -2651,7 +2663,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
continue;
}
- DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+ LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
IRBuilder<> Builder(PBI);
// If we need to invert the condition in the pred block to match, do so now.
@@ -2861,7 +2873,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
if (!AlternativeV)
break;
- assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2);
+ assert(pred_size(Succ) == 2);
auto PredI = pred_begin(Succ);
BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
@@ -2904,14 +2916,13 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
// instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
// thread this store.
unsigned N = 0;
- for (auto &I : *BB) {
+ for (auto &I : BB->instructionsWithoutDebug()) {
// Cheap instructions viable for folding.
if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) ||
isa<StoreInst>(I))
++N;
// Free instructions.
- else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
- IsaBitcastOfPointerType(I))
+ else if (isa<TerminatorInst>(I) || IsaBitcastOfPointerType(I))
continue;
else
return false;
@@ -2966,6 +2977,21 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
if (&*I != PStore && I->mayReadOrWriteMemory())
return false;
+ // If PostBB has more than two predecessors, we need to split it so we can
+ // sink the store.
+ if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
+ // We know that QFB's only successor is PostBB. And QFB has a single
+ // predecessor. If QTB exists, then its only successor is also PostBB.
+ // If QTB does not exist, then QFB's only predecessor has a conditional
+ // branch to QFB and PostBB.
+ BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
+    BasicBlock *NewBB = SplitBlockPredecessors(PostBB, {QFB, TruePred},
+ "condstore.split");
+ if (!NewBB)
+ return false;
+ PostBB = NewBB;
+ }
+
// OK, we're going to sink the stores to PostBB. The store has to be
// conditional though, so first create the predicate.
Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
@@ -3101,7 +3127,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
(QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
return false;
- if (!PostBB->hasNUses(2) || !QBI->getParent()->hasNUses(2))
+ if (!QBI->getParent()->hasNUses(2))
return false;
// OK, this is a sequence of two diamonds or triangles.
@@ -3201,11 +3227,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// If this is a conditional branch in an empty block, and if any
// predecessors are a conditional branch to one of our destinations,
// fold the conditions into logical ops and one cond br.
- BasicBlock::iterator BBI = BB->begin();
+
// Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(BBI))
- ++BBI;
- if (&*BBI != BI)
+ if (&*BB->instructionsWithoutDebug().begin() != BI)
return false;
int PBIOp, BIOp;
@@ -3262,8 +3286,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Finally, if everything is ok, fold the branches to logical ops.
BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
- DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
- << "AND: " << *BI->getParent());
+ LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
// If OtherDest *is* BB, then BB is a basic block with a single conditional
// branch in it, where one edge (OtherDest) goes back to itself but the other
@@ -3281,7 +3305,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
OtherDest = InfLoopBlock;
}
- DEBUG(dbgs() << *PBI->getParent()->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
// BI may have other predecessors. Because of this, we leave
// it alone, but modify PBI.
@@ -3335,17 +3359,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// it. If it has PHIs though, the PHIs may have different
// entries for BB and PBI's BB. If so, insert a select to make
// them agree.
- PHINode *PN;
- for (BasicBlock::iterator II = CommonDest->begin();
- (PN = dyn_cast<PHINode>(II)); ++II) {
- Value *BIV = PN->getIncomingValueForBlock(BB);
- unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
- Value *PBIV = PN->getIncomingValue(PBBIdx);
+ for (PHINode &PN : CommonDest->phis()) {
+ Value *BIV = PN.getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN.getIncomingValue(PBBIdx);
if (BIV != PBIV) {
// Insert a select in PBI to pick the right value.
SelectInst *NV = cast<SelectInst>(
Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
- PN->setIncomingValue(PBBIdx, NV);
+ PN.setIncomingValue(PBBIdx, NV);
// Although the select has the same condition as PBI, the original branch
// weights for PBI do not apply to the new select because the select's
// 'logical' edges are incoming edges of the phi that is eliminated, not
@@ -3367,8 +3389,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
}
}
- DEBUG(dbgs() << "INTO: " << *PBI->getParent());
- DEBUG(dbgs() << *PBI->getParent()->getParent());
+ LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
// This basic block is probably dead. We know it has at least
// one fewer predecessor.
@@ -3668,9 +3690,9 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
BasicBlock *BB = BI->getParent();
- DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
- << " cases into SWITCH. BB is:\n"
- << *BB);
+ LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
// If there are any extra values that couldn't be folded into the switch
// then we evaluate them with an explicit branch first. Split the block
@@ -3693,8 +3715,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// for the edge we just added.
AddPredecessorToBlock(EdgeBB, BB, NewBB);
- DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
- << "\nEXTRABB = " << *BB);
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
+ << "\nEXTRABB = " << *BB);
BB = NewBB;
}
@@ -3725,7 +3747,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// Erase the old branch instruction.
EraseTerminatorInstAndDCECond(BI);
- DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
return true;
}
@@ -3876,6 +3898,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
switch (IntrinsicID) {
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
case Intrinsic::lifetime_end:
break;
default:
@@ -4052,8 +4075,8 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
if (!UncondBranchPreds.empty() && DupRet) {
while (!UncondBranchPreds.empty()) {
BasicBlock *Pred = UncondBranchPreds.pop_back_val();
- DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred);
}
@@ -4377,7 +4400,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
(CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
DeadCases.push_back(Case.getCaseValue());
- DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
+ << " is dead.\n");
}
}
@@ -4393,7 +4417,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
SI->getNumCases() == (1ULL << NumUnknownBits)) {
- DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
BasicBlock *NewDefault =
SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), "");
SI->setDefaultDest(&*NewDefault);
@@ -4451,17 +4475,16 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
BasicBlock *Succ = Branch->getSuccessor(0);
- BasicBlock::iterator I = Succ->begin();
- while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
- int Idx = PHI->getBasicBlockIndex(BB);
+ for (PHINode &PHI : Succ->phis()) {
+ int Idx = PHI.getBasicBlockIndex(BB);
assert(Idx >= 0 && "PHI has no entry for predecessor?");
- Value *InValue = PHI->getIncomingValue(Idx);
+ Value *InValue = PHI.getIncomingValue(Idx);
if (InValue != CaseValue)
continue;
*PhiIndex = Idx;
- return PHI;
+ return &PHI;
}
return nullptr;
@@ -4491,19 +4514,16 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
// -->
// %r = phi i32 ... [ %x, %switchbb ] ...
- for (Instruction &InstInCaseDest : *CaseDest) {
- auto *Phi = dyn_cast<PHINode>(&InstInCaseDest);
- if (!Phi) break;
-
+ for (PHINode &Phi : CaseDest->phis()) {
// This only works if there is exactly 1 incoming edge from the switch to
// a phi. If there is >1, that means multiple cases of the switch map to 1
// value in the phi, and that phi value is not the switch condition. Thus,
// this transform would not make sense (the phi would be invalid because
// a phi can't have different incoming values from the same block).
- int SwitchBBIdx = Phi->getBasicBlockIndex(SwitchBlock);
- if (Phi->getIncomingValue(SwitchBBIdx) == CaseValue &&
- count(Phi->blocks(), SwitchBlock) == 1) {
- Phi->setIncomingValue(SwitchBBIdx, SI->getCondition());
+ int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock);
+ if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue &&
+ count(Phi.blocks(), SwitchBlock) == 1) {
+ Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());
Changed = true;
}
}
@@ -4614,24 +4634,20 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
// which we can constant-propagate the CaseVal, continue to its successor.
SmallDenseMap<Value *, Constant *> ConstantPool;
ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
- for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
- ++I) {
- if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+  for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
+ if (TerminatorInst *T = dyn_cast<TerminatorInst>(&I)) {
// If the terminator is a simple branch, continue to the next block.
if (T->getNumSuccessors() != 1 || T->isExceptional())
return false;
Pred = CaseDest;
CaseDest = T->getSuccessor(0);
- } else if (isa<DbgInfoIntrinsic>(I)) {
- // Skip debug intrinsic.
- continue;
- } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) {
+ } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
// Instruction is side-effect free and constant.
// If the instruction has uses outside this block or a phi node slot for
// the block, it is not safe to bypass the instruction since it would then
// no longer dominate all its uses.
- for (auto &Use : I->uses()) {
+ for (auto &Use : I.uses()) {
User *User = Use.getUser();
if (Instruction *I = dyn_cast<Instruction>(User))
if (I->getParent() == CaseDest)
@@ -4642,7 +4658,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
return false;
}
- ConstantPool.insert(std::make_pair(&*I, C));
+ ConstantPool.insert(std::make_pair(&I, C));
} else {
break;
}
@@ -4656,14 +4672,13 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
return false;
// Get the values for this case from phi nodes in the destination block.
- BasicBlock::iterator I = (*CommonDest)->begin();
- while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
- int Idx = PHI->getBasicBlockIndex(Pred);
+ for (PHINode &PHI : (*CommonDest)->phis()) {
+ int Idx = PHI.getBasicBlockIndex(Pred);
if (Idx == -1)
continue;
Constant *ConstVal =
- LookupConstant(PHI->getIncomingValue(Idx), ConstantPool);
+ LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);
if (!ConstVal)
return false;
@@ -4671,37 +4686,38 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
if (!ValidLookupTableConstant(ConstVal, TTI))
return false;
- Res.push_back(std::make_pair(PHI, ConstVal));
+ Res.push_back(std::make_pair(&PHI, ConstVal));
}
return Res.size() > 0;
}
// Helper function used to add CaseVal to the list of cases that generate
-// Result.
-static void MapCaseToResult(ConstantInt *CaseVal,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *Result) {
+// Result. Returns the updated number of cases that generate this result.
+static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
for (auto &I : UniqueResults) {
if (I.first == Result) {
I.second.push_back(CaseVal);
- return;
+ return I.second.size();
}
}
UniqueResults.push_back(
std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
+ return 1;
}
// Helper function that initializes a map containing
// results for the PHI node of the common destination block for a switch
// instruction. Returns false if multiple PHI nodes have been found or if
// there is not a common destination block for the switch.
-static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
- BasicBlock *&CommonDest,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *&DefaultResult,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
+static bool
+InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult, const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
for (auto &I : SI->cases()) {
ConstantInt *CaseVal = I.getCaseValue();
@@ -4711,10 +4727,21 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
DL, TTI))
return false;
- // Only one value per case is permitted
+ // Only one value per case is permitted.
if (Results.size() > 1)
return false;
- MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Add the case->result mapping to UniqueResults.
+ const uintptr_t NumCasesForResult =
+ MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Early out if there are too many cases for this result.
+ if (NumCasesForResult > MaxCasesPerResult)
+ return false;
+
+ // Early out if there are too many unique results.
+ if (UniqueResults.size() > MaxUniqueResults)
+ return false;
// Check the PHI consistency.
if (!PHI)
@@ -4814,7 +4841,7 @@ static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
SwitchCaseResultVectorTy UniqueResults;
// Collect all the cases that will deliver the same value from the switch.
if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
- DL, TTI))
+ DL, TTI, 2, 1))
return false;
// Selects choose between maximum two values.
if (UniqueResults.size() != 2)
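
The two new arguments make InitializeUniqueCases bail out early once the switch can no longer fold to a single select: with MaxUniqueResults = 2 and MaxCasesPerResult = 1, only switches whose cases feed at most two distinct values into the phi, one case per value, survive. A sketch of the replacement ultimately built for, e.g., one case plus the default (names are illustrative):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    //   switch i32 %x, label %default [ i32 10, label %case ]
    // with the phi receiving %CaseRes from %case and %DefaultRes from
    // %default becomes:
    //   %cmp = icmp eq i32 %x, 10
    //   %r   = select i1 %cmp, i32 %CaseRes, i32 %DefaultRes
    static Value *buildTwoCaseSelect(IRBuilder<> &B, Value *Cond,
                                     ConstantInt *CaseVal, Value *CaseRes,
                                     Value *DefaultRes) {
      Value *Cmp = B.CreateICmpEQ(Cond, CaseVal, "switch.selectcmp");
      return B.CreateSelect(Cmp, CaseRes, DefaultRes, "switch.select");
    }
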
@@ -5392,8 +5419,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
}
bool ReturnedEarly = false;
- for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
- PHINode *PHI = PHIs[I];
+ for (PHINode *PHI : PHIs) {
const ResultListTy &ResultList = ResultLists[PHI];
// If using a bitmask, use any value to fill the lookup table holes.
@@ -5483,7 +5509,7 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
SmallVector<int64_t,4> Values;
for (auto &C : SI->cases())
Values.push_back(C.getCaseValue()->getValue().getSExtValue());
- std::sort(Values.begin(), Values.end());
+ llvm::sort(Values.begin(), Values.end());
// If the switch is already dense, there's nothing useful to do here.
if (isSwitchDense(Values))
@@ -5566,11 +5592,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// If the block only contains the switch, see if we can fold the block
// away into any preds.
- BasicBlock::iterator BBI = BB->begin();
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(BBI))
- ++BBI;
- if (SI == &*BBI)
+ if (SI == &*BB->instructionsWithoutDebug().begin())
if (FoldValueComparisonIntoPredecessors(SI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
}
@@ -5657,7 +5679,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
/// any transform which might inhibit optimization (such as our ability to
/// specialize a particular handler via tail commoning). We do this by not
/// merging any blocks which require us to introduce a phi. Since the same
-/// values are flowing through both blocks, we don't loose any ability to
+/// values are flowing through both blocks, we don't lose any ability to
/// specialize. If anything, we make such specialization more likely.
///
/// TODO - This transformation could remove entries from a phi in the target
@@ -5687,7 +5709,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
// We've found an identical block. Update our predecessors to take that
// path instead and make ourselves dead.
- SmallSet<BasicBlock *, 16> Preds;
+ SmallPtrSet<BasicBlock *, 16> Preds;
Preds.insert(pred_begin(BB), pred_end(BB));
for (BasicBlock *Pred : Preds) {
InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
@@ -5705,7 +5727,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
Inst.eraseFromParent();
}
- SmallSet<BasicBlock *, 16> Succs;
+ SmallPtrSet<BasicBlock *, 16> Succs;
Succs.insert(succ_begin(BB), succ_end(BB));
for (BasicBlock *Succ : Succs) {
Succ->removePredecessor(BB);
@@ -5729,9 +5751,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
// header. (This is for early invocations before loop simplify and
// vectorization to keep canonical loop forms for nested loops. These blocks
// can be eliminated when the pass is invoked later in the back-end.)
+  // Note that if BB has only one predecessor then we do not introduce a new
+  // backedge, so we can eliminate BB.
bool NeedCanonicalLoop =
Options.NeedCanonicalLoop &&
- (LoopHeaders && (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
+ (LoopHeaders && pred_size(BB) > 1 &&
+ (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
!NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB))
@@ -5779,6 +5804,9 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
+ const Function *Fn = BB->getParent();
+ if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
+ return false;
// Conditional branch
if (isValueEqualityComparison(BI)) {
@@ -5791,18 +5819,12 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
- BasicBlock::iterator I = BB->begin();
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
+ auto I = BB->instructionsWithoutDebug().begin();
if (&*I == BI) {
if (FoldValueComparisonIntoPredecessors(BI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
} else if (&*I == cast<Instruction>(BI->getCondition())) {
++I;
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
return simplifyCFG(BB, TTI, Options) | true;
}
@@ -5928,17 +5950,20 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
// Load from null is undefined.
if (LoadInst *LI = dyn_cast<LoadInst>(Use))
if (!LI->isVolatile())
- return LI->getPointerAddressSpace() == 0;
+ return !NullPointerIsDefined(LI->getFunction(),
+ LI->getPointerAddressSpace());
// Store to null is undefined.
if (StoreInst *SI = dyn_cast<StoreInst>(Use))
if (!SI->isVolatile())
- return SI->getPointerAddressSpace() == 0 &&
+ return (!NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace())) &&
SI->getPointerOperand() == I;
// A call to null is undefined.
if (auto CS = CallSite(Use))
- return CS.getCalledValue() == I;
+ return !NullPointerIsDefined(CS->getFunction()) &&
+ CS.getCalledValue() == I;
}
return false;
}
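
The hardcoded address-space-0 tests are gone: passingValueIsAlwaysUndefined now consults NullPointerIsDefined, so functions where null is a legitimate address (for example, targets with an addressable zero page or code built with null-pointer-is-valid semantics) keep their branches. The load case isolated as a sketch:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // A non-volatile load of null only proves unreachability when null is
    // not a defined address for this function and address space.
    static bool loadOfNullIsUB(const LoadInst *LI) {
      return !LI->isVolatile() &&
             !NullPointerIsDefined(LI->getFunction(),
                                   LI->getPointerAddressSpace());
    }
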
@@ -5946,14 +5971,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
/// If BB has an incoming value that will always trigger undefined behavior
/// (eg. null pointer dereference), remove the branch leading here.
static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
- for (BasicBlock::iterator i = BB->begin();
- PHINode *PHI = dyn_cast<PHINode>(i); ++i)
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) {
- TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator();
+ for (PHINode &PHI : BB->phis())
+ for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
+ if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
+ TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator();
IRBuilder<> Builder(T);
if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
- BB->removePredecessor(PHI->getIncomingBlock(i));
+ BB->removePredecessor(PHI.getIncomingBlock(i));
        // Turn unconditional branches into unreachables and remove the dead
// destination from conditional branches.
if (BI->isUnconditional())
@@ -5980,7 +6004,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
// or that just have themself as a predecessor. These are unreachable.
if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
BB->getSinglePredecessor() == BB) {
- DEBUG(dbgs() << "Removing BB: \n" << *BB);
+ LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
DeleteDeadBlock(BB);
return true;
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index ad1faea0a7ae..e381fbc34ab4 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -80,6 +81,7 @@ namespace {
bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
bool eliminateOverflowIntrinsic(CallInst *CI);
+ bool eliminateTrunc(TruncInst *TI);
bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
@@ -147,8 +149,8 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
if (SE->getSCEV(UseInst) != FoldedExpr)
return nullptr;
- DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
- << " -> " << *UseInst << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
UseInst->setOperand(OperIdx, IVSrc);
assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
@@ -221,7 +223,7 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
// for now.
return false;
- DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
ICmp->setPredicate(InvariantPredicate);
ICmp->setOperand(0, NewLHS);
ICmp->setOperand(1, NewRHS);
@@ -252,11 +254,11 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
if (SE->isKnownPredicate(Pred, S, X)) {
ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
DeadInsts.emplace_back(ICmp);
- DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
} else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
DeadInsts.emplace_back(ICmp);
- DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
} else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
// fallthrough to end of function
} else if (ICmpInst::isSigned(OriginalPred) &&
@@ -267,7 +269,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
// we turn the instruction's predicate to its unsigned version. Note that
// we cannot rely on Pred here unless we check if we have swapped it.
assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
- DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
+ << '\n');
ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
} else
return;
@@ -293,7 +296,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
SDiv->getName() + ".udiv", SDiv);
UDiv->setIsExact(SDiv->isExact());
SDiv->replaceAllUsesWith(UDiv);
- DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
++NumSimplifiedSDiv;
Changed = true;
DeadInsts.push_back(SDiv);
@@ -309,7 +312,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
Rem->getName() + ".urem", Rem);
Rem->replaceAllUsesWith(URem);
- DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
++NumSimplifiedSRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -318,7 +321,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
// i % n --> i if i is in [0,n).
void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
Rem->replaceAllUsesWith(Rem->getOperand(0));
- DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
++NumElimRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -332,7 +335,7 @@ void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
SelectInst *Sel =
SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
Rem->replaceAllUsesWith(Sel);
- DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
++NumElimRem;
Changed = true;
DeadInsts.emplace_back(Rem);
@@ -492,6 +495,118 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
return true;
}
+bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
+ // It is always legal to replace
+ // icmp <pred> i32 trunc(iv), n
+ // with
+  //   icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is a signed predicate.
+  // Or with
+  //   icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is an unsigned predicate.
+ // Or with either of these if pred is an equality predicate.
+ //
+ // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
+ // every comparison which uses trunc, it means that we can replace each of
+ // them with comparison of iv against sext/zext(n). We no longer need trunc
+ // after that.
+ //
+ // TODO: Should we do this if we can widen *some* comparisons, but not all
+ // of them? Sometimes it is enough to enable other optimizations, but the
+ // trunc instruction will stay in the loop.
+ Value *IV = TI->getOperand(0);
+ Type *IVTy = IV->getType();
+ const SCEV *IVSCEV = SE->getSCEV(IV);
+ const SCEV *TISCEV = SE->getSCEV(TI);
+
+  // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we
+  // can get rid of the trunc.
+ bool DoesSExtCollapse = false;
+ bool DoesZExtCollapse = false;
+ if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
+ DoesSExtCollapse = true;
+ if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
+ DoesZExtCollapse = true;
+
+  // If neither sext nor zext collapses, it is not profitable to do any
+  // transform. Bail.
+ if (!DoesSExtCollapse && !DoesZExtCollapse)
+ return false;
+
+ // Collect users of the trunc that look like comparisons against invariants.
+ // Bail if we find something different.
+ SmallVector<ICmpInst *, 4> ICmpUsers;
+ for (auto *U : TI->users()) {
+ // We don't care about users in unreachable blocks.
+ if (isa<Instruction>(U) &&
+ !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
+ continue;
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
+ if (ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) {
+ assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
+ // If we cannot get rid of trunc, bail.
+ if (ICI->isSigned() && !DoesSExtCollapse)
+ return false;
+ if (ICI->isUnsigned() && !DoesZExtCollapse)
+ return false;
+ // For equality, either signed or unsigned works.
+ ICmpUsers.push_back(ICI);
+ } else
+ return false;
+ } else
+ return false;
+ }
+
+ auto CanUseZExt = [&](ICmpInst *ICI) {
+ // Unsigned comparison can be widened as unsigned.
+ if (ICI->isUnsigned())
+ return true;
+ // Is it profitable to do zext?
+ if (!DoesZExtCollapse)
+ return false;
+ // For equality, we can safely zext both parts.
+ if (ICI->isEquality())
+ return true;
+    // Otherwise we can only use zext when comparing two non-negative or two
+    // negative values. But in practice, we will never pass the
+    // DoesZExtCollapse check for a negative value, because zext(trunc(x)) is
+    // non-negative. So it only makes sense to check for non-negativity here.
+ const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
+ const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
+ return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
+ };
+ // Replace all comparisons against trunc with comparisons against IV.
+ for (auto *ICI : ICmpUsers) {
+ auto *Op1 = ICI->getOperand(1);
+ Instruction *Ext = nullptr;
+ // For signed/unsigned predicate, replace the old comparison with comparison
+ // of immediate IV against sext/zext of the invariant argument. If we can
+ // use either sext or zext (i.e. we are dealing with equality predicate),
+ // then prefer zext as a more canonical form.
+ // TODO: If we see a signed comparison which can be turned into unsigned,
+ // we can do it here for canonicalization purposes.
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (CanUseZExt(ICI)) {
+ assert(DoesZExtCollapse && "Unprofitable zext?");
+ Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
+ Pred = ICmpInst::getUnsignedPredicate(Pred);
+ } else {
+ assert(DoesSExtCollapse && "Unprofitable sext?");
+ Ext = new SExtInst(Op1, IVTy, "sext", ICI);
+ assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
+ }
+ bool Changed;
+ L->makeLoopInvariant(Ext, Changed);
+ (void)Changed;
+ ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
+ ICI->replaceAllUsesWith(NewICI);
+ DeadInsts.emplace_back(ICI);
+ }
+
+ // Trunc no longer needed.
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ DeadInsts.emplace_back(TI);
+ return true;
+}
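To see why the widened comparison is equivalent when the sign extension collapses, here is a minimal standalone sketch (plain C++, not LLVM code; the IV range [0, 1000) is a hypothetical stand-in for what the pass proves via SCEV):

#include <cassert>
#include <cstdint>

int main() {
  // For an i64 IV whose truncation to i32 loses no signed information
  // (which is what DoesSExtCollapse establishes), the narrow compare
  // against n equals the wide compare against sext(n).
  for (int64_t iv = 0; iv < 1000; ++iv) {
    int32_t n = 500;
    bool narrow = (int32_t)iv < n;   // icmp slt i32 trunc(iv), n
    bool wide = iv < (int64_t)n;     // icmp slt i64 iv, sext(n)
    assert(narrow == wide);
  }
  return 0;
}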
+
/// Eliminate an operation that consumes a simple IV and has no observable
/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
/// but UseInst may not be.
@@ -516,6 +631,10 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
if (eliminateOverflowIntrinsic(CI))
return true;
+ if (auto *TI = dyn_cast<TruncInst>(UseInst))
+ if (eliminateTrunc(TI))
+ return true;
+
if (eliminateIdentitySCEV(UseInst, IVOperand))
return true;
@@ -548,8 +667,8 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
I->replaceAllUsesWith(Invariant);
- DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
- << " with loop invariant: " << *S << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
+ << " with loop invariant: " << *S << '\n');
++NumFoldedUser;
Changed = true;
DeadInsts.emplace_back(I);
@@ -589,7 +708,7 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
return false;
- DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
UseInst->replaceAllUsesWith(IVOperand);
++NumElimIdentity;
@@ -771,6 +890,15 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
SimpleIVUsers.pop_back_val();
Instruction *UseInst = UseOper.first;
+ // If a user of the IndVar is trivially dead, we prefer just to mark it dead
+ // rather than try to do some complex analysis or transformation (such as
+  // widening) based on it.
+ // TODO: Propagate TLI and pass it here to handle more cases.
+ if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
+ DeadInsts.emplace_back(UseInst);
+ continue;
+ }
+
// Bypass back edges to avoid extra work.
if (UseInst == CurrIV) continue;
@@ -783,7 +911,7 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
for (unsigned N = 0; IVOperand; ++N) {
assert(N <= Simplified.size() && "runaway iteration");
- Value *NewOper = foldIVUser(UseOper.first, IVOperand);
+ Value *NewOper = foldIVUser(UseInst, IVOperand);
if (!NewOper)
break; // done folding
IVOperand = dyn_cast<Instruction>(NewOper);
@@ -791,12 +919,12 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
if (!IVOperand)
continue;
- if (eliminateIVUser(UseOper.first, IVOperand)) {
+ if (eliminateIVUser(UseInst, IVOperand)) {
pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
continue;
}
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
if ((isa<OverflowingBinaryOperator>(BO) &&
strengthenOverflowingOperation(BO, IVOperand)) ||
(isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
@@ -806,13 +934,13 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
}
}
- CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
+ CastInst *Cast = dyn_cast<CastInst>(UseInst);
if (V && Cast) {
V->visitCast(Cast);
continue;
}
- if (isSimpleIVUser(UseOper.first, L, SE)) {
- pushIVUsers(UseOper.first, L, Simplified, SimpleIVUsers);
+ if (isSimpleIVUser(UseInst, L, SE)) {
+ pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
}
}
}
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 03a1d55ddc30..8c48597fc2e4 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -7,10 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a utility pass used for testing the InstructionSimplify analysis.
-// The analysis is applied to every instruction, and if it simplifies then the
-// instruction is replaced by the simplification. If you are looking for a pass
-// that performs serious instruction folding, use the instcombine pass instead.
+// This file implements the library calls simplifier. It does not implement
+// any pass, but can be used by other passes to do simplifications.
//
//===----------------------------------------------------------------------===//
@@ -21,7 +19,9 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -33,7 +33,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -104,19 +103,51 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
});
}
-/// \brief Check whether the overloaded unary floating point function
-/// corresponding to \a Ty is available.
-static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn) {
- switch (Ty->getTypeID()) {
- case Type::FloatTyID:
- return TLI->has(FloatFn);
- case Type::DoubleTyID:
- return TLI->has(DoubleFn);
- default:
- return TLI->has(LongDoubleFn);
- }
+static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
+ if (Base < 2 || Base > 36)
+    // Handle the special zero base (strtoll auto-detects the base).
+ if (Base != 0)
+ return nullptr;
+
+ char *End;
+ std::string nptr = Str.str();
+ errno = 0;
+ long long int Result = strtoll(nptr.c_str(), &End, Base);
+ if (errno)
+ return nullptr;
+
+  // If we assume all possible target locales are ASCII supersets, then if
+  // strtoll successfully parses a number on the host, it will also parse the
+  // same way on the target.
+ if (*End != '\0')
+ return nullptr;
+
+ if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
+ return nullptr;
+
+ return ConstantInt::get(CI->getType(), Result);
+}
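A small host-side C++ sketch of the same validation sequence (errno reset, full-string consumption, width check); the 32-bit width here is a hypothetical stand-in for CI->getType():

#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <iostream>

// Returns true and sets Out if Str is a fully-parsed base-10 number that
// fits in 32 bits, mirroring the bail-out conditions above.
static bool parseAsI32(const char *Str, int32_t &Out) {
  char *End;
  errno = 0;
  long long Result = strtoll(Str, &End, 10);
  if (errno || *End != '\0')
    return false;
  if (Result < INT32_MIN || Result > INT32_MAX) // like isIntN(32, Result)
    return false;
  Out = (int32_t)Result;
  return true;
}

int main() {
  int32_t V;
  std::cout << parseAsI32("42", V) << ' ' << V << '\n'; // 1 42
  std::cout << parseAsI32("42abc", V) << '\n';          // 0: trailing junk
  std::cout << parseAsI32("99999999999", V) << '\n';    // 0: overflows i32
}

The atoi/strtol folds added later in this patch rely on exactly this contract.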
+
+static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ CallInst *FOpen = dyn_cast<CallInst>(File);
+ if (!FOpen)
+ return false;
+
+ Function *InnerCallee = FOpen->getCalledFunction();
+ if (!InnerCallee)
+ return false;
+
+ LibFunc Func;
+ if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
+ Func != LibFunc_fopen)
+ return false;
+
+ inferLibFuncAttributes(*CI->getCalledFunction(), *TLI);
+ if (PointerMayBeCaptured(File, true, true))
+ return false;
+
+ return true;
}
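Conceptually, the rewrite to unlocked IO is only sound when the FILE* provably never escapes the function, which is what the fopen-origin plus PointerMayBeCaptured checks establish. A hedged C++ illustration of the intended equivalence (fwrite_unlocked is a POSIX/glibc extension, so its availability is an assumption):

#include <cstdio>

int main() {
  // The FILE* is created and consumed entirely in this function and never
  // captured, so no other thread can hold the stream lock; the unlocked
  // variant is then observably equivalent.
  FILE *F = fopen("/tmp/lsv-demo.txt", "w");
  if (!F)
    return 1;
  const char Buf[] = "hello";
  fwrite_unlocked(Buf, 1, sizeof(Buf) - 1, F); // same effect as fwrite here
  fclose(F);
  return 0;
}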
//===----------------------------------------------------------------------===//
@@ -156,9 +187,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
// We have enough information to now generate the memcpy call to do the
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(CpyDst, Src,
- ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1),
- 1);
+ B.CreateMemCpy(CpyDst, 1, Src, 1,
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
return Dst;
}
@@ -346,8 +376,8 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(Dst, Src,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1);
+ B.CreateMemCpy(Dst, 1, Src, 1,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
return Dst;
}
@@ -371,7 +401,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(Dst, Src, LenV, 1);
+ B.CreateMemCpy(Dst, 1, Src, 1, LenV);
return DstEnd;
}
@@ -388,7 +418,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
--SrcLen;
if (SrcLen == 0) {
- // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+ // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
return Dst;
}
@@ -407,8 +437,8 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Type *PT = Callee->getFunctionType()->getParamType(0);
- // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
- B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1);
+ // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
+ B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len));
return Dst;
}
@@ -508,7 +538,7 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
- Module &M = *CI->getParent()->getParent()->getParent();
+ Module &M = *CI->getModule();
unsigned WCharSize = TLI->getWCharSize(M) * 8;
// We cannot perform this optimization without wchar_size metadata.
if (WCharSize == 0)
@@ -816,40 +846,19 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
- // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
- // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
+ B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
-// TODO: Does this belong in BuildLibCalls or should all of those similar
-// functions be moved here?
-static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilder<> &B, const TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- const DataLayout &DL = M->getDataLayout();
- IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
- PtrType, PtrType);
- CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
-
- if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
const TargetLibraryInfo &TLI) {
@@ -901,12 +910,19 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
return Calloc;
- // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
return CI->getArgOperand(0);
}
+Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) {
+ if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
+ return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
+
+ return nullptr;
+}
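The realloc fold relies on the C-standard guarantee that realloc with a null pointer behaves exactly like malloc; a trivial standalone check:

#include <cstdlib>
#include <cstring>

int main() {
  // Per C11 7.22.3.5, realloc(NULL, n) is equivalent to malloc(n).
  char *P = (char *)realloc(nullptr, 16);
  if (!P)
    return 1;
  memcpy(P, "same as malloc", 15);
  free(P);
  return 0;
}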
+
//===----------------------------------------------------------------------===//
// Math Library Optimizations
//===----------------------------------------------------------------------===//
@@ -1666,12 +1682,12 @@ Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
- // abs(x) -> x >s -1 ? x : -x
- Value *Op = CI->getArgOperand(0);
- Value *Pos =
- B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos");
- Value *Neg = B.CreateNeg(Op, "neg");
- return B.CreateSelect(Pos, Op, Neg);
+ // abs(x) -> x <s 0 ? -x : x
+ // The negation has 'nsw' because abs of INT_MIN is undefined.
+ Value *X = CI->getArgOperand(0);
+ Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
+ Value *NegX = B.CreateNSWNeg(X, "neg");
+ return B.CreateSelect(IsNeg, NegX, X);
}
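A quick standalone check that the new canonical form x <s 0 ? -x : x matches abs for every value where abs is defined (INT_MIN is excluded, which is why the negation can carry the nsw flag):

#include <cassert>
#include <cstdlib>

int main() {
  for (int x = -1000; x <= 1000; ++x) {
    int sel = (x < 0) ? -x : x; // x <s 0 ? -x : x
    assert(sel == abs(x));
  }
  // abs(INT_MIN) is undefined behavior in C, mirroring the nsw annotation.
  return 0;
}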
Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
@@ -1695,6 +1711,29 @@ Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
ConstantInt::get(CI->getType(), 0x7F));
}
+Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ return convertStrToNumber(CI, Str, 10);
+}
+
+Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
+ return nullptr;
+
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
+ return convertStrToNumber(CI, Str, CInt->getSExtValue());
+ }
+
+ return nullptr;
+}
+
//===----------------------------------------------------------------------===//
// Formatting and IO Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -1826,15 +1865,13 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
if (CI->getNumArgOperands() == 2) {
// Make sure there's no % in the constant array. We could try to handle
// %% -> % in the future if we cared.
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%')
- return nullptr; // we found a format specifier, bail out.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
- // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1),
- 1); // Copy the null byte.
+ FormatStr.size() + 1)); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
}
@@ -1868,7 +1905,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Value *IncLen =
B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen);
// The sprintf result is the unincremented number of bytes in the string.
return B.CreateIntCast(Len, CI->getType(), false);
@@ -1897,6 +1934,93 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
+Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
+ return nullptr;
+
+ // Check for size
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!Size)
+ return nullptr;
+
+ uint64_t N = Size->getZExtValue();
+
+  // If we just have a format string (nothing else crazy), transform it.
+ if (CI->getNumArgOperands() == 3) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ else if (N < FormatStr.size() + 1)
+ return nullptr;
+
+    // snprintf(str, size, fmt) -> llvm.memcpy(align 1 str, align 1 fmt,
+ // strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), 1, CI->getArgOperand(2), 1,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
+ CI->getNumArgOperands() == 4) {
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), 1);
+ else if (N == 1)
+ return nullptr;
+
+ // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(3)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+      // snprintf(dest, size, "%s", str) -> llvm.memcpy(align 1 dest, align 1 str, len+1)
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(3), Str))
+ return nullptr;
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), Str.size());
+ else if (N < Str.size() + 1)
+ return nullptr;
+
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1,
+ ConstantInt::get(CI->getType(), Str.size() + 1));
+
+ // The snprintf result is the unincremented number of bytes in the string.
+ return ConstantInt::get(CI->getType(), Str.size());
+ }
+ }
+ return nullptr;
+}
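The three snprintf cases above follow the C semantics that the return value is the would-be string length and that N == 0 writes nothing; a small runnable demonstration:

#include <cstdio>

int main() {
  char Buf[16];

  // N == 0: nothing is written, return value is strlen(fmt).
  printf("%d\n", snprintf(Buf, 0, "hello")); // 5

  // Plain format string, N large enough: memcpy of fmt plus the nul byte.
  printf("%d %s\n", snprintf(Buf, sizeof(Buf), "hello"), Buf); // 5 hello

  // "%c": store the char and a trailing nul, return 1.
  printf("%d %s\n", snprintf(Buf, sizeof(Buf), "%c", 'x'), Buf); // 1 x

  // "%s" with a constant string: memcpy of str plus the nul byte.
  printf("%d %s\n", snprintf(Buf, sizeof(Buf), "%s", "hi"), Buf); // 2 hi
  return 0;
}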
+
+Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) {
+ if (Value *V = optimizeSnPrintFString(CI, B)) {
+ return V;
+ }
+
+ return nullptr;
+}
+
Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
optimizeErrorReporting(CI, B, 0);
@@ -1913,9 +2037,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
// fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
if (CI->getNumArgOperands() == 2) {
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
- return nullptr; // We found a format specifier.
+ // Could handle %% -> % if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // We found a format specifier.
return emitFWrite(
CI->getArgOperand(1),
@@ -1973,22 +2097,27 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
// Get the element size and count.
ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!SizeC || !CountC)
- return nullptr;
- uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
-
- // If this is writing zero records, remove the call (it's a noop).
- if (Bytes == 0)
- return ConstantInt::get(CI->getType(), 0);
-
- // If this is writing one byte, turn it into fputc.
- // This optimisation is only valid, if the return value is unused.
- if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ if (SizeC && CountC) {
+ uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ }
}
+ if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
+ return emitFWriteUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
+ TLI);
+
return nullptr;
}
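The constant-size reasoning above reduces to two C facts: writing zero records is a no-op that returns 0, and writing one byte with an ignored result is observably fputc. A short check of both:

#include <cstdio>

int main() {
  FILE *F = fopen("/tmp/fwrite-demo.txt", "w");
  if (!F)
    return 1;
  const char S[] = "a";

  // fwrite(S, 1, 0, F): zero records, returns 0 and writes nothing.
  printf("%zu\n", fwrite(S, 1, 0, F));

  // fwrite(S, 1, 1, F) with the result unused is equivalent to fputc(S[0], F).
  fputc(S[0], F);

  fclose(F);
  return 0;
}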
@@ -1997,12 +2126,18 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
// Don't rewrite fputs to fwrite when optimising for size because fwrite
// requires more arguments and thus extra MOVs are required.
- if (CI->getParent()->getParent()->optForSize())
+ if (CI->getFunction()->optForSize())
return nullptr;
- // We can't optimize if return value is used.
- if (!CI->use_empty())
- return nullptr;
+  // Check if the return value has any uses.
+ if (!CI->use_empty()) {
+ if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
+ return emitFPutSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI);
+ else
+ // We can't optimize if return value is used.
+ return nullptr;
+ }
// fputs(s,F) --> fwrite(s,1,strlen(s),F)
uint64_t Len = GetStringLength(CI->getArgOperand(0));
@@ -2016,6 +2151,40 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
CI->getArgOperand(1), B, DL, TLI);
}
+Value *LibCallSimplifier::optimizeFPutc(CallInst *CI, IRBuilder<> &B) {
+ optimizeErrorReporting(CI, B, 1);
+
+ if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
+ return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFGetc(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(0), CI, B, TLI))
+ return emitFGetCUnlocked(CI->getArgOperand(0), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFGets(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(2), CI, B, TLI))
+ return emitFGetSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) {
+ if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
+ return emitFReadUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
+ TLI);
+
+ return nullptr;
+}
+
Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
// Check for a constant string.
StringRef Str;
@@ -2099,6 +2268,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
return optimizeMemMove(CI, Builder);
case LibFunc_memset:
return optimizeMemSet(CI, Builder);
+ case LibFunc_realloc:
+ return optimizeRealloc(CI, Builder);
case LibFunc_wcslen:
return optimizeWcslen(CI, Builder);
default:
@@ -2290,16 +2461,33 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizeIsAscii(CI, Builder);
case LibFunc_toascii:
return optimizeToAscii(CI, Builder);
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ return optimizeAtoi(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtoll:
+ return optimizeStrtol(CI, Builder);
case LibFunc_printf:
return optimizePrintF(CI, Builder);
case LibFunc_sprintf:
return optimizeSPrintF(CI, Builder);
+ case LibFunc_snprintf:
+ return optimizeSnPrintF(CI, Builder);
case LibFunc_fprintf:
return optimizeFPrintF(CI, Builder);
case LibFunc_fwrite:
return optimizeFWrite(CI, Builder);
+ case LibFunc_fread:
+ return optimizeFRead(CI, Builder);
case LibFunc_fputs:
return optimizeFPuts(CI, Builder);
+ case LibFunc_fgets:
+ return optimizeFGets(CI, Builder);
+ case LibFunc_fputc:
+ return optimizeFPutc(CI, Builder);
+ case LibFunc_fgetc:
+ return optimizeFGetc(CI, Builder);
case LibFunc_puts:
return optimizePuts(CI, Builder);
case LibFunc_perror:
@@ -2307,8 +2495,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
case LibFunc_vfprintf:
case LibFunc_fiprintf:
return optimizeErrorReporting(CI, Builder, 0);
- case LibFunc_fputc:
- return optimizeErrorReporting(CI, Builder, 1);
default:
return nullptr;
}
@@ -2393,8 +2579,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
IRBuilder<> &B) {
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
return nullptr;
@@ -2403,8 +2589,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
IRBuilder<> &B) {
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
+ B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,
+ CI->getArgOperand(2));
return CI->getArgOperand(0);
}
return nullptr;
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index 968eb0208f43..f8d758c54983 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -101,7 +101,8 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
  // At this point, the module should have the proper mix of globals and locals.
// As we attempt to partition this module, we must not change any
// locals to globals.
- DEBUG(dbgs() << "Partition module with (" << M->size() << ")functions\n");
+ LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
+ << ")functions\n");
ClusterMapType GVtoClusterMap;
ComdatMembersType ComdatMembers;
@@ -180,28 +181,31 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
GVtoClusterMap.member_end()), I));
- std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) {
- if (a.first == b.first)
- return a.second->getData()->getName() > b.second->getData()->getName();
- else
- return a.first > b.first;
- });
+ llvm::sort(Sets.begin(), Sets.end(),
+ [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() >
+ b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
for (auto &I : Sets) {
unsigned CurrentClusterID = BalancinQueue.top().first;
unsigned CurrentClusterSize = BalancinQueue.top().second;
BalancinQueue.pop();
- DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first
- << ") ----> " << I.second->getData()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
+ << I.first << ") ----> " << I.second->getData()->getName()
+ << "\n");
for (ClusterMapType::member_iterator MI =
GVtoClusterMap.findLeader(I.second);
MI != GVtoClusterMap.member_end(); ++MI) {
if (!Visited.insert(*MI).second)
continue;
- DEBUG(dbgs() << "----> " << (*MI)->getName()
- << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
Visited.insert(*MI);
ClusterIDMap[*MI] = CurrentClusterID;
CurrentClusterSize++;
@@ -270,7 +274,7 @@ void llvm::SplitModule(
for (unsigned I = 0; I < N; ++I) {
ValueToValueMapTy VMap;
std::unique_ptr<Module> MPart(
- CloneModule(M.get(), VMap, [&](const GlobalValue *GV) {
+ CloneModule(*M, VMap, [&](const GlobalValue *GV) {
if (ClusterIDMap.count(GV))
return (ClusterIDMap[GV] == I);
else
diff --git a/lib/Transforms/Utils/StripGCRelocates.cpp b/lib/Transforms/Utils/StripGCRelocates.cpp
index 49dc15cf5e7c..ac0b519f4a77 100644
--- a/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -21,7 +21,6 @@
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -75,6 +74,3 @@ bool StripGCRelocates::runOnFunction(Function &F) {
INITIALIZE_PASS(StripGCRelocates, "strip-gc-relocates",
"Strip gc.relocates inserted through RewriteStatepointsForGC",
true, false)
-FunctionPass *llvm::createStripGCRelocatesPass() {
- return new StripGCRelocates();
-}
diff --git a/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
index cd0378e0140c..8956a089a99c 100644
--- a/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
+++ b/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -9,7 +9,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
namespace {
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index ed444e4cf43c..e633ac0c874d 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -19,7 +19,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
using namespace llvm;
char UnifyFunctionExitNodes::ID = 0;
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index f6c7d1c4989e..afd842f59911 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -12,7 +12,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils.h"
#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Utils.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"
@@ -33,7 +36,6 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializePromoteLegacyPassPass(Registry);
initializeStripNonLineTableDebugInfoPass(Registry);
initializeUnifyFunctionExitNodesPass(Registry);
- initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
initializeStripGCRelocatesPass(Registry);
initializePredicateInfoPrinterLegacyPassPass(Registry);
@@ -43,3 +45,12 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
initializeTransformUtils(*unwrap(R));
}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index c3feea6a0a41..948d9bd5baad 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -20,8 +20,14 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
return false;
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredVal->getType());
+
+ // The store size must be byte-aligned to support future type casts.
+ if (llvm::alignTo(StoreSize, 8) != StoreSize)
+ return false;
+
// The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy))
return false;
// Don't coerce non-integral pointers to integers or vice versa.
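The new guard rejects stores whose bit width is not a whole number of bytes (e.g. i1 or i17), since the later type casts assume byte granularity; the alignTo comparison is just a divisibility test, as this tiny sketch shows:

#include <cstdint>
#include <iostream>

// Mirrors llvm::alignTo(StoreSize, 8) != StoreSize: true iff the width is
// not byte-sized.
static bool isNotByteSized(uint64_t Bits) { return Bits % 8 != 0; }

int main() {
  std::cout << isNotByteSized(1) << '\n';  // 1: an i1 store is rejected
  std::cout << isNotByteSized(17) << '\n'; // 1: an i17 store is rejected
  std::cout << isNotByteSized(32) << '\n'; // 0: an i32 store is fine
}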
@@ -389,8 +395,8 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
NewLoad->takeName(SrcVal);
NewLoad->setAlignment(SrcVal->getAlignment());
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+ LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
// Replace uses of the original load with the wider load. On a big endian
// system, we need to shift down to get the relevant bits.
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 8c9ecbc3503e..55fff3f3872a 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
@@ -536,13 +537,23 @@ Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
return None;
}
+static Metadata *cloneOrBuildODR(const MDNode &N) {
+ auto *CT = dyn_cast<DICompositeType>(&N);
+ // If ODR type uniquing is enabled, we would have uniqued composite types
+ // with identifiers during bitcode reading, so we can just use CT.
+ if (CT && CT->getContext().isODRUniquingDebugTypes() &&
+ CT->getIdentifier() != "")
+ return const_cast<DICompositeType *>(CT);
+ return MDNode::replaceWithDistinct(N.clone());
+}
+
MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
assert(N.isDistinct() && "Expected a distinct node");
assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
- DistinctWorklist.push_back(cast<MDNode>(
- (M.Flags & RF_MoveDistinctMDs)
- ? M.mapToSelf(&N)
- : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone()))));
+ DistinctWorklist.push_back(
+ cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
+ ? M.mapToSelf(&N)
+ : M.mapToMetadata(&N, cloneOrBuildODR(N))));
return DistinctWorklist.back();
}
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 7622ed6d194f..27a4d241b320 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,13 @@
add_llvm_library(LLVMVectorize
LoadStoreVectorizer.cpp
+ LoopVectorizationLegality.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
Vectorize.cpp
VPlan.cpp
+ VPlanHCFGBuilder.cpp
+ VPlanHCFGTransforms.cpp
+ VPlanVerifier.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index dc83b6d4d292..5f3d127202ad 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -6,6 +6,38 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -21,6 +53,7 @@
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
@@ -45,7 +78,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
@@ -65,8 +97,16 @@ static const unsigned StackAdjustedAlignment = 4;
namespace {
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
+/// together and reducing the number of instructions the main search operates
+/// on at a time; that is, it exists purely to reduce compile time, as the
+/// main search has O(n^2) time complexity. The underlying type of ChainID
+/// should not be relied upon.
+using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<Value *, InstrList>;
+using InstrListMap = MapVector<ChainID, InstrList>;
class Vectorizer {
Function &F;
@@ -86,10 +126,6 @@ public:
bool run();
private:
- Value *getPointerOperand(Value *I) const;
-
- GetElementPtrInst *getSourceGEP(Value *Src) const;
-
unsigned getPointerAddressSpace(Value *I);
unsigned getAlignment(LoadInst *LI) const {
@@ -108,7 +144,15 @@ private:
return DL.getABITypeAlignment(SI->getValueOperand()->getType());
}
+ static const unsigned MaxDepth = 3;
+
bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
/// After vectorization, reorder the instructions that I depends on
/// (the instructions defining its operands), to ensure they dominate I.
@@ -239,14 +283,6 @@ bool Vectorizer::run() {
return Changed;
}
-Value *Vectorizer::getPointerOperand(Value *I) const {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
@@ -255,23 +291,10 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
return -1;
}
-GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
- // First strip pointer bitcasts. Make sure pointee size is the same with
- // and without casts.
- // TODO: a stride set by the add instruction below can match the difference
- // in pointee type size here. Currently it will not be vectorized.
- Value *SrcPtr = getPointerOperand(Src);
- Value *SrcBase = SrcPtr->stripPointerCasts();
- if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
- DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
- SrcPtr = SrcBase;
- return dyn_cast<GetElementPtrInst>(SrcPtr);
-}
-
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getPointerAddressSpace(A);
unsigned ASB = getPointerAddressSpace(B);
@@ -280,18 +303,27 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
return false;
// Make sure that A and B are different pointers of the same size type.
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *PtrATy = PtrA->getType()->getPointerElementType();
Type *PtrBTy = PtrB->getType()->getPointerElementType();
if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
DL.getTypeStoreSize(PtrBTy->getScalarType()))
return false;
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
@@ -300,11 +332,11 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
// Check if they are based on the same pointer. That makes the offsets
// sufficient.
if (PtrA == PtrB)
- return OffsetDelta == Size;
+ return OffsetDelta == PtrDelta;
// Compute the necessary base pointer delta to have the necessary final delta
- // equal to the size.
- APInt BaseDelta = Size - OffsetDelta;
+ // equal to the pointer delta requested.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
// Compute the distance with SCEV between the base pointers.
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
@@ -314,71 +346,127 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (X == PtrSCEVB)
return true;
+ // The above check will not catch the cases where one of the pointers is
+ // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Get the minus SCEV; that will allow re-combining the
+  // expressions and getting the simplified difference.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
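Stripped of the SCEV machinery, the core test is arithmetic on decomposed pointers: if PtrA = Base + OffsetA and PtrB = Base + OffsetB, the two are PtrDelta apart exactly when OffsetB - OffsetA == PtrDelta. A hedged sketch of that arithmetic (the decomposition itself is what stripAndAccumulateInBoundsConstantOffsets does above):

#include <cstdint>
#include <iostream>

// Hypothetical decomposed pointers: a common base plus constant offsets.
static bool areConsecutive(int64_t OffsetA, int64_t OffsetB, int64_t PtrDelta) {
  return OffsetB - OffsetA == PtrDelta;
}

int main() {
  // Two i32 accesses at base+0 and base+4 are consecutive (PtrDelta == 4).
  std::cout << areConsecutive(0, 4, 4) << '\n'; // 1
  // base+0 and base+8 are not adjacent i32 accesses.
  std::cout << areConsecutive(0, 8, 4) << '\n'; // 0
}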
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
// Look through GEPs after checking they're the same except for the last
// index.
- GetElementPtrInst *GEPA = getSourceGEP(A);
- GetElementPtrInst *GEPB = getSourceGEP(B);
- if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
return false;
- unsigned FinalIndex = GEPA->getNumOperands() - 1;
- for (unsigned i = 0; i < FinalIndex; i++)
- if (GEPA->getOperand(i) != GEPB->getOperand(i))
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
return false;
+ ++GTIA;
+ ++GTIB;
+ }
- Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
- Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
return false;
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
return false;
bool Signed = isa<SExtInst>(OpA);
- OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+  // At this point A could be a function parameter, i.e. not an instruction.
+ Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
- if (!OpA || !OpB || OpA->getType() != OpB->getType())
+ if (!OpB || ValA->getType() != OpB->getType())
return false;
- // Now we need to prove that adding 1 to OpA won't overflow.
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
- // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
- // we're okay.
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
- cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
if (Signed)
Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
else
Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
}
- unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Second attempt:
- // If any bits are known to be zero other than the sign bit in OpA, we can
- // add 1 to it while guaranteeing no overflow of any sort.
+ // If all set bits of IdxDiff or any higher order bit other than the sign bit
+ // are known to be zero in ValA, we can add Diff to it while guaranteeing no
+ // overflow of any sort.
if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.countMaxTrailingOnes() < (BitWidth - 1))
- Safe = true;
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
}
- if (!Safe)
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
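The known-bits fallback says: if every value IdxDiff can take fits inside the bits known to be zero in ValA (sign bit excluded in the signed case), then ValA + IdxDiff cannot wrap. An 8-bit illustration of that reasoning (hypothetical masks, not the LLVM KnownBits API):

#include <cassert>
#include <cstdint>

int main() {
  // Suppose analysis proved the top four bits of an i8 ValA are zero.
  // Interpreted as a value, the known-zero mask is 0xF0, and the check
  // !BitsAllowedToBeSet.ult(IdxDiff) accepts any IdxDiff <= 0xF0.
  const uint8_t KnownZero = 0xF0;
  const uint8_t MaxValA = (uint8_t)~KnownZero; // largest possible ValA: 0x0F
  const unsigned IdxDiff = 0xF0;               // largest accepted delta
  assert(IdxDiff <= KnownZero);
  // Since ~Z + Z == 2^n - 1 for any mask Z, ValA + IdxDiff stays in 8 bits.
  for (unsigned V = 0; V <= MaxValA; ++V)
    assert(V + IdxDiff <= 0xFFu); // no unsigned wraparound
  return 0;
}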
+
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
return false;
- const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
- const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
- return X2 == OffsetSCEVB;
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
}
void Vectorizer::reorder(Instruction *I) {
@@ -448,7 +536,7 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
for (Instruction *I : Chain) {
- Value *PtrOperand = getPointerOperand(I);
+ Value *PtrOperand = getLoadStorePointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
@@ -484,7 +572,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> ChainInstrs;
bool IsLoadChain = isa<LoadInst>(Chain[0]);
- DEBUG({
+ LLVM_DEBUG({
for (Instruction *I : Chain) {
if (IsLoadChain)
assert(isa<LoadInst>(I) &&
@@ -506,11 +594,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
Intrinsic::sideeffect) {
// Ignore llvm.sideeffect calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
break;
} else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
break;
}
}
@@ -536,32 +625,40 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
break;
- if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
continue;
+    // We can ignore the alias if we have a load/store pair and the load is
+    // known to be invariant: the load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->getMetadata(LLVMContext::MD_invariant_load);
+ };
+
// We can ignore the alias as long as the load comes before the store,
// because that means we won't be moving the load past the store to
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
- if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
- OBB.dominates(ChainInstr, MemInstr))
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
continue;
// Same case, but in reverse.
- if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
- OBB.dominates(MemInstr, ChainInstr))
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
MemoryLocation::get(ChainInstr))) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
" Aliasing instruction and pointer:\n"
<< " " << *MemInstr << '\n'
- << " " << *getPointerOperand(MemInstr) << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
- << " " << *getPointerOperand(ChainInstr) << '\n';
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
});
// Save this aliasing memory instruction as a barrier, but allow other
// instructions that precede the barrier to be vectorized with this one.
@@ -594,6 +691,20 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
return Chain.slice(0, ChainIdx);
}
+static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
+ const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The selects themselves are distinct instructions even if they share the
+    // same condition and evaluate to consecutive pointers for the true and
+    // false values of the condition. Therefore using the selects themselves
+    // for grouping instructions would put consecutive accesses into different
+    // lists; they would never even be checked for being consecutive, and
+    // would not be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
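+
+// Illustrative example (hypothetical IR, not from this patch): two selects
+// that share a condition and evaluate to consecutive pointers,
+//   %pA = select i1 %c, float* %a0, float* %b0
+//   %pB = select i1 %c, float* %a1, float* %b1
+// where %a1 is one element past %a0 and %b1 one past %b0. Keying the chain on
+// %c places the accesses through %pA and %pB in the same list, so
+// areConsecutivePointers can match them and the pair becomes vectorizable.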
+
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
InstrListMap LoadRefs;
@@ -632,8 +743,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
// Make sure all the users of a vector are constant-index extracts.
@@ -644,8 +759,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save the load locations.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- LoadRefs[ObjPtr].push_back(LI);
+ const ChainID ID = getChainID(Ptr, DL);
+ LoadRefs[ID].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
@@ -675,8 +790,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
@@ -686,8 +805,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save store location.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- StoreRefs[ObjPtr].push_back(SI);
+ const ChainID ID = getChainID(Ptr, DL);
+ StoreRefs[ID].push_back(SI);
}
}
@@ -697,12 +816,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
- for (const std::pair<Value *, InstrList> &Chain : Map) {
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
- DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
// Process the stores in chunks of 64.
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
@@ -716,7 +835,8 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) {
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
@@ -852,14 +972,14 @@ bool Vectorizer::vectorizeStoreChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeStoreChain(Chain.slice(0, TargetVF),
InstructionsProcessed) |
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
for (Instruction *I : Chain)
dbgs() << " " << *I << "\n";
@@ -1000,8 +1120,8 @@ bool Vectorizer::vectorizeLoadChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
}
@@ -1024,7 +1144,7 @@ bool Vectorizer::vectorizeLoadChain(
Alignment = NewAlign;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
I->dump();
@@ -1107,7 +1227,7 @@ bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
SzInBytes * 8, AddressSpace,
Alignment, &Fast);
- DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " and fast? " << Fast << "\n";);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
return !Allows || !Fast;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
new file mode 100644
index 000000000000..697bc1b448d7
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -0,0 +1,1072 @@
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
+ StringRef RemarkName,
+ Loop *TheLoop,
+ Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+    // If the instruction has a debug location, use it; otherwise fall back to
+    // the loop's start location.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
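+
+// Callers stream the specific reason onto the returned remark, so an emitted
+// message reads, for example, "loop not vectorized: loop control flow is not
+// understood by vectorizer" (see the createMissedAnalysis uses below).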
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
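+
+// For example, a "vectorize.width" value of 4 validates (a power of two no
+// larger than VectorizerParams::MaxVectorWidth), while 3 fails isPowerOf2_32
+// and is ignored by setHint below with an "ignoring invalid hint" message.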
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+ LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
+bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L,
+ bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+    // FIXME: Add interleave.disable metadata. This will allow
+    // vectorize.disable to be used without disabling the pass, and would make
+    // it possible to differentiate between disabled vectorization and a width
+    // of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+    // The expected hint is either an MDString or an MDNode whose first
+    // operand is an MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
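+
+// Illustrative loop metadata this walk consumes (assuming the conventional
+// "llvm.loop." value of Prefix()):
+//   br i1 %cond, label %header, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
+// Operand 0 is the self-reference; the walk finds !1, extracts its MDString
+// and single argument, and setHint records Width = 4.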
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name,
+ unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node,
+ ArrayRef<Hint> HintTypes) {
+ MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+}
+
+void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.empty())
+ return;
+
+  // Reserve the first element for the LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+      // If the node is in the update list, ignore its old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+}
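+
+// Sketch of the effect with hypothetical values: writing {IsVectorized = 1}
+// to a loop carrying !0 = distinct !{!0, !1} produces
+//   !0 = distinct !{!0, !1, !2}
+//   !2 = !{!"llvm.loop.isvectorized", i32 1}
+// with any pre-existing "llvm.loop.isvectorized" node filtered out above by
+// matchesHintMetadataName.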
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated
+// documentation together so that we can easily have a picture of the
+// currently supported loop nests. However, some of these checks don't depend
+// on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
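+
+// Source-level sketch of a nest accepted by these checks (illustrative):
+//   for (int i = 0; i < n; ++i)     // OuterLp, explicit vectorization target
+//     for (int j = 0; j < m; ++j)   // Lp: canonical IV j starting at 0,
+//       a[i][j] += b[i][j];         //     latch compares j's update against
+//                                   //     the OuterLp-invariant bound m
+// Every vector lane of the outer loop executes the same inner iterations, so
+// Lp is uniform with regard to OuterLp.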
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
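+
+// Example of a PHI this rejects (hypothetical IR): an incoming value that is
+// a constant expression which may trap,
+//   %p = phi i32 [ sdiv (i32 1, i32 0), %then ], [ %x, %else ]
+// Turning such a PHI into a select would unconditionally evaluate the
+// trapping operand, so the block is not if-convertible.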
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by widening the type.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reduction and Induction instructions are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
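+
+// For example (illustrative): within a loop over i, a pointer &A[i] yields 1,
+// &A[n-i] yields -1, and a non-unit stride such as &A[2*i] is reported as 0
+// (not consecutive).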
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ if (Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer loops scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+  // here for ignoring, but it suffices to record only the first (as it is the
+  // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "loop control flow is not understood by vectorizer");
+          LLVM_DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
+ continue;
+ ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "control flow not understood by vectorizer");
+ LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+        // As a last resort, coerce the PHI to an AddRec expression and re-try
+        // classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
+ LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+ << "call instruction cannot be vectorized");
+ LLVM_DEBUG(
+ dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
+ return false;
+ }
+
+      // Intrinsics such as powi, cttz, and ctlz are legal to vectorize if
+      // their second argument is loop invariant (the same on every
+      // iteration).
+ if (CI && hasVectorInstrinsicScalarOpd(
+ getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+ << "intrinsic instruction cannot be vectorized");
+ LLVM_DEBUG(dbgs()
+ << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
+ << "instruction return type cannot be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
+ << "store instruction cannot be vectorized");
+ return false;
+ }
+
+      // FP instructions can allow unsafe algebra, and are thus vectorizable
+      // by non-IEEE-754-compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
+ << "value cannot be used outside the loop");
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ if (Inductions.empty()) {
+ ORE->emit(createMissedAnalysis("NoInductionVariable")
+ << "loop induction variable could not be identified");
+ return false;
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasStoreToLoopInvariantAddress()) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
+ << "write to a loop invariant address could not be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ // Otherwise, record that the load needs (real or emulated) masking
+ // and let the cost model decide.
+ if (!IsAnnotatedParallel)
+ MaskedOp.insert(LI);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so,
+ // be aware on the race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
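+
+// Sketch of emulation option 2 above for a predicated store (illustrative):
+//   %old   = load <4 x i32>, <4 x i32>* %p
+//   %blend = select <4 x i1> %mask, <4 x i32> %new, <4 x i32> %old
+//   store <4 x i32> %blend, <4 x i32>* %p
+// This is only sound when %p is dereferenceable and no other thread can write
+// the masked-off lanes concurrently (the race-condition caveat above).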
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ ORE->emit(createMissedAnalysis("IfConversionDisabled")
+ << "if-conversion is disabled");
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+  // The set of pointers that we can safely read from and write to.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (blockNeedsPredication(BB))
+ continue;
+
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+        SafePointers.insert(Ptr);
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
+ << "loop contains a switch statement");
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointers)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
+ assert((UseVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!Lp->getExitingBlock()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+  // We only handle bottom-tested loops, i.e., loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
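+
+// Example of a shape the last check rejects (illustrative): a top-tested loop
+// whose header exits while a separate latch closes the backedge,
+//   header: br i1 %done, label %exit, label %body
+//   body:   ...
+//   latch:  br label %header
+// Here getExitingBlock() is the header rather than the latch, so not all
+// instructions in the loop execute the same number of times.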
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
+
+} // namespace llvm
diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
new file mode 100644
index 000000000000..2aa219064299
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -0,0 +1,282 @@
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides the LoopVectorizationPlanner class, which drives the
+/// vectorization process after Legality checks have passed.
+/// The planner builds and optimizes the Vectorization Plans, which record
+/// the decisions on how to vectorize the given loop. In particular, they
+/// represent the control flow of the vectorized version, the replication of
+/// instructions that are to be scalarized, and the interleaved access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+private:
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
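+
+// Minimal usage sketch (hypothetical values): emit !(A && B) at the end of a
+// VPBasicBlock VPBB, restoring any previous insertion point afterwards.
+//   {
+//     VPBuilder::InsertPointGuard Guard(Builder);
+//     Builder.setInsertPoint(VPBB);
+//     VPValue *AndAB = Builder.createAnd(A, B);
+//     VPValue *NotAB = Builder.createNot(AndAB);
+//   }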
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
+ unsigned Width;
+ // Cost of the loop with that width
+ unsigned Cost;
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ using VPlanPtr = std::unique_ptr<VPlan>;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
+ getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gathered by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr
+ buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52f32cda2609..3c693f5d5ee0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -47,8 +55,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "VPlan.h"
-#include "VPlanBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlanHCFGBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,11 +66,9 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -70,6 +77,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -124,6 +132,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -145,10 +154,6 @@ using namespace llvm;
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
-static cl::opt<bool>
- EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
-
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
@@ -184,9 +189,6 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
@@ -209,7 +211,7 @@ static cl::opt<unsigned> SmallLoopCost(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
@@ -238,71 +240,21 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
- "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
- "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed with a "
- "vectorize(enable) pragma"));
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis
-createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
- Instruction *I = nullptr) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
-
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
-}
-
-namespace {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class LoopVectorizationRequirements;
-
-} // end anonymous namespace
-
-/// Returns true if the given loop body has a cycle, excluding the loop
-/// itself.
-static bool hasCyclesInLoopBody(const Loop &L) {
- if (!L.empty())
- return true;
-
- for (const auto &SCC :
- make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
- scc_iterator<Loop, LoopBodyTraits>::end(L))) {
- if (SCC.size() > 1) {
- DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
- DEBUG(L.dump());
- return true;
- }
- }
- return false;
-}
+static cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
@@ -317,16 +269,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
// in the project. They can be effectively organized in a common Load/Store
// utilities unit.
-/// A helper function that returns the pointer operand of a load or store
-/// instruction.
-static Value *getPointerOperand(Value *I) {
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (auto *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
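The helper removed above is exactly the duplication the TODO refers to; a common Load/Store utility of roughly this shape (the name getLoadStorePointerOperand and its eventual home in llvm/IR/Instructions.h are assumptions based on the TODO, not on this patch) would cover all callers:

// Hedged sketch of a shared helper: returns the pointer operand of a
// load or store, or nullptr for any other value.
static Value *getLoadStorePointerOperand(Value *V) {
  if (auto *LI = dyn_cast<LoadInst>(V))
    return LI->getPointerOperand();
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getPointerOperand();
  return nullptr;
}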
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -373,7 +315,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for for every X iterations of the loop header.
+/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
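A minimal sketch of such a helper and its use, assuming the 50% heuristic described above (illustrative; the cost-model call sites are simplified):

// Under the 50% assumption, a predicated block executes once for every
// 2 iterations of the loop header, so the reciprocal probability is 2.
static unsigned getReciprocalPredBlockProb() { return 2; }

// Hypothetical use: discount a predicated block's cost accordingly.
static unsigned scalePredicatedBlockCost(unsigned BlockCost) {
  return BlockCost / getReciprocalPredBlockProb();
}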
@@ -502,7 +444,7 @@ public:
void vectorizeMemoryInstruction(Instruction *Instr,
VectorParts *BlockInMask = nullptr);
- /// \brief Set the debug location in the builder using the debug location in
+ /// Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
@@ -538,7 +480,7 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes with some
+ /// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
@@ -573,9 +515,9 @@ protected:
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
- /// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID);
/// Create a vector induction phi node based on an existing scalar one. \p
@@ -602,10 +544,20 @@ protected:
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
+ ///
+ /// \p EntryVal is the value from the original loop that maps to the vector
+  /// phi node and is used to distinguish which IV is currently being
+  /// processed: the original one (if \p EntryVal is a phi corresponding to the
+  /// original IV) or the "newly-created" one based on the proof mentioned above
+ /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
+ /// latter case \p EntryVal is a TruncInst and we must not record anything for
+ /// that IV, but it's error-prone to expect callers of this routine to care
+ /// about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
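Given the contract spelled out above, the body is expected to open with a guard of roughly this shape (a sketch of the documented behavior, not the literal patch text):

// If EntryVal is the newly-created, truncate-based IV, there is no cast
// chain to record for it; the recording happens when the original IV is
// processed, so bail out early.
if (isa<TruncInst>(EntryVal))
  return;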
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -646,7 +598,7 @@ protected:
/// loop.
void addMetadata(Instruction *To, Instruction *From);
- /// \brief Similar to the previous function but it adds the metadata to a
+ /// Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
@@ -679,7 +631,7 @@ protected:
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
@@ -777,7 +729,7 @@ private:
} // end namespace llvm
-/// \brief Look for a meaningful debug location on the instruction or it's
+/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
@@ -849,7 +801,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
namespace llvm {
-/// \brief The group of interleaved loads/stores sharing the same stride and
+/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
@@ -893,7 +845,7 @@ public:
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }
- /// \brief Try to insert a new member \p Instr with index \p Index and
+ /// Try to insert a new member \p Instr with index \p Index and
/// alignment \p NewAlign. The index is related to the leader and it could be
/// negative if it is the new leader.
///
@@ -927,7 +879,7 @@ public:
return true;
}
- /// \brief Get the member with the given index \p Index
+ /// Get the member with the given index \p Index
///
  /// \returns nullptr if it contains no such member.
Instruction *getMember(unsigned Index) const {
@@ -938,7 +890,7 @@ public:
return Members.find(Key)->second;
}
- /// \brief Get the index for the given member. Unlike the key in the member
+ /// Get the index for the given member. Unlike the key in the member
/// map, the index starts from 0.
unsigned getIndex(Instruction *Instr) const {
for (auto I : Members)
@@ -989,7 +941,7 @@ private:
namespace {
-/// \brief Drive the analysis of interleaved memory accesses in the loop.
+/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
@@ -1000,11 +952,12 @@ namespace {
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT, LoopInfo *LI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
+ DominatorTree *DT, LoopInfo *LI,
+ const LoopAccessInfo *LAI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
~InterleavedAccessInfo() {
- SmallSet<InterleaveGroup *, 4> DelSet;
+ SmallPtrSet<InterleaveGroup *, 4> DelSet;
// Avoid releasing a pointer twice.
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
@@ -1012,16 +965,16 @@ public:
delete Ptr;
}
- /// \brief Analyze the interleaved accesses and collect them in interleave
+ /// Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
- void analyzeInterleaving(const ValueToValueMap &Strides);
+ void analyzeInterleaving();
- /// \brief Check if \p Instr belongs to any interleave group.
+ /// Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
return InterleaveGroupMap.count(Instr);
}
- /// \brief Get the interleave group that \p Instr belongs to.
+ /// Get the interleave group that \p Instr belongs to.
///
  /// \returns nullptr if it doesn't belong to such a group.
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
@@ -1030,13 +983,10 @@ public:
return nullptr;
}
- /// \brief Returns true if an interleaved group that may access memory
+ /// Returns true if an interleaved group that may access memory
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
- /// \brief Initialize the LoopAccessInfo used for dependence checking.
- void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
-
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -1047,7 +997,7 @@ private:
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
+ const LoopAccessInfo *LAI;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -1061,7 +1011,7 @@ private:
/// access to a set of dependent sink accesses.
DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
- /// \brief The descriptor for a strided memory access.
+ /// The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
@@ -1081,10 +1031,10 @@ private:
unsigned Align = 0;
};
- /// \brief A type for holding instructions and their stride descriptors.
+ /// A type for holding instructions and their stride descriptors.
using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
- /// \brief Create a new interleave group with the given instruction \p Instr,
+ /// Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
/// \returns the newly created interleave group.
@@ -1096,7 +1046,7 @@ private:
return InterleaveGroupMap[Instr];
}
- /// \brief Release the group and remove all the relationships.
+ /// Release the group and remove all the relationships.
void releaseGroup(InterleaveGroup *Group) {
for (unsigned i = 0; i < Group->getFactor(); i++)
if (Instruction *Member = Group->getMember(i))
@@ -1105,28 +1055,28 @@ private:
delete Group;
}
- /// \brief Collect all the accesses with a constant stride in program order.
+ /// Collect all the accesses with a constant stride in program order.
void collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides);
- /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ /// Returns true if \p Stride is allowed in an interleaved group.
static bool isStrided(int Stride) {
unsigned Factor = std::abs(Stride);
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
- /// \brief Returns true if \p BB is a predicated block.
+ /// Returns true if \p BB is a predicated block.
bool isPredicated(BasicBlock *BB) const {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
- /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ /// Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
return LAI && LAI->getDepChecker().getDependences();
}
- /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
+ /// Returns true if memory accesses \p A and \p B can be reordered, if
/// necessary, when constructing interleaved groups.
///
/// \p A must precede \p B in program order. We return false if reordering is
@@ -1174,7 +1124,7 @@ private:
return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
}
- /// \brief Collect the dependences from LoopAccessInfo.
+ /// Collect the dependences from LoopAccessInfo.
///
/// We process the dependences once during the interleaved access analysis to
/// enable constant-time dependence queries.
@@ -1187,315 +1137,6 @@ private:
}
};
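For readers new to this analysis, a loop of the following shape is the canonical input (an illustrative example, not taken from the patch): the two loads share stride 2 and are adjacent, so they form one interleave group with factor 2 and member indices 0 and 1, vectorizable as a single wide load plus shuffles.

// Example: A[2*i] and A[2*i+1] form a single interleave group (factor 2).
void interleavedSum(int *A, int *B, int N) {
  for (int i = 0; i < N; ++i) {
    int Even = A[2 * i];     // member 0 of the group
    int Odd  = A[2 * i + 1]; // member 1 of the group
    B[i] = Even + Odd;
  }
}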
-/// Utility class for getting and setting loop vectorizer hints in the form
-/// of loop metadata.
-/// This class keeps a number of loop annotations locally (as member variables)
-/// and can, upon request, write them back as metadata on the loop. It will
-/// initially scan the loop for existing metadata, and will update the local
-/// values based on information in the loop.
-/// We cannot write all values to metadata, as the mere presence of some info,
-/// for example 'force', means a decision has been made. So, we need to be
-/// careful NOT to add them if the user hasn't specifically asked so.
-class LoopVectorizeHints {
- enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
-
- /// Hint - associates name and validation with the hint value.
- struct Hint {
- const char *Name;
- unsigned Value; // This may have to change for non-numeric values.
- HintKind Kind;
-
- Hint(const char *Name, unsigned Value, HintKind Kind)
- : Name(Name), Value(Value), Kind(Kind) {}
-
- bool validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- case HK_ISVECTORIZED:
- return (Val==0 || Val==1);
- }
- return false;
- }
- };
-
- /// Vectorization width.
- Hint Width;
-
- /// Vectorization interleave factor.
- Hint Interleave;
-
- /// Vectorization forced
- Hint Force;
-
- /// Already Vectorized
- Hint IsVectorized;
-
- /// Return the loop metadata prefix.
- static StringRef Prefix() { return "llvm.loop."; }
-
- /// True if there is any unsafe math in the loop.
- bool PotentiallyUnsafe = false;
-
-public:
- enum ForceKind {
- FK_Undefined = -1, ///< Not selected.
- FK_Disabled = 0, ///< Forcing disabled.
- FK_Enabled = 1, ///< Forcing enabled.
- };
-
- LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
- OptimizationRemarkEmitter &ORE)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor,
- HK_WIDTH),
- Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- if (IsVectorized.Value != 1)
- // If the vectorization width and interleaving count are both 1 then
- // consider the loop to have been already vectorized because there's
- // nothing more that we can do.
- IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
- DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
- }
-
- /// Mark the loop L as already vectorized by setting the width to 1.
- void setAlreadyVectorized() {
- IsVectorized.Value = 1;
- Hint Hints[] = {IsVectorized};
- writeHintsToMetadata(Hints);
- }
-
- bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
- if (getForce() == LoopVectorizeHints::FK_Disabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (getIsVectorized() == 1) {
- DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- // FIXME: Add interleave.disable metadata. This will allow
- // vectorize.disable to be used without disabling the pass and errors
- // to differentiate between disabled vectorization and a width of 1.
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
- "AllDisabled", L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or the loop has already been "
- "vectorized";
- });
- return false;
- }
-
- return true;
- }
-
- /// Dumps all the hint information.
- void emitRemarkWithHints() const {
- using namespace ore;
-
- ORE.emit([&]() {
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled";
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(),
- TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
- if (Interleave.Value != 0)
- R << ", Interleave Count="
- << NV("InterleaveCount", Interleave.Value);
- R << ")";
- }
- return R;
- }
- });
- }
-
- unsigned getWidth() const { return Width.Value; }
- unsigned getInterleave() const { return Interleave.Value; }
- unsigned getIsVectorized() const { return IsVectorized.Value; }
- enum ForceKind getForce() const { return (ForceKind)Force.Value; }
-
- /// \brief If hints are provided that force vectorization, use the AlwaysPrint
- /// pass name to force the frontend to print the diagnostic.
- const char *vectorizeAnalysisPassName() const {
- if (getWidth() == 1)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Disabled)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
- return LV_NAME;
- return OptimizationRemarkAnalysis::AlwaysPrint;
- }
-
- bool allowReordering() const {
- // When enabling loop hints are provided we allow the vectorizer to change
- // the order of operations that is given by the scalar loop. This is not
- // enabled by default because can be unsafe or inefficient. For example,
- // reordering floating-point operations will change the way round-off
- // error accumulates in the loop.
- return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
- }
-
- bool isPotentiallyUnsafe() const {
- // Avoid FP vectorization if the target is unsure about proper support.
- // This may be related to the SIMD unit in the target not handling
- // IEEE 754 FP ops properly, or bad single-to-double promotions.
- // Otherwise, a sequence of vectorized loops, even without reduction,
- // could lead to different end results on the destination vectors.
- return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
- }
-
- void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
-
-private:
- /// Find hints specified in the loop metadata and update local values.
- void getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
- }
- }
-
- /// Checks string hint with one operand and set value if valid.
- void setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C)
- return;
- unsigned Val = C->getZExtValue();
-
- Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
- }
- }
-
- /// Create a new hint from name / value pair.
- MDNode *createHintMetadata(StringRef Name, unsigned V) const {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {MDString::get(Context, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
- }
-
- /// Matches metadata with hint name.
- bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
- MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
- if (!Name)
- return false;
-
- for (auto H : HintTypes)
- if (Name->getString().endswith(H.Name))
- return true;
- return false;
- }
-
- /// Sets current hints into loop metadata, keeping other values intact.
- void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.empty())
- return;
-
- // Reserve the first element to LoopID (see below).
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, then ignore the existing operands.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If node in update list, ignore old value.
- if (!matchesHintMetadataName(Node, HintTypes))
- MDs.push_back(Node);
- }
- }
-
- // Now, add the missing hints.
- for (auto H : HintTypes)
- MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
-
- TheLoop->setLoopID(NewLoopID);
- }
-
- /// The loop these hints belong to.
- const Loop *TheLoop;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
-
} // end anonymous namespace
static void emitMissedWarning(Function *F, Loop *L,
@@ -1519,324 +1160,7 @@ static void emitMissedWarning(Function *F, Loop *L,
}
}
-namespace {
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-/// will change the order of memory accesses in a way that will change the
-/// correctness of the program.
-/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
-/// checks for a number of different conditions, such as the availability of a
-/// single induction variable, that all types are supported and vectorize-able,
-/// etc. This code reflects the capabilities of InnerLoopVectorizer.
-/// This class is also used by InnerLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
- LoopVectorizationLegality(
- Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H)
- : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
-
- /// ReductionList contains the reduction descriptors for all
- /// of the reductions that were found in the loop.
- using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
-
- /// InductionList saves induction variables and maps them to the
- /// induction descriptor.
- using InductionList = MapVector<PHINode *, InductionDescriptor>;
-
- /// RecurrenceSet contains the phi nodes that are recurrences other than
- /// inductions and reductions.
- using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
-
- /// Returns true if it is legal to vectorize this loop.
- /// This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so.
- bool canVectorize();
-
- /// Returns the primary induction variable.
- PHINode *getPrimaryInduction() { return PrimaryInduction; }
-
- /// Returns the reduction variables found in the loop.
- ReductionList *getReductionVars() { return &Reductions; }
-
- /// Returns the induction variables found in the loop.
- InductionList *getInductionVars() { return &Inductions; }
-
- /// Return the first-order recurrences found in the loop.
- RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
-
- /// Return the set of instructions to sink to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
-
- /// Returns the widest induction type.
- Type *getWidestInductionType() { return WidestIndTy; }
-
- /// Returns True if V is a Phi node of an induction variable in this loop.
- bool isInductionPhi(const Value *V);
-
- /// Returns True if V is a cast that is part of an induction def-use chain,
- /// and had been proven to be redundant under a runtime guard (in other
- /// words, the cast has the same SCEV expression as the induction phi).
- bool isCastedInductionVariable(const Value *V);
-
- /// Returns True if V can be considered as an induction variable in this
- /// loop. V can be the induction phi, or some redundant cast in the def-use
- /// chain of the inducion phi.
- bool isInductionVariable(const Value *V);
-
- /// Returns True if PN is a reduction variable in this loop.
- bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
-
- /// Returns True if Phi is a first-order recurrence in this loop.
- bool isFirstOrderRecurrence(const PHINode *Phi);
-
- /// Return true if the block BB needs to be predicated in order for the loop
- /// to be vectorized.
- bool blockNeedsPredication(BasicBlock *BB);
-
- /// Check if this pointer is consecutive when vectorizing. This happens
- /// when the last index of the GEP is the induction variable, or that the
- /// pointer itself is an induction variable.
- /// This check allows us to vectorize A[idx] into a wide load/store.
- /// Returns:
- /// 0 - Stride is unknown or non-consecutive.
- /// 1 - Address is consecutive.
- /// -1 - Address is consecutive, and decreasing.
- /// NOTE: This method must only be used before modifying the original scalar
- /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
- int isConsecutivePtr(Value *Ptr);
-
- /// Returns true if the value V is uniform within the loop.
- bool isUniform(Value *V);
-
- /// Returns the information that we collected about runtime memory check.
- const RuntimePointerChecking *getRuntimePointerChecking() const {
- return LAI->getRuntimePointerChecking();
- }
-
- const LoopAccessInfo *getLAI() const { return LAI; }
-
- /// \brief Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// \brief Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
- /// \brief Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps.
- bool requiresScalarEpilogue() const {
- return InterleaveInfo.requiresScalarEpilogue();
- }
-
- unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
-
- uint64_t getMaxSafeRegisterWidth() const {
- return LAI->getDepChecker().getMaxSafeRegisterWidth();
- }
-
- bool hasStride(Value *V) { return LAI->hasStride(V); }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType) {
- return TTI->isLegalMaskedScatter(DataType);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType) {
- return TTI->isLegalMaskedGather(DataType);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- auto *LI = dyn_cast<LoadInst>(V);
- auto *SI = dyn_cast<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ptr = getPointerOperand(V);
- auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
- }
-
- /// Returns true if vector representation of the instruction \p I
- /// requires mask.
- bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
-
- unsigned getNumStores() const { return LAI->getNumStores(); }
- unsigned getNumLoads() const { return LAI->getNumLoads(); }
- unsigned getNumPredStores() const { return NumPredStores; }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- bool isScalarWithPredication(Instruction *I);
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
-
- // Returns true if the NoNaN attribute is set on the function.
- bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
-
-private:
- /// Check if a single basic block loop is vectorizable.
- /// At this point we know that this is a loop with a constant trip count
- /// and we only need to check individual instructions.
- bool canVectorizeInstrs();
-
- /// When we vectorize loops we may change the order in which
- /// we read and write from memory. This method checks if it is
- /// legal to vectorize the code, considering only memory constrains.
- /// Returns true if the loop is vectorizable
- bool canVectorizeMemory();
-
- /// Return true if we can vectorize this loop using the IF-conversion
- /// transformation.
- bool canVectorizeWithIfConvert();
-
- /// Return true if all of the instructions in the block can be speculatively
- /// executed. \p SafePtrs is a list of addresses that are known to be legal
- /// and we know that we can read from them without segfault.
- bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
-
- /// Updates the vectorization state by adding \p Phi to the inductions list.
- /// This can set \p Phi as the main induction of the loop if \p Phi is a
- /// better choice for the main induction than the existing one.
- void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit);
-
- /// Create an analysis remark that explains why vectorization failed
- ///
- /// \p RemarkName is the identifier for the remark. If \p I is passed it is
- /// an instruction that prevents vectorization. Otherwise the loop is used
- /// for the location of the remark. \return the remark object that can be
- /// streamed to.
- OptimizationRemarkAnalysis
- createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
- RemarkName, TheLoop, I);
- }
-
- /// \brief If an access has a symbolic strides, this maps the pointer value to
- /// the stride symbol.
- const ValueToValueMap *getSymbolicStrides() {
- // FIXME: Currently, the set of symbolic strides is sometimes queried before
- // it's collected. This happens from canVectorizeWithIfConvert, when the
- // pointer is checked to reference consecutive elements suitable for a
- // masked access.
- return LAI ? &LAI->getSymbolicStrides() : nullptr;
- }
-
- unsigned NumPredStores = 0;
-
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
- /// Applies dynamic knowledge to simplify SCEV expressions in the context
- /// of existing SCEV assumptions. The analysis will also add a minimal set
- /// of new predicates if this is required to enable vectorization and
- /// unrolling.
- PredicatedScalarEvolution &PSE;
-
- /// Target Library Info.
- TargetLibraryInfo *TLI;
-
- /// Target Transform Info
- const TargetTransformInfo *TTI;
-
- /// Dominator Tree.
- DominatorTree *DT;
-
- // LoopAccess analysis.
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
-
- // And the loop-accesses info corresponding to this loop. This pointer is
- // null until canVectorizeMemory sets it up.
- const LoopAccessInfo *LAI = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride and close to each other.
- InterleavedAccessInfo InterleaveInfo;
-
- // --- vectorization state --- //
-
- /// Holds the primary induction variable. This is the counter of the
- /// loop.
- PHINode *PrimaryInduction = nullptr;
-
- /// Holds the reduction variables.
- ReductionList Reductions;
-
- /// Holds all of the induction variables that we found in the loop.
- /// Notice that inductions don't need to start at zero and that induction
- /// variables can be pointers.
- InductionList Inductions;
-
- /// Holds all the casts that participate in the update chain of the induction
- /// variables, and that have been proven to be redundant (possibly under a
- /// runtime guard). These casts can be ignored when creating the vectorized
- /// loop body.
- SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
-
- /// Holds the phi nodes that are first-order recurrences.
- RecurrenceSet FirstOrderRecurrences;
-
- /// Holds instructions that need to sink past other instructions to handle
- /// first-order recurrences.
- DenseMap<Instruction *, Instruction *> SinkAfter;
-
- /// Holds the widest induction type encountered.
- Type *WidestIndTy = nullptr;
-
- /// Allowed outside users. This holds the induction and reduction
- /// vars which can be accessed from outside the loop.
- SmallPtrSet<Value *, 4> AllowedExit;
-
- /// Can we assume the absence of NaNs.
- bool HasFunNoNaNAttr = false;
-
- /// Vectorization requirements that will go through late-evaluation.
- LoopVectorizationRequirements *Requirements;
-
- /// Used to emit an analysis of any legality issues.
- LoopVectorizeHints *Hints;
-
- /// While vectorizing these instructions we have to generate a
- /// call to the appropriate masked intrinsic
- SmallPtrSet<const Instruction *, 8> MaskedOp;
-};
+namespace llvm {
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
@@ -1853,23 +1177,15 @@ public:
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints)
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization should be avoided up front.
Optional<unsigned> computeMaxVF(bool OptForSize);
- /// Information about vectorization costs
- struct VectorizationFactor {
- // Vector width with best cost
- unsigned Width;
-
- // Cost of the loop with that width
- unsigned Cost;
- };
-
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -1903,7 +1219,7 @@ public:
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);
- /// \brief A struct that represents some properties of the register usage
+ /// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
@@ -1911,9 +1227,6 @@ public:
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
-
- /// Holds the number of instructions in the loop.
- unsigned NumInstructions;
};
/// \return Returns information about the register usages of the loop for the
@@ -2063,7 +1376,69 @@ public:
collectLoopScalars(VF);
}
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI.isLegalMaskedScatter(DataType);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI.isLegalMaskedGather(DataType);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
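To make requiresScalarEpilogue concrete, consider a group with a gap (an illustrative example, not from the patch): with factor 3 but only two members, a wide load covering the whole group would also read the missing offset-2 element, which past the last iteration may be out of bounds; keeping a scalar epilogue iteration avoids that speculative access.

// Example: stride-3 accesses with the offset-2 member missing. A wide
// load for the group would touch A[3*i+2] as well, so the final
// iteration(s) run in a scalar epilogue instead.
void gappedSum(int *A, int *B, int N) {
  for (int i = 0; i < N; ++i)
    B[i] = A[3 * i] + A[3 * i + 1];
}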
private:
+ unsigned NumPredStores = 0;
+
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
@@ -2115,12 +1490,16 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. \return the remark object
/// that can be streamed to.
OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
RemarkName, TheLoop);
}
@@ -2222,6 +1601,10 @@ public:
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
@@ -2229,271 +1612,78 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
-} // end anonymous namespace
-
-namespace llvm {
-
-/// InnerLoopVectorizer vectorizes loops which contain only one basic
-/// LoopVectorizationPlanner - drives the vectorization process after having
-/// passed Legality checks.
-/// The planner builds and optimizes the Vectorization Plans which record the
-/// decisions how to vectorize the given loop. In particular, represent the
-/// control-flow of the vectorized version, the replication of instructions that
-/// are to be scalarized, and interleave access groups.
-class LoopVectorizationPlanner {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
-
- using VPlanPtr = std::unique_ptr<VPlan>;
-
- SmallVector<VPlanPtr, 4> VPlans;
-
- /// This class is used to enable the VPlan to invoke a method of ILV. This is
- /// needed until the method is refactored out of ILV and becomes reusable.
- struct VPCallbackILV : public VPCallback {
- InnerLoopVectorizer &ILV;
-
- VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
-
- Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
- return ILV.getOrCreateVectorValue(V, Part);
- }
- };
-
- /// A builder used to construct the current plan.
- VPBuilder Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
-
- unsigned BestVF = 0;
- unsigned BestUF = 0;
-
-public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
-
- /// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
- unsigned UserVF);
-
- /// Finalize the best decision and dispose of all other VPlans.
- void setBestPlan(unsigned VF, unsigned UF);
-
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
-
- void printPlans(raw_ostream &O) {
- for (const auto &Plan : VPlans)
- O << *Plan;
- }
-
-protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
- /// A range of powers-of-2 vectorization factors with fixed start and
- /// adjustable end. The range includes start and excludes end, e.g.,:
- /// [1, 9) = {1, 2, 4, 8}
- struct VFRange {
- // A power of 2.
- const unsigned Start;
-
- // Need not be a power of 2. If End <= Start range is empty.
- unsigned End;
- };
-
- /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
- /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
- /// returned value holds for the entire \p Range.
- bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
- VFRange &Range);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop.
- void buildVPlans(unsigned MinVF, unsigned MaxVF);
-
-private:
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
-
- /// Check if \I belongs to an Interleave Group within the given VF \p Range,
- /// \return true in the first returned value if so and false otherwise.
- /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG
- /// for \p Range.Start, and provide it as the second returned value.
- /// Note that if \I is an adjunct member of an IG for \p Range.Start, the
- /// \return value is <true, nullptr>, as it is handled by another recipe.
- /// \p Range.End may be decreased to ensure same decision from \p Range.Start
- /// to \p Range.End.
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
-
- // Check if \I is a memory instruction to be widened for \p Range.Start and
- // potentially masked. Such instructions are handled by a recipe that takes an
- // additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
- VFRange &Range,
- VPlanPtr &Plan);
-
- /// Check if an induction recipe should be constructed for \I within the given
- /// VF \p Range. If so build and return it. If not, return null. \p Range.End
- /// may be decreased to ensure same decision from \p Range.Start to
- /// \p Range.End.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
- VFRange &Range);
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
-
- /// Check if \p I can be widened within the given VF \p Range. If \p I can be
- /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
- /// extended to include \p I or else build a new VPWidenRecipe for it and
- /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
- /// false otherwise. Range.End may be decreased to ensure same decision from
- /// \p Range.Start to \p Range.End.
- bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
-
- /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan);
-
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
- VPlanPtr &Plan);
-
- /// Build a VPlan according to the information gathered by Legal. \return a
- /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
- /// exclusive, possibly decreasing \p Range.End.
- VPlanPtr buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef);
-};
-
} // end namespace llvm
-namespace {
-
-/// \brief This holds vectorization requirements that must be verified late in
-/// the process. The requirements are set by legalize and costmodel. Once
-/// vectorization has been determined to be possible and profitable the
-/// requirements can be verified by looking for metadata or compiler options.
-/// For example, some loops require FP commutativity which is only allowed if
-/// vectorization is explicitly specified or if the fast-math compiler option
-/// has been provided.
-/// Late evaluation of these requirements allows helpful diagnostics to be
-/// composed that tells the user what need to be done to vectorize the loop. For
-/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
-/// evaluation should be used only when diagnostics can generated that can be
-/// followed by a non-expert user.
-class LoopVectorizationRequirements {
-public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
-
- void addUnsafeAlgebraInst(Instruction *I) {
- // First unsafe algebra instruction.
- if (!UnsafeAlgebraInst)
- UnsafeAlgebraInst = I;
- }
-
- void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
-
- bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
- if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisFPCommute(
- PassName, "CantReorderFPOps",
- UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations";
- });
- Failed = true;
- }
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
- return Failed;
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
}
-private:
- unsigned NumRuntimePointerChecks = 0;
- Instruction *UnsafeAlgebraInst = nullptr;
+ if (!Hints.getWidth()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
-} // end anonymous namespace
+ return true;
+}
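An outer loop that passes all of the checks above would be annotated like this (illustrative source; the pragma spelling assumes clang's loop-hint syntax):

// Explicit outer-loop vectorization hint with a vector length. Note the
// absence of an interleave hint, which the checks above would reject.
void outerLoop(int **A, int N, int M) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < M; ++j)
      A[i][j] = 0;
}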
-static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
- if (L.empty()) {
- if (!hasCyclesInLoopBody(L))
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
+ if (L.empty() || VPlanBuildStressTest ||
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
V.push_back(&L);
- return;
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
}
for (Loop *InnerL : L)
- addAcyclicInnerLoop(*InnerL, V);
+ collectSupportedLoops(*InnerL, LI, ORE, V);
}
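// Editor's illustration (not part of the patch): for a nest L1 { L2 { L3 } },
// the walk above collects only the innermost loop L3 in the default path.
// With EnableVPlanNativePath it collects L1 instead when L1 carries an
// explicit vectorization hint, and under VPlanBuildStressTest it collects L1
// unconditionally; once an outer loop is taken (and its body is reducible),
// the function returns without recursing into L2 or L3.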
namespace {
@@ -2562,14 +1752,16 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop.
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Otherwise, the broadcast will be
+ // placed inside the vector loop body.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
- bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
-
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
- if (Invariant)
+ if (SafeToHoist)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
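// Editor's illustration (not part of the patch): broadcasting an invariant
// scalar %x at VF = 4 emits IR along the lines of
//   %bc.ins = insertelement <4 x i32> undef, i32 %x, i32 0
//   %bc.splat = shufflevector <4 x i32> %bc.ins, <4 x i32> undef,
//               <4 x i32> zeroinitializer
// placed in the vector preheader when %x dominates it (SafeToHoist), and at
// the current insert point inside the vector body otherwise.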
@@ -2580,6 +1772,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
@@ -2627,14 +1821,18 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
- recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}
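// Editor's illustration (not part of the patch): for an i32 IV starting at 0
// with step 1, VF = 4 and UF = 2, SplatVF is <4, 4, 4, 4> and the loop above
// produces
//   part 0: vec.ind  = <0, 1, 2, 3>
//   part 1: step.add = <4, 5, 6, 7>
// with a final step.add of <8, 9, 10, 11> feeding the vec.ind phi backedge.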
// Move the last step to the end of the latch block. This ensures consistent
@@ -2665,8 +1863,20 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
- unsigned Lane) {
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV, based on the proof that the casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
+ // re-uses the same InductionDescriptor as the original IV, but no recording
+ // is needed in this case - that is done when the original IV is processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
return;
@@ -2754,15 +1964,16 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
+ // TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
- recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
}
}
@@ -2833,7 +2044,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal,
+ Instruction *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
@@ -2868,25 +2079,11 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
}
}
}
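// Editor's illustration (not part of the patch): for a scalar IV %i with
// step 1, VF = 4 and UF = 2, the steps built above are
//   %i + 0, %i + 1, %i + 2, %i + 3    (part 0)
//   %i + 4, %i + 5, %i + 6, %i + 7    (part 1)
// i.e. the scalar value for (Part, Lane) is ScalarIV + (VF * Part + Lane) * Step.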
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
- ValueToValueMap();
-
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
- if (Stride == 1 || Stride == -1)
- return Stride;
- return 0;
-}
-
-bool LoopVectorizationLegality::isUniform(Value *V) {
- return LAI->isUniform(V);
-}
-
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
@@ -3046,7 +2243,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
- const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
// Skip if current instruction is not the insert position.
@@ -3054,7 +2251,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
return;
const DataLayout &DL = Instr->getModule()->getDataLayout();
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
@@ -3076,6 +2273,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Group->isReverse())
Index += (VF - 1) * Group->getFactor();
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
for (unsigned Part = 0; Part < UF; Part++) {
Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
@@ -3091,6 +2292,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// A[i+2] = c; // Member of index 2 (Current instruction)
// Current pointer is pointed to A[i+2], adjust it to A[i].
NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+ if (InBounds)
+ cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
// Cast to the vector pointer type.
NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
@@ -3196,7 +2399,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
@@ -3227,10 +2430,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (isMaskRequired)
Mask = *BlockInMask;
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
+ } else {
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
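// Editor's illustration (not part of the patch): with VF = 4, a consecutive
// forward access uses part pointers Ptr + 0 and Ptr + 4 (Part * VF). For a
// reverse access the lambda computes Ptr + (-Part * 4) + (1 - 4), so part 0
// covers elements [Ptr - 3, Ptr] and part 1 covers [Ptr - 7, Ptr - 4]; the
// loaded or stored vectors (and any mask) are then reversed element-wise.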
+
// Handle Stores:
if (SI) {
- assert(!Legal->isUniform(SI->getPointerOperand()) &&
- "We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3242,30 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = reverseVector(StoredVal);
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
-
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr =
- Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
}
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
-
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
@@ -3289,21 +2503,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide load needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
- }
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
@@ -3457,7 +2657,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step.
- if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ if (VF > 1 && Cost->requiresScalarEpilogue()) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -3508,8 +2708,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
@@ -3714,6 +2914,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
@@ -3871,7 +3073,7 @@ struct CSEDenseMapInfo {
} // end anonymous namespace
-///\brief Perform cse of induction variable instructions.
+/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
@@ -3893,7 +3095,7 @@ static void cse(BasicBlock *BB) {
}
}
-/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
const TargetTransformInfo &TTI) {
@@ -4074,7 +3276,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
- } else if (isa<LoadInst>(I)) {
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
@@ -4089,7 +3291,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
- llvm_unreachable("Unhandled instruction type!");
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
}
// Lastly, extend the result.
@@ -4164,15 +3367,12 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() {
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
- for (Instruction &I : *OrigLoop->getHeader()) {
- PHINode *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- break;
+ for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
// Handle first-order recurrences and reductions that need to be fixed.
- if (Legal->isFirstOrderRecurrence(Phi))
- fixFirstOrderRecurrence(Phi);
- else if (Legal->isReductionVariable(Phi))
- fixReduction(Phi);
+ if (Legal->isFirstOrderRecurrence(&Phi))
+ fixFirstOrderRecurrence(&Phi);
+ else if (Legal->isReductionVariable(&Phi))
+ fixReduction(&Phi);
}
}
@@ -4335,15 +3535,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find the phi node for the original scalar
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
- for (auto &I : *LoopExitBlock) {
- auto *LCSSAPhi = dyn_cast<PHINode>(&I);
- if (!LCSSAPhi)
- break;
- if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- break;
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getIncomingValue(0) == Phi) {
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
}
}
@@ -4499,21 +3695,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end();
- LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi)
- break;
-
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
- assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+ assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
// We found a reduction value exit-PHI. Update it with the
// incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
- LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
} // end of the LCSSA phi scan.
// Fix the scalar loop reduction variable with the incoming reduction sum
@@ -4528,14 +3718,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (Instruction &LEI : *LoopExitBlock) {
- auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
- if (!LCSSAPhi)
- break;
- if (LCSSAPhi->getNumIncomingValues() == 1) {
- assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
"Incoming value isn't loop invariant");
- LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
}
}
}
@@ -4955,7 +4142,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
default:
// This instruction is not vectorized by simple widening.
- DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
@@ -4973,467 +4160,7 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
- DEBUG(DT->verifyDomTree());
-}
-
-/// \brief Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (Instruction &I : *BB) {
- auto *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- return true;
- for (Value *V : Phi->incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion) {
- ORE->emit(createMissedAnalysis("IfConversionDisabled")
- << "if-conversion is disabled");
- return false;
- }
-
- assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-
- // A list of pointers that we can safely read and write to.
- SmallPtrSet<Value *, 8> SafePointes;
-
- // Collect safe addresses.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (blockNeedsPredication(BB))
- continue;
-
- for (Instruction &I : *BB)
- if (auto *Ptr = getPointerOperand(&I))
- SafePointes.insert(Ptr);
- }
-
- // Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
- for (BasicBlock *BB : TheLoop->blocks()) {
- // We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator())) {
- ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
- << "loop contains a switch statement");
- return false;
- }
-
- // We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- }
-
- // We can if-convert this loop.
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorize() {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
-
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- // We must have a loop in canonical form. Loops with indirectbr in them cannot
- // be canonicalized.
- if (!TheLoop->getLoopPreheader()) {
- DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // FIXME: The code is currently dead, since any loop that gets sent to
- // LoopVectorizationLegality is already an innermost loop.
- //
- // We can only vectorize innermost loops.
- if (!TheLoop->empty()) {
- ORE->emit(createMissedAnalysis("NotInnermostLoop")
- << "loop is not the innermost loop");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single exiting block.
- if (!TheLoop->getExitingBlock()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We only handle bottom-tested loops, i.e. loop in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We need to have a loop header.
- DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
- << '\n');
-
- // Check if we can if-convert non-single-bb loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
- DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check if we can vectorize the instructions and CFG in this loop.
- if (!canVectorizeInstrs()) {
- DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Go over each instruction and look at memory deps.
- if (!canVectorizeMemory()) {
- DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- DEBUG(dbgs() << "LV: We can vectorize this loop"
- << (LAI->getRuntimePointerChecking()->Need
- ? " (with a runtime bound check)"
- : "")
- << "!\n");
-
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved)
- InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
-
- unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
- if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
- SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
-
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
- << "Too many SCEV assumptions need to be made and checked "
- << "at runtime");
- DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Okay! We've done all the tests. If any have failed, return false. Otherwise
- // we can vectorize, and at this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return true with
- // no restrictions.
- return Result;
-}
-
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
- if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty);
-
- // It is possible that chars or shorts overflow when we ask for the loop's
- // trip count; work around this by changing the type size.
- if (Ty->getScalarSizeInBits() < 32)
- return Type::getInt32Ty(Ty->getContext());
-
- return Ty;
-}
-
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
- Ty0 = convertPointerToIntegerType(DL, Ty0);
- Ty1 = convertPointerToIntegerType(DL, Ty1);
- if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
- return Ty0;
- return Ty1;
-}
-
-/// \brief Check that the instruction has outside loop users and is not an
-/// identified reduction variable.
-static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- // Reduction and Induction instructions are allowed to have exit users. All
- // other instructions must not have external users.
- if (!AllowedExit.count(Inst))
- // Check that all of the users of the loop are inside the BB.
- for (User *U : Inst->users()) {
- Instruction *UI = cast<Instruction>(U);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(UI)) {
- DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
- return true;
- }
- }
- return false;
-}
-
-void LoopVectorizationLegality::addInductionPhi(
- PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- Inductions[Phi] = ID;
-
- // In case this induction also comes with casts that we know we can ignore
- // in the vectorized loop body, record them here. All casts could be recorded
- // here for ignoring, but it suffices to record only the first (as it is the
- // only one that may be used outside the cast sequence).
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (!Casts.empty())
- InductionCastsToIgnore.insert(*Casts.begin());
-
- Type *PhiTy = Phi->getType();
- const DataLayout &DL = Phi->getModule()->getDataLayout();
-
- // Get the widest type.
- if (!PhiTy->isFloatingPointTy()) {
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
- }
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getConstIntStepValue() &&
- ID.getConstIntStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
-
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!PrimaryInduction || PhiTy == WidestIndTy)
- PrimaryInduction = Phi;
- }
-
- // Both the PHI node itself, and the "post-increment" value feeding
- // back into the PHI node may have external users.
- // We can allow those uses, except if the SCEVs we have for them rely
- // on predicates that only hold within the loop, since allowing the exit
- // currently means re-using this SCEV outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- }
-
- DEBUG(dbgs() << "LV: Found an induction variable.\n");
-}
-
-bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Look for the attribute signaling the absence of NaNs.
- Function &F = *Header->getParent();
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-
- // For each block in the loop.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the instructions in the block and look for hazards.
- for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "loop control flow is not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
- return false;
- }
-
- // If this PHINode is not in the header block, then we know that we
- // can convert it to select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Check that this instruction has no outside users or is an
- // identified reduction value with an outside user.
- if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
- continue;
- ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
- << "value could not be identified as "
- "an induction or reduction variable");
- return false;
- }
-
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "control flow not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
-
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
- if (RedDes.hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
-
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
- Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
- continue;
- }
-
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
- FirstOrderRecurrences.insert(Phi);
- continue;
- }
-
- // As a last resort, coerce the PHI to an AddRec expression
- // and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
-
- ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
- << "value that could not be identified as "
- "reduction is used outside the loop");
- DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Are debug info intrinsics.
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !isa<DbgInfoIntrinsic>(CI) &&
- !(CI->getCalledFunction() && TLI &&
- TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
- << "call instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
- return false;
- }
-
- // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
- // second argument is the same (i.e. loop invariant).
- if (CI && hasVectorInstrinsicScalarOpd(
- getVectorIntrinsicIDForCall(CI, TLI), 1)) {
- auto *SE = PSE.getSE();
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
- ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
- << "intrinsic instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
- return false;
- }
- }
-
- // Check that the instruction return type is vectorizable.
- // Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
- isa<ExtractElementInst>(I)) {
- ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
- << "instruction return type cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
- return false;
- }
-
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
- << "store instruction cannot be vectorized");
- return false;
- }
-
- // FP instructions can allow unsafe algebra, thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
- }
-
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
- << "value cannot be used outside the loop");
- return false;
- }
- } // next instr.
- }
-
- if (!PrimaryInduction) {
- DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- if (Inductions.empty()) {
- ORE->emit(createMissedAnalysis("NoInductionVariable")
- << "loop induction variable could not be identified");
- return false;
- }
- }
-
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
- return true;
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -5461,7 +4188,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (auto *Store = dyn_cast<StoreInst>(MemAccess))
if (Ptr == Store->getValueOperand())
return WideningDecision == CM_Scalarize;
- assert(Ptr == getPointerOperand(MemAccess) &&
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
"Ptr is neither a value or pointer operand");
return WideningDecision != CM_GatherScatter;
};
@@ -5527,7 +4254,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
for (auto *I : ScalarPtrs)
if (!PossibleNonScalarPtrs.count(I)) {
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -5544,8 +4271,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
continue;
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
// Insert the forced scalars.
@@ -5572,7 +4300,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
isScalarUse(J, Src));
})) {
Worklist.insert(Src);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
}
}
@@ -5612,21 +4340,30 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// The induction variable and its update instruction will remain scalar.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+ if (!Legal->blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
break;
- case Instruction::Store:
- return !isMaskRequired(I);
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ?
+ !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+ }
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
@@ -5636,17 +4373,17 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
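// Editor's illustration (not part of the patch): in
//   if (c[i]) A[i] = A[i] / B[i];
// the SDiv is scalar-with-predication (unless the divisor is known non-zero),
// since speculating it could trap; the load and store of A[i] are
// scalar-with-predication only when the target reports neither a legal
// masked load/store nor a legal gather/scatter for their type.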
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- auto *Ptr = getPointerOperand(I);
+ auto *Ptr = getLoadStorePointerOperand(I);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
@@ -5697,7 +4434,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
Worklist.insert(Cmp);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}
// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
@@ -5729,7 +4466,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
// If there's no pointer operand, there's nothing to do.
- auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (!Ptr)
continue;
@@ -5737,7 +4474,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// pointer operand.
auto UsersAreMemAccesses =
llvm::all_of(Ptr->users(), [&](User *U) -> bool {
- return getPointerOperand(U) == Ptr;
+ return getLoadStorePointerOperand(U) == Ptr;
});
// Ensure the memory instruction will not be scalarized or used by
@@ -5758,7 +4495,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
if (!PossibleNonUniformPtrs.count(V)) {
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
Worklist.insert(V);
}
@@ -5777,10 +4514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
- (OI == getPointerOperand(J) && isUniformDecision(J, VF));
+ (OI == getLoadStorePointerOperand(J) &&
+ isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
}
}
}
@@ -5788,7 +4526,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5825,123 +4563,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// The induction variable and its update instruction will remain uniform.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
+ << "\n");
}
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
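// Editor's illustration (not part of the patch): in
//   for (i = 0; i < n; ++i) sum += A[i];
// the address %gep = &A[i] feeding a consecutive, widened load is uniform:
// each unrolled part issues one wide load from a single scalar address, so
// %gep (and the IV computation feeding it) is kept scalar per part rather
// than widened into a vector of addresses.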
-bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &(*GetLAA)(*TheLoop);
- InterleaveInfo.setLAI(LAI);
- const OptimizationRemarkAnalysis *LAR = LAI->getReport();
- if (LAR) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- });
- }
- if (!LAI->canVectorizeMemory())
- return false;
-
- if (LAI->hasStoreToLoopInvariantAddress()) {
- ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
- << "write to a loop invariant address could not be vectorized");
- DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
- return false;
- }
-
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
-
- return true;
-}
-
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
- Value *In0 = const_cast<Value *>(V);
- PHINode *PN = dyn_cast_or_null<PHINode>(In0);
- if (!PN)
- return false;
-
- return Inductions.count(PN);
-}
-
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
- auto *Inst = dyn_cast<Instruction>(V);
- return (Inst && InductionCastsToIgnore.count(Inst));
-}
-
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- return isInductionPhi(V) || isCastedInductionVariable(V);
-}
-
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
- return FirstOrderRecurrences.count(Phi);
-}
-
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-}
-
-bool LoopVectorizationLegality::blockCanBePredicated(
- BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
- const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-
- for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
- // We might be able to hoist the load.
- if (I.mayReadFromMemory()) {
- auto *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- return false;
- if (!SafePtrs.count(LI->getPointerOperand())) {
- if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
- isLegalMaskedGather(LI->getType())) {
- MaskedOp.insert(LI);
- continue;
- }
- // !llvm.mem.parallel_loop_access implies if-conversion safety.
- if (IsAnnotatedParallel)
- continue;
- return false;
- }
- }
-
- if (I.mayWriteToMemory()) {
- auto *SI = dyn_cast<StoreInst>(&I);
- // We only support predication of stores in basic blocks with one
- // predecessor.
- if (!SI)
- return false;
-
- // Build a masked store if it is legal for the target.
- if (isLegalMaskedStore(SI->getValueOperand()->getType(),
- SI->getPointerOperand()) ||
- isLegalMaskedScatter(SI->getValueOperand()->getType())) {
- MaskedOp.insert(SI);
- continue;
- }
-
- bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
- bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
- if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
- !isSinglePredecessor)
- return false;
- }
- if (I.mayThrow())
- return false;
- }
-
- return true;
-}
-
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
@@ -5962,7 +4591,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
if (!LI && !SI)
continue;
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
// We don't check wrapping here because we don't know yet if Ptr will be
// part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
@@ -6022,9 +4651,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving(
- const ValueToValueMap &Strides) {
- DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+void InterleavedAccessInfo::analyzeInterleaving() {
+ LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+ const ValueToValueMap &Strides = LAI->getSymbolicStrides();
// Holds all accesses with a constant stride.
MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
@@ -6065,7 +4694,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (isStrided(DesB.Stride)) {
Group = getInterleaveGroup(B);
if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
+ << '\n');
Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
}
if (B->mayWriteToMemory())
@@ -6124,7 +4754,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Ignore A if it's already in a group or isn't the same kind of memory
// operation as B.
- if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ // Note that mayReadFromMemory() and mayWriteToMemory() are not mutually
+ // exclusive for atomic loads. We should not see those here;
+ // canVectorizeMemory() should have returned false, except when we asked
+ // for optimization remarks.
+ if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
+ (A->mayWriteToMemory() != B->mayWriteToMemory()))
continue;
// Check rules 1 and 2. Ignore A if its stride or size is different from
@@ -6163,8 +4798,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Try to insert A into B's group.
if (Group->insertMember(A, IndexA, DesA.Align)) {
- DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
- << " into the interleave group with" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
+ << " into the interleave group with" << *B
+ << '\n');
InterleaveGroupMap[A] = Group;
// Set the first load in program order as the insert position.
@@ -6177,8 +4813,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
- "to gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
releaseGroup(Group);
}
// Remove interleaved groups with gaps (currently only loads) whose memory
@@ -6207,21 +4844,23 @@ void InterleavedAccessInfo::analyzeInterleaving(
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; if the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+ Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "first group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "first group member potentially pointer-wrapping.\n");
releaseGroup(Group);
continue;
}
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
- Value *LastMemberPtr = getPointerOperand(LastMember);
+ Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "last group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
} else {
@@ -6231,29 +4870,25 @@ void InterleavedAccessInfo::analyzeInterleaving(
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "a reverse access with gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
- DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
RequiresScalarEpilogue = true;
}
}
}
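// Editor's illustration (not part of the patch): for stride-3 stores
//   A[3*i] = x; A[3*i + 2] = z;      // member at index 1 is missing
// the candidate store group of factor 3 has getNumMembers() == 2 and is
// released above, since filling the gap would store undefined values to
// A[3*i + 1]. Load groups may keep such gaps, at the cost of a scalar
// epilogue iteration (or invalidation when reversed or possibly wrapping).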
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return None;
- }
-
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do this, since it's still likely to be
// dynamically uniform if the target can skip.
- DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+ LLVM_DEBUG(
+ dbgs() << "LV: Not inserting runtime ptr check for divergent target\n");
ORE->emit(
createMissedAnalysis("CantVersionLoopWithDivergentTarget")
@@ -6271,20 +4906,22 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
- DEBUG(dbgs()
- << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return None;
}
// If we optimize the program for size, avoid creating the tail loop.
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
// If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) {
ORE->emit(
createMissedAnalysis("UnknownLoopCountComplexCFG")
<< "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6302,7 +4939,8 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
"when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6327,29 +4965,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
unsigned MaxVectorSize = WidestRegister / WidestType;
- DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
- << WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
- << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
- assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
+ assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
- DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
return MaxVectorSize;
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
// choosing a higher viable VF as done in the loop below.
- DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
MaxVectorSize = ConstTripCount;
return MaxVectorSize;
}
unsigned MaxVF = MaxVectorSize;
- if (MaximizeBandwidth && !OptForSize) {
+ if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
+ (MaximizeBandwidth && !OptForSize)) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
@@ -6369,24 +5008,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
break;
}
}
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
}
return MaxVF;
}
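// Editor's worked example (not part of the patch): with WidestRegister = 256
// bits and WidestType = 32 bits, MaxVectorSize = 256 / 32 = 8. A constant
// trip count of 4 (a power of two below 8) clamps the result to 4. When the
// target asks to maximize vector bandwidth, a SmallestType of 8 bits admits
// candidate VFs up to 256 / 8 = 32, subject to the register-usage check
// above, and never below TTI.getMinimumVF(SmallestType).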
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
-#ifndef NDEBUG
const float ScalarCost = Cost;
-#endif /* NDEBUG */
unsigned Width = 1;
- DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- // Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && MaxVF > 1) {
- Width = 2;
- Cost = expectedCost(Width).first / (float)Width;
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize the cost to the maximum so that at least VF = 2 is chosen
+ // during cost evaluation.
+ Cost = std::numeric_limits<float>::max();
}
for (unsigned i = 2; i <= MaxVF; i *= 2) {
@@ -6395,10 +5040,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
// the vector elements.
VectorizationCostTy C = expectedCost(i);
float VectorCost = C.first / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
if (!C.second && !ForceVectorization) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "LV: Not considering vector loop of width " << i
<< " because it will not generate any vector instructions.\n");
continue;
@@ -6409,10 +5054,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
}
}
- DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
- << "LV: Vectorization seems to be not beneficial, "
- << "but was forced by a user.\n");
- DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ LLVM_DEBUG(
+ dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization does not seem to be beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
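
A minimal sketch of the per-lane cost comparison implemented above, with
expectedCost replaced by a caller-supplied stand-in; the names are
hypothetical:

    #include <cstdio>
    #include <functional>
    #include <limits>

    static unsigned selectVF(unsigned MaxVF, bool ForceVectorization,
                             const std::function<float(unsigned)> &Cost) {
      float Best = Cost(1); // Scalar cost is the baseline.
      unsigned Width = 1;
      if (ForceVectorization && MaxVF > 1)
        Best = std::numeric_limits<float>::max(); // Some VF >= 2 must win.
      for (unsigned I = 2; I <= MaxVF; I *= 2) {
        float PerLane = Cost(I) / (float)I; // Compare cost per scalar lane.
        if (PerLane < Best) {
          Best = PerLane;
          Width = I;
        }
      }
      return Width;
    }

    int main() {
      // Vector cost grows slower than the lane count, so VF = 4 wins.
      auto Model = [](unsigned VF) { return VF == 1 ? 8.0f : 4.0f + VF; };
      std::printf("VF = %u\n", selectVF(4, false, Model));
    }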
@@ -6460,7 +5114,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// optimization to non-pointer types.
//
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6504,8 +5158,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
@@ -6519,7 +5173,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
- R.NumInstructions = std::max(R.NumInstructions, 1U);
// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
@@ -6564,7 +5217,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF > 1 && !Legal->getReductionVars()->empty()) {
- DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -6575,7 +5228,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
- DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
@@ -6603,11 +5256,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
if (EnableLoadStoreRuntimeInterleave &&
std::max(StoresIC, LoadsIC) > SmallIC) {
- DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
return std::max(StoresIC, LoadsIC);
}
- DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
@@ -6615,11 +5269,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
- DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
- DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
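
The formula comment above is truncated by the hunk; as a rough sketch,
assuming the elided continuation divides the remaining registers by the peak
number of live values and rounds down to a power of two (hypothetical
helpers, not the LLVM code):

    #include <cstdio>

    static unsigned powerOf2Floor(unsigned X) {
      unsigned P = 1;
      while (P * 2 <= X)
        P *= 2;
      return P;
    }

    static unsigned selectIC(unsigned TargetNumRegisters,
                             unsigned LoopInvariantRegs,
                             unsigned MaxLocalUsers) {
      if (MaxLocalUsers == 0)
        MaxLocalUsers = 1; // We divide by this; assume at least one user.
      unsigned Avail = TargetNumRegisters > LoopInvariantRegs
                           ? TargetNumRegisters - LoopInvariantRegs
                           : 1;
      return powerOf2Floor(Avail / MaxLocalUsers);
    }

    int main() {
      // 16 registers, 2 loop invariants, 3 values live at the peak: IC = 4.
      std::printf("IC = %u\n", selectIC(16, 2, 3));
    }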
@@ -6646,7 +5300,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
DFS.perform(LI);
RegisterUsage RU;
- RU.NumInstructions = 0;
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
@@ -6658,14 +5311,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
- SmallSet<Instruction *, 8> Ends;
+ SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;
unsigned Index = 0;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- RU.NumInstructions += BB->size();
for (Instruction &I : *BB) {
IdxToInstr[Index++] = &I;
@@ -6698,7 +5350,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (auto &Interval : EndPoint)
TransposeEnds[Interval.second].push_back(Interval.first);
- SmallSet<Instruction *, 8> OpenIntervals;
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
@@ -6711,7 +5363,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
- DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
@@ -6756,8 +5408,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
- DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
- << OpenIntervals.size() << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
@@ -6772,10 +5424,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
Invariant += GetRegUsage(Inst->getType(), VFs[i]);
}
- DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
- DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
+ << '\n');
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
@@ -6785,6 +5437,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
+ // TODO: The cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially high
+ // value, large enough to practically disable vectorization with such
+ // operations, except where the previously deployed legality hack allowed
+ // using very low cost values. This avoids regressions that would come
+ // simply from moving the "masked load/store" check from legality to the
+ // cost model.
+ // Masked load/gather emulation was previously never allowed; only a
+ // limited number of masked store/scatter emulations was allowed.
+ assert(isScalarWithPredication(I) &&
+ "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
@@ -6805,11 +5473,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
if (!Legal->blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
- if (Legal->isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
- if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ // Do not apply the discount logic if the hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);
}
@@ -6844,7 +5514,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6898,7 +5568,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6940,11 +5610,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy BlockCost;
// For each instruction in the old loop.
- for (Instruction &I : *BB) {
- // Skip dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
if (ValuesToIgnore.count(&I) ||
(VF > 1 && VecValuesToIgnore.count(&I)))
@@ -6958,8 +5624,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
BlockCost.first += C.first;
BlockCost.second |= C.second;
- DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
- << VF << " For instruction: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6978,7 +5645,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
-/// \brief Gets Address Access SCEV after verifying that the access pattern
+/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
@@ -7020,7 +5687,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned Alignment = getMemInstAlignment(I);
unsigned AS = getMemInstAddressSpace(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
// Figure out whether the access is strided and get the stride value
@@ -7041,9 +5708,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I)) {
Cost /= getReciprocalPredBlockProb();
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially set the cost to a value high enough to practically
+ // disable vectorization with such operations.
+ Cost = 3000000;
+ }
+
return Cost;
}
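
A sketch of the predication scaling above, assuming (as the documentation of
getReciprocalPredBlockProb states) a 50% execution probability for the
predicated block, i.e. a reciprocal of 2; the 3000000 constant is the
artificial cost installed by useEmulatedMaskMemRefHack:

    #include <cstdio>

    static unsigned predicatedMemOpCost(unsigned ScalarizedCost,
                                        bool ScalarWithPredication,
                                        bool EmulatedMaskMemRefHack) {
      unsigned Cost = ScalarizedCost;
      if (ScalarWithPredication) {
        Cost /= 2; // Scale by the block's execution probability.
        if (EmulatedMaskMemRefHack)
          Cost = 3000000; // Practically disables vectorization.
      }
      return Cost;
    }

    int main() {
      std::printf("%u\n", predicatedMemOpCost(10, true, false)); // 5
      std::printf("%u\n", predicatedMemOpCost(10, true, true));  // 3000000
    }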
@@ -7052,7 +5725,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getMemInstAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
@@ -7088,7 +5761,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -7101,7 +5774,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned AS = getMemInstAddressSpace(I);
- auto Group = Legal->getInterleavedAccessGroup(I);
+ auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
@@ -7168,13 +5841,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
return;
+ NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
continue;
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
// Scalar load + broadcast
unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -7183,9 +5859,10 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// We assume that widening is the best solution when possible.
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7197,8 +5874,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Choose between Interleaving, Gather/Scatter or Scalarization.
unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;
- if (Legal->isAccessInterleaved(&I)) {
- auto Group = Legal->getInterleavedAccessGroup(&I);
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
assert(Group && "Fail to get an interleaved access group.");
// Make one decision for the whole group.
@@ -7210,7 +5887,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
unsigned GatherScatterCost =
- Legal->isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<unsigned>::max();
@@ -7235,7 +5912,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// If the instructions belongs to an interleave group, the whole group
// receives the same decision. The whole group receives the cost, but
// the cost will actually be assigned to one instruction.
- if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ if (auto Group = getInterleavedAccessGroup(&I))
setWideningDecision(Group, VF, Decision, Cost);
else
setWideningDecision(&I, VF, Decision, Cost);
@@ -7255,7 +5932,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
Instruction *PtrDef =
- dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (PtrDef && TheLoop->contains(PtrDef) &&
getWideningDecision(&I, VF) != CM_GatherScatter)
AddrDefs.insert(PtrDef);
@@ -7285,7 +5962,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Scalarize a widened load of address.
setWideningDecision(I, VF, CM_Scalarize,
(VF * getMemoryInstructionCost(I, 1)));
- else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ else if (auto Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
if (Instruction *Member = Group->getMember(I))
@@ -7371,7 +6048,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ if (VF > 1 && isScalarWithPredication(I)) {
unsigned Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7569,7 +6246,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
- if (auto *Ptr = getPointerOperand(Inst))
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
return Legal->isConsecutivePtr(Ptr);
return false;
}
@@ -7594,23 +6271,59 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
+ unsigned UserVF) {
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
+
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
+ // This won't be necessary when UserVF is not required in the VPlan-native
+ // path.
+ if (VPlanBuildStressTest && !UserVF)
+ UserVF = 4;
+
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(UserVF && "Expected UserVF for outer loop vectorization.");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ buildVPlans(UserVF, UserVF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return NoVectorization;
+
+ return {UserVF, 0};
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return NoVectorization;
+}
+
+VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- // Width 1 means no vectorize, cost 0 means uncomputed cost.
- const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
- 0U};
+ assert(OrigLoop->empty() && "Inner loop expected.");
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
return NoVectorization;
if (UserVF) {
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
- buildVPlans(UserVF, UserVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
@@ -7627,8 +6340,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
CM.collectInstsToScalarize(VF);
}
- buildVPlans(1, MaxVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(1, MaxVF);
+ LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
@@ -7637,7 +6350,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
- DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
BestVF = VF;
BestUF = UF;
@@ -7787,30 +6501,15 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
-
- // Collect conditions feeding internal conditional branches; they need to be
- // represented in VPlan for it to model masking.
- SmallPtrSet<Value *, 1> NeedDef;
-
- auto *Latch = OrigLoop->getLoopLatch();
- for (BasicBlock *BB : OrigLoop->blocks()) {
- if (BB == Latch)
- continue;
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (Branch && Branch->isConditional())
- NeedDef.insert(Branch->getCondition());
- }
-
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(buildVPlan(SubRange, NeedDef));
+ VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
}
-VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
- BasicBlock *Dst,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -7840,8 +6539,7 @@ VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
return EdgeMaskCache[Edge] = EdgeMask;
}
-VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
// Look for cached value.
@@ -7874,10 +6572,9 @@ VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
return BlockMaskCache[BB] = BlockMask;
}
-VPInterleaveRecipe *
-LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
- VFRange &Range) {
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
return nullptr;
@@ -7889,7 +6586,7 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
LoopVectorizationCostModel::CM_Interleave);
};
};
- if (!getDecisionAndClampRange(isIGMember(I), Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
return nullptr;
// I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
@@ -7902,8 +6599,8 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
}
VPWidenMemoryInstructionRecipe *
-LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
return nullptr;
@@ -7922,7 +6619,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
return Decision != LoopVectorizationCostModel::CM_Scalarize;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
VPValue *Mask = nullptr;
@@ -7933,8 +6630,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
}
VPWidenIntOrFpInductionRecipe *
-LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
- VFRange &Range) {
+VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
if (PHINode *Phi = dyn_cast<PHINode>(I)) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
@@ -7959,15 +6655,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
[=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};
- if (isa<TruncInst>(I) &&
- getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
cast<TruncInst>(I));
return nullptr;
}
-VPBlendRecipe *
-LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi || Phi->getParent() == OrigLoop->getHeader())
return nullptr;
@@ -7991,9 +6686,9 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
return new VPBlendRecipe(Phi, Masks);
}
-bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
- VFRange &Range) {
- if (Legal->isScalarWithPredication(I))
+bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+ if (CM.isScalarWithPredication(I))
return false;
auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -8077,7 +6772,7 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return false;
// Success: widen this instruction. We optimize the common case where
@@ -8092,15 +6787,15 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
}
-VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+VPBasicBlock *VPRecipeBuilder::handleReplication(
Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
- bool IsUniform = getDecisionAndClampRange(
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = Legal->isScalarWithPredication(I);
+ bool IsPredicated = CM.isScalarWithPredication(I);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -8113,24 +6808,25 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
- DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
VPBB->appendRecipe(Recipe);
return VPBB;
}
- DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
assert(VPBB->getSuccessors().empty() &&
"VPBB has successors when handling predicated replication.");
// Record predicated instructions for above packing optimizations.
PredInst2Recipe[I] = Recipe;
- VPBlockBase *Region =
- VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
- return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
}
-VPRegionBlock *
-LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
@@ -8150,19 +6846,67 @@ LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
- Entry->setTwoSuccessors(Pred, Exit);
- Pred->setOneSuccessor(Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
return Region;
}
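
A toy model (plain structs, not VPlan) of the if-then replicate region wired
up above: the entry branches on BlockInMask either into the predicated
recipe or straight to the exit, and the predicated block rejoins the exit:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct ToyBlock {
      std::string Name;
      std::vector<ToyBlock *> Succs;
    };

    int main() {
      ToyBlock Entry{"pred.entry", {}};
      ToyBlock Pred{"pred.if", {}};
      ToyBlock Exit{"pred.continue", {}};
      Entry.Succs = {&Pred, &Exit}; // Two successors, guarded by the mask.
      Pred.Succs = {&Exit};         // The then-block falls through to exit.
      for (ToyBlock *B : {&Entry, &Pred, &Exit})
        for (ToyBlock *S : B->Succs)
          std::printf("%s -> %s\n", B->Name.c_str(), S->Name.c_str());
    }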
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef) {
- EdgeMaskCache.clear();
- BlockMaskCache.clear();
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan, VPBasicBlock *VPBB) {
+ VPRecipeBase *Recipe = nullptr;
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ return true;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ return true;
+
+ return false;
+}
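
tryToCreateRecipe above encodes a fixed first-match order; a toy dispatch
loop showing the same pattern, where the catch-all plays the role of
tryToWiden and replication remains the caller's fallback:

    #include <cstdio>
    #include <functional>
    #include <vector>

    int main() {
      using Try = std::function<bool(int)>;
      // Order matters: specific recipes are tried before the general one.
      std::vector<std::pair<const char *, Try>> Builders = {
          {"interleave", [](int I) { return I == 0; }},
          {"widen-memory", [](int I) { return I == 1; }},
          {"widen", [](int I) { return I >= 0; }}, // General catch-all.
      };
      for (int Instr : {0, 1, 2})
        for (const auto &B : Builders)
          if (B.second(Instr)) {
            std::printf("instr %d -> %s recipe\n", Instr, B.first);
            break;
          }
    }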
+
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
+ unsigned MaxVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
+
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -8173,15 +6917,31 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VF = SubRange.End;
+ }
+}
+
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
auto Plan = llvm::make_unique<VPlan>(VPBB);
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
Plan->addVPValue(V);
@@ -8196,7 +6956,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
@@ -8204,18 +6964,17 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Organize the ingredients to vectorize from current basic block in the
// right order.
- for (Instruction &I : *BB) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
- DeadInstructions.count(Instr))
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
// I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
// member of the IG, do not construct any Recipe for it.
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
if (IG && Instr != IG->getInsertPos() &&
Range.Start >= 2 && // Query is illegal for VF == 1
CM.getWideningDecision(Instr, Range.Start) ==
@@ -8230,8 +6989,9 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// should follow.
auto SAIt = SinkAfter.find(Instr);
if (SAIt != SinkAfter.end()) {
- DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
- << " to vectorize a 1st order recurrence.\n");
+ LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
+ << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
SinkAfterInverse[SAIt->second] = Instr;
continue;
}
@@ -8247,45 +7007,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Introduce each ingredient into VPlan.
for (Instruction *Instr : Ingredients) {
- VPRecipeBase *Recipe = nullptr;
-
- // Check if Instr should belong to an interleave memory recipe, or already
- // does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr is a memory operation that should be widened.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr should form some PHI recipe.
- if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if ((Recipe = tryToBlend(Instr, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
- VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
- continue;
- }
-
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes that deal with
- // Interleave Groups, Inductions and Phi nodes.
- if (tryToWiden(Instr, VPBB, Range))
+ if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
continue;
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB =
- handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
if (NextVPBB != VPBB) {
VPBB = NextVPBB;
VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
@@ -8300,7 +7028,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
assert(PreEntry->empty() && "Expecting empty pre-entry block.");
VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- PreEntry->disconnectSuccessor(Entry);
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;
std::string PlanName;
@@ -8319,6 +7047,30 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
return Plan;
}
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ assert(!OrigLoop->empty() && "Outer loop expected.");
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = llvm::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
+ HCFGBuilder.buildHierarchicalCFG(*Plan.get());
+
+ return Plan;
+}
+
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
+    Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
@@ -8483,28 +7235,66 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which makes it possible to
+// apply VPlan-to-VPlan transformations from the very beginning without
+// modifying the input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LVP.planInVPlanNativePath(OptForSize, UserVF);
+
+ // Return false: we do not currently generate vector code in the
+ // VPlan-native path.
+ return false;
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */
- DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
- DEBUG(dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?"))
- << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
// Function containing loop
Function *F = L->getHeader()->getParent();
@@ -8518,7 +7308,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// benefit from vectorization, respectively.
if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
- DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
@@ -8526,10 +7316,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
- &Requirements, &Hints);
- if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+ &Requirements, &Hints, DB, AC);
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
@@ -8539,11 +7329,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
- unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- bool HasExpectedTC = (ExpectedTC > 0);
-
+ // Prefer a constant trip count over profile data, and profile data over
+ // the upper-bound estimate.
+ unsigned ExpectedTC = 0;
+ bool HasExpectedTC = false;
+ if (const SCEVConstant *ConstExits =
+ dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+ const APInt &ExitsCount = ConstExits->getAPInt();
+ // We are interested in small values for ExpectedTC. Skip over those that
+ // can't fit an unsigned.
+ if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+ ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+ HasExpectedTC = true;
+ }
+ }
+ // ExpectedTC may be large because it's bound by a variable. Check the
+ // profiling information to validate that we should vectorize.
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
auto EstimatedTC = getLoopEstimatedTripCount(L);
if (EstimatedTC) {
@@ -8551,15 +7363,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
HasExpectedTC = true;
}
}
+ if (!HasExpectedTC) {
+ ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ HasExpectedTC = (ExpectedTC > 0);
+ }
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
- DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is worth vectorizing only if no scalar "
- << "iteration overheads are incurred.");
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// Loops with a very small trip count are considered for vectorization
// under OptForSize, thereby making sure the cost of their loop body is
// dominant, free of runtime guards and scalar iteration overheads.
@@ -8572,10 +7388,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NoImplicitFloat", L)
<< "loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints, ORE);
return false;
@@ -8587,17 +7403,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
TTI->isFPVectorizationPotentiallyUnsafe()) {
- DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
ORE->emit(
- createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
+ createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
<< "loop not vectorized due to unsafe FP support.");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved)
+ IAI.analyzeInterleaving();
+
// Use the cost model.
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
+ &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
@@ -8607,8 +7436,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor VF =
- LVP.plan(OptForSize, UserVF);
+ VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -8620,14 +7448,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
- DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
"VectorizationNotBeneficial",
"the cost-model indicates that vectorization is not beneficial");
@@ -8636,7 +7464,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (IC == 1 && UserIC <= 1) {
// Tell the user interleaving is not beneficial.
- DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
IntDiagMsg = std::make_pair(
"InterleavingNotBeneficial",
"the cost-model indicates that interleaving is not beneficial");
@@ -8648,8 +7476,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
} else if (IC > 1 && UserIC == 1) {
// Tell the user interleaving is beneficial, but it is explicitly disabled.
- DEBUG(dbgs()
- << "LV: Interleaving is beneficial but is explicitly disabled.");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
IntDiagMsg = std::make_pair(
"InterleavingBeneficialButDisabled",
"the cost-model indicates that interleaving is beneficial "
@@ -8676,24 +7504,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second;
});
} else if (VectorizeLoop && !InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second;
});
} else if (VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
LVP.setBestPlan(VF.Width, IC);
@@ -8740,7 +7568,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
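
A sketch of the new trip-count preference order introduced above (a constant
backedge-taken count plus one, then the profile-based estimate, then the
SCEV small-constant-max bound); the parameters stand in for the SCEV and
profile queries:

    #include <cstdio>
    #include <optional>

    static std::optional<unsigned>
    expectedTripCount(std::optional<unsigned> ConstBackedgeTaken,
                      std::optional<unsigned> ProfileEstimate,
                      unsigned SmallConstantMax) {
      if (ConstBackedgeTaken)
        return *ConstBackedgeTaken + 1; // Trip count = backedges + 1.
      if (ProfileEstimate)
        return *ProfileEstimate; // Fall back to profiling information.
      if (SmallConstantMax > 0)
        return SmallConstantMax; // Last resort: the upper-bound estimate.
      return std::nullopt;
    }

    int main() {
      std::printf("%u\n", *expectedTripCount(7, 100, 0));            // 8
      std::printf("%u\n", *expectedTripCount(std::nullopt, 100, 0)); // 100
    }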
@@ -8788,7 +7616,7 @@ bool LoopVectorizePass::runImpl(
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
- addAcyclicInnerLoop(*L, Worklist);
+ collectSupportedLoops(*L, LI, ORE, Worklist);
LoopsAnalyzed += Worklist.size();
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7ccd3faec44..ac8c4f046c6f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160;
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
-/// \brief Predicate for the element types that the SLP vectorizer supports.
+/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
@@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
@@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) {
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
- if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
return None;
if (CommonShuffleMode == Permute)
continue;
@@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) {
CommonShuffleMode = Permute;
continue;
}
- // Check the shuffle mode for the current operation.
- if (!Vec1)
- Vec1 = Vec;
- else if (Vec != Vec1)
- Vec2 = Vec;
- // Example: shufflevector A, B, <0,5,2,7>
- // I is odd and IntIdx for A == I - FirstAlternate shuffle.
- // I is even and IntIdx for B == I - FirstAlternate shuffle.
- // Example: shufflevector A, B, <4,1,6,3>
- // I is even and IntIdx for A == I - SecondAlternate shuffle.
- // I is odd and IntIdx for B == I - SecondAlternate shuffle.
- const bool IIsEven = I & 1;
- const bool CurrVecIsA = Vec == Vec1;
- const bool IIsOdd = !IIsEven;
- const bool CurrVecIsB = !CurrVecIsA;
- ShuffleMode CurrentShuffleMode =
- ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
- : SecondAlternate;
- // Common mode is not set or the same as the shuffle mode of the current
- // operation - alternate.
- if (CommonShuffleMode == Unknown)
- CommonShuffleMode = CurrentShuffleMode;
- // Common shuffle mode is not the same as the shuffle mode of the current
- // operation - permutation.
- if (CommonShuffleMode != CurrentShuffleMode)
- CommonShuffleMode = Permute;
+ CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
- if ((CommonShuffleMode == FirstAlternate ||
- CommonShuffleMode == SecondAlternate) &&
- Vec2)
- return TargetTransformInfo::SK_Alternate;
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
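
A standalone sketch of the classification above, assuming the elided check
marks any extract whose constant lane differs from its position as a
permute; Src and Idx model each extractelement's source vector and lane:

    #include <cstdio>
    #include <vector>

    enum class Kind { SelectTwoSrc, PermuteTwoSrc, PermuteSingleSrc };

    struct Extract {
      unsigned Src; // 0 or 1: which of the two source vectors.
      unsigned Idx; // Lane within that source.
    };

    static Kind classifyShuffle(const std::vector<Extract> &VL) {
      bool UsesSecondSrc = false, CrossesLanes = false;
      for (unsigned I = 0; I < VL.size(); ++I) {
        UsesSecondSrc |= VL[I].Src == 1;
        CrossesLanes |= VL[I].Idx != I; // Not reading its own lane.
      }
      if (CrossesLanes)
        return UsesSecondSrc ? Kind::PermuteTwoSrc : Kind::PermuteSingleSrc;
      return UsesSecondSrc ? Kind::SelectTwoSrc : Kind::PermuteSingleSrc;
    }

    int main() {
      // Mask <0,5,2,7> over 4-wide sources A and B: lanes 1 and 3 come
      // from B but stay in their own lane, so this is a select (blend).
      std::vector<Extract> VL = {{0, 0}, {1, 1}, {0, 2}, {1, 3}};
      std::printf("%d\n", classifyShuffle(VL) == Kind::SelectTwoSrc);
    }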
-///\returns Opcode that can be clubbed with \p Op to create an alternate
-/// sequence which can later be merged as a ShuffleVector instruction.
-static unsigned getAltOpcode(unsigned Op) {
- switch (Op) {
- case Instruction::FAdd:
- return Instruction::FSub;
- case Instruction::FSub:
- return Instruction::FAdd;
- case Instruction::Add:
- return Instruction::Sub;
- case Instruction::Sub:
- return Instruction::Add;
- default:
- return 0;
- }
-}
-
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
-static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
- unsigned CheckedOpcode) {
- return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
-}
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(Value *OpValue, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (!I)
- return OpValue;
- auto *OpInst = cast<Instruction>(OpValue);
- unsigned OpInstOpcode = OpInst->getOpcode();
- unsigned IOpcode = I->getOpcode();
- if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
- return Op;
- return OpValue;
-}
-
-namespace {
-
-/// Contains data for the instructions going to be vectorized.
-struct RawInstructionsData {
- /// Main Opcode of the instructions going to be vectorized.
- unsigned Opcode = 0;
-
- /// The list of instructions have some instructions with alternate opcodes.
- bool HasAltOpcodes = false;
-};
-
-} // end anonymous namespace
-
-/// Checks the list of the vectorized instructions \p VL and returns info about
-/// this list.
-static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
- auto *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return {};
- RawInstructionsData Res;
- unsigned Opcode = I0->getOpcode();
- // Walk through the list of the vectorized instructions
- // in order to check its structure described by RawInstructionsData.
- for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
- auto *I = dyn_cast<Instruction>(VL[Cnt]);
- if (!I)
- return {};
- if (Opcode != I->getOpcode())
- Res.HasAltOpcodes = true;
- }
- Res.Opcode = Opcode;
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -402,42 +306,90 @@ struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
- /// The main opcode for the list of instructions.
- unsigned Opcode = 0;
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
/// Some of the instructions in the list have alternate opcodes.
- bool IsAltShuffle = false;
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
- InstructionsState() = default;
- InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
- : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
- auto Res = getMainOpcode(VL);
- unsigned Opcode = Res.Opcode;
- if (!Res.HasAltOpcodes)
- return InstructionsState(VL[0], Opcode, false);
- auto *OpInst = cast<Instruction>(VL[0]);
- unsigned AltOpcode = getAltOpcode(Opcode);
- // Examine each element in the list instructions VL to determine
- // if some operations there could be considered as an alternative
- // (for example as subtraction relates to addition operation).
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- auto *I = cast<Instruction>(VL[Cnt]);
- unsigned InstOpcode = I->getOpcode();
- if ((Res.HasAltOpcodes &&
- InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
- (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
- return InstructionsState(OpInst, 0, false);
- }
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
- return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
}
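
Reading aid (a simplified sketch, not the pass itself): the scan above in miniature. The first differing opcode becomes the alternate; any further novel opcode invalidates the bundle. The real code additionally restricts the pair to two BinaryOperators, or two CastInsts with matching source types; this model drops those checks, and the opcode numbers are hypothetical.

// Standalone model of the alternate-opcode scan.
#include <cstdio>
#include <vector>

struct Result { unsigned Main, Alt; bool Valid; };

static Result classify(const std::vector<unsigned> &Opcodes) {
  unsigned Main = Opcodes[0], Alt = Main;
  for (unsigned Op : Opcodes) {
    if (Op == Main || Op == Alt)
      continue;
    if (Main == Alt) { // no alternate adopted yet
      Alt = Op;
      continue;
    }
    return {0, 0, false}; // a third opcode: not a valid alt-shuffle bundle
  }
  return {Main, Alt, true};
}

int main() {
  std::printf("%d\n", classify({13, 15, 13, 15}).Valid); // 1: add/sub pattern
  std::printf("%d\n", classify({13, 15, 17, 15}).Valid); // 0: three opcodes
}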
/// \returns true if all of the values in \p VL have the same type or false
@@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) {
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
- ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- return CI && CI->getZExtValue() == Idx;
- } else {
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
}
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -549,7 +506,7 @@ public:
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
- /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
@@ -585,8 +542,8 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumLoadsWantToKeepOrder = 0;
- NumLoadsWantToChangeOrder = 0;
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -596,12 +553,22 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
- /// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence(Function &F);
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
- /// \returns true if it is beneficial to reverse the vector order.
- bool shouldReorder() const {
- return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ return makeArrayRef(I->getFirst());
}
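
Reading aid, as a minimal sketch with std:: containers standing in for DenseMap: the bestOrder() policy picks the most frequent non-identity order, but only when its count beats the number of bundles that preferred the original order. Names here are made up.

#include <map>
#include <optional>
#include <vector>

using Order = std::vector<unsigned>;

std::optional<Order> bestOrderModel(const std::map<Order, unsigned> &Counts,
                                    unsigned NumKeepOriginal) {
  const Order *Best = nullptr;
  unsigned BestCount = 0;
  for (const auto &KV : Counts)
    if (KV.second > BestCount) {
      Best = &KV.first;
      BestCount = KV.second;
    }
  if (!Best || BestCount <= NumKeepOriginal)
    return std::nullopt; // reordering would not pay off
  return *Best;
}

int main() {
  std::map<Order, unsigned> Counts{{{1, 0, 3, 2}, 3}};
  return bestOrderModel(Counts, /*NumKeepOriginal=*/2).has_value() ? 0 : 1;
}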
/// \return The vector element size in bits to use when vectorizing the
@@ -625,7 +592,7 @@ public:
return MinVecRegSize;
}
- /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ /// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -648,9 +615,13 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
- /// \returns True if the ExtractElement/ExtractValue instructions in VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+ /// non-identity permutation that allows reusing the extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -658,22 +629,19 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
- /// \returns the pointer to the vectorized value if \p VL is already
- /// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty);
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \brief Set the Builder insert point to one after the last instruction in
+ /// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -684,7 +652,8 @@ private:
/// \reorder commutative operands in alt shuffle if they result in
/// vectorized code.
- void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
@@ -698,8 +667,12 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- assert(VL.size() == Scalars.size() && "Invalid size");
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
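
Reading aid for the two isSame() cases above, as a standalone sketch with ints standing in for Value*: either VL matches Scalars directly, or it matches through the reuse mask. The function name is made up.

#include <cassert>
#include <vector>

bool isSameModel(const std::vector<int> &VL, const std::vector<int> &Scalars,
                 const std::vector<unsigned> &ReuseMask) {
  if (VL.size() == Scalars.size())
    return VL == Scalars;
  if (VL.size() != ReuseMask.size())
    return false;
  for (size_t I = 0; I < VL.size(); ++I)
    if (VL[I] != Scalars[ReuseMask[I]]) // lane I reads Scalars[ReuseMask[I]]
      return false;
  return true;
}

int main() {
  // Bundle {a, b, a, b} deduplicated to Scalars {a, b} with mask {0,1,0,1}.
  assert(isSameModel({7, 9, 7, 9}, {7, 9}, {0, 1, 0, 1}));
}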
/// A vector of scalars.
@@ -711,6 +684,12 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather = false;
+ /// Does this sequence require some shuffling?
+ SmallVector<unsigned, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
+ ArrayRef<unsigned> ReorderIndices;
+
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -725,13 +704,17 @@ private:
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
+ Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -744,7 +727,6 @@ private:
if (UserTreeIdx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
UserTreeIdx = idx;
- return Last;
}
/// -- Vectorization State --
@@ -758,13 +740,6 @@ private:
return nullptr;
}
- const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return &VectorizableTree[I->second];
- return nullptr;
- }
-
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -1038,7 +1013,7 @@ private:
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
- DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -1061,8 +1036,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
@@ -1075,8 +1050,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
@@ -1101,7 +1076,8 @@ private:
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
}
});
}
@@ -1110,7 +1086,8 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1120,7 +1097,7 @@ private:
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, Value *OpValue);
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -1201,11 +1178,38 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder = 0;
+ using OrdersType = SmallVector<unsigned, 4>;
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
- // Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder = 0;
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+ /// reordering, if reordering is not required it is counted using \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
@@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
- /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
@@ -1340,17 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, Lane);
- continue;
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
@@ -1364,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
@@ -1375,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (is_contained(UserIgnoreList, UserInst))
continue;
- DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
@@ -1389,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
- DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
- DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1421,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is ephemeral.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1430,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
- if (E->Scalars[i] != VL[i]) {
- DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
@@ -1451,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!I)
continue;
if (getTreeEntry(I)) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is already in tree.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1462,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1476,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
- DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i + 1; j < e; ++j)
- if (VL[i] == VL[j]) {
- DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ if (UniqueValues.size() == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
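
Reading aid for the deduplication loop above, as a standalone sketch with ints standing in for Value*: each value's first position becomes its index, and the mask records which unique lane every original lane should read from. The names Mask/Unique/Pos are made up.

#include <cassert>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<int> VL = {5, 8, 5, 8};
  std::vector<unsigned> Mask;
  std::vector<int> Unique;
  std::unordered_map<int, unsigned> Pos;
  for (int V : VL) {
    auto Res = Pos.try_emplace(V, Unique.size()); // first position wins
    Mask.push_back(Res.first->second);
    if (Res.second)
      Unique.push_back(V);
  }
  assert((Unique == std::vector<int>{5, 8}));
  assert((Mask == std::vector<unsigned>{0, 1, 0, 1}));
}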
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -1496,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
- DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ if (!BS.tryScheduleBundle(VL, this, S)) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
- DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1518,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
- DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -1541,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, VL0);
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
- DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
- } else {
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert the new order with an initial count of 0 if it does not exist;
+ // otherwise, return an iterator to the existing one.
+ auto StoredCurrentOrderAndNum =
+ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++StoredCurrentOrderAndNum->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+ StoredCurrentOrderAndNum->getFirst());
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
@@ -1562,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- LoadInst *L = cast<LoadInst>(VL[i]);
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ *POIter = L->getPointerOperand();
+ ++POIter;
}
- // Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
- bool Consecutive = true;
- bool ReverseConsecutive = true;
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- Consecutive = false;
- break;
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
} else {
- ReverseConsecutive = false;
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
}
- }
-
- if (Consecutive) {
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- return;
- }
-
- // If none of the load pairs were consecutive when checked in order,
- // check the reverse order.
- if (ReverseConsecutive)
- for (unsigned i = VL.size() - 1; i > 0; --i)
- if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
- ReverseConsecutive = false;
- break;
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+ // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++I->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ return;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
- if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
- DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- }
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
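
Reading aid for the load case above, as a standalone sketch with raw addresses standing in for SCEV pointer expressions: the real code sorts via sortPtrAccesses() and then compares only the first and last pointers; this model checks every adjacent gap, which is the property being relied on. Names are made up.

#include <algorithm>
#include <cstdint>
#include <vector>

bool areConsecutive(std::vector<uint64_t> Addrs, uint64_t EltSize) {
  std::sort(Addrs.begin(), Addrs.end());
  for (size_t I = 1; I < Addrs.size(); ++I)
    if (Addrs[I] - Addrs[I - 1] != EltSize)
      return false; // a gap or a duplicate breaks the wide load
  return true;
}

int main() {
  // Four 4-byte loads presented out of order: {a+8, a+0, a+12, a+4}.
  return areConsecutive({108, 100, 112, 104}, 4) ? 0 : 1;
}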
case Instruction::ZExt:
@@ -1637,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1665,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1703,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1730,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1743,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1754,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
- DEBUG(
- dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1779,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
@@ -1802,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
@@ -1816,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
return;
}
// ctlz,cttz and powi are special intrinsics whose second argument
@@ -1827,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument "<< A1I<<"!=" << A1J
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << A1I << "!=" << A1J << "\n");
return;
}
}
@@ -1840,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
- << *VL[i] << '\n');
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1862,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcodes like add-sub
// then do not vectorize this instruction.
- if (!S.IsAltShuffle) {
+ if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
+ reorderAltShuffleOperands(S, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1892,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
@@ -1923,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
+ CurrentOrder.clear();
+
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -1951,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
return false;
// Check that all of the indices extract from the correct offset.
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *Inst = cast<Instruction>(VL[I]);
- if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
- return false;
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
- return false;
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
}
- return true;
+ return ShouldKeepOrder;
}
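
Reading aid for the index bookkeeping in canReuseExtract() above: E + 1 marks an unused slot; filling every slot exactly once yields either the identity (keep order) or a usable permutation. The extract indices in the example are made up.

#include <optional>
#include <vector>

std::optional<std::vector<unsigned>>
buildOrder(const std::vector<unsigned> &ExtIdx) {
  unsigned E = ExtIdx.size();
  std::vector<unsigned> Order(E, E + 1); // E + 1 == "slot not used yet"
  for (unsigned I = 0; I < E; ++I) {
    unsigned Idx = ExtIdx[I];
    if (Idx >= E || Order[Idx] != E + 1)
      return std::nullopt; // out-of-range or duplicate extract index
    Order[Idx] = I;
  }
  return Order; // identity => ShouldKeepOrder; otherwise a reorder mask
}

int main() {
  // Extracts appear as lanes {2, 0, 1}: a valid non-identity permutation.
  return buildOrder({2, 0, 1}).has_value() ? 0 : 1;
}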
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -1985,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ int ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
- if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
@@ -2008,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
IO->getZExtValue());
}
}
- return Cost;
+ return ReuseShuffleCost + Cost;
}
}
- return getGatherCost(E->Scalars);
+ return ReuseShuffleCost + getGatherCost(VL);
}
InstructionsState S = getSameOpcode(VL);
- assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(S.OpValue);
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement:
- if (canReuseExtract(VL, S.OpValue)) {
- int DeadCost = 0;
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ if (!E->NeedToGather) {
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, the instruction can be
// considered dead.
// Likewise, if it has only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E))
+ if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
- DeadCost +=
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
}
- return -DeadCost;
+ return DeadCost;
}
- return getGatherCost(VecTy);
+ return ReuseShuffleCost + getGatherCost(VL);
case Instruction::ZExt:
case Instruction::SExt:
@@ -2053,24 +2191,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
+ int ScalarEltCost =
+ TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy, VL0);
+ int ScalarCost = VL.size() * ScalarEltCost;
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+ int VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = ReuseShuffleCost +
+ TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
+ }
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
- return VecCost - ScalarCost;
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
@@ -2099,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
+ TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
break;
}
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
+ CInt0 = CInt;
continue;
}
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
+ if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
- // FIXME: Currently cost of model modification for division by power of
- // 2 is handled for X86 and AArch64. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarCost =
- VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
- Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP, Operands);
- return VecCost - ScalarCost;
+ int ScalarEltCost = TTI->getArithmeticInstrCost(
+ S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
@@ -2142,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarCost =
- VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
- return VecCost - ScalarCost;
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
- int ScalarLdCost = VecTy->getNumElements() *
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0, VL0);
- return VecLdCost - ScalarLdCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
- unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
- int ScalarStCost = VecTy->getNumElements() *
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0, VL0);
- return VecStCost - ScalarStCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecStCost =
+ TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
+ SmallVector<Type *, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- int ScalarCallCost = VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
SmallVector<Value *, 4> Args(CI->arg_operands());
int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
- DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
- return VecCallCost - ScalarCallCost;
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
int ScalarCost = 0;
- int VecCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ }
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
- if (!I)
- break;
- ScalarCost +=
- TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
}
// VecCost is equal to the sum of the cost of creating 2 vectors
// and the cost of creating the shuffle.
- Instruction *I0 = cast<Instruction>(VL[0]);
- VecCost =
- TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
- Instruction *I1 = cast<Instruction>(VL[1]);
- VecCost +=
- TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
- return VecCost - ScalarCost;
+ int VecCost = 0;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+ } else {
+ Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
+ Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+ VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+ VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
@@ -2226,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
- DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
- VectorizableTree.size() << " is fully vectorizable .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+ << VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
@@ -2297,13 +2485,13 @@ int BoUpSLP::getSpillCost() {
LiveValues.insert(cast<Instruction>(&*J));
}
- DEBUG(
+ LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
- );
+ });
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -2315,7 +2503,10 @@ int BoUpSLP::getSpillCost() {
continue;
}
- if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ // Debug information doesn't impact the spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -2333,19 +2524,41 @@ int BoUpSLP::getSpillCost() {
int BoUpSLP::getTreeCost() {
int Cost = 0;
- DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
- VectorizableTree.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
- for (TreeEntry &TE : VectorizableTree) {
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = VectorizableTree[I];
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(), [TE](TreeEntry &Entry) {
+ return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+ }))
+ continue;
+
int C = getEntryCost(&TE);
- DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << *TE.Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
+ << ".\n");
Cost += C;
}
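
Reading aid, as a toy model of the duplicate-gather skip above: only the last of several identical gather bundles is costed, since CSE will remove the other copies. Entry, treeCost, and the per-entry cost are made-up stand-ins for TreeEntry and getEntryCost().

#include <vector>

struct Entry { bool NeedToGather; std::vector<int> Scalars; };

int treeCost(const std::vector<Entry> &Tree) {
  int Cost = 0;
  for (size_t I = 0; I < Tree.size(); ++I) {
    const Entry &TE = Tree[I];
    bool DuplicatedLater = false;
    for (size_t J = I + 1; J < Tree.size(); ++J)
      if (TE.NeedToGather && Tree[J].NeedToGather &&
          Tree[J].Scalars == TE.Scalars)
        DuplicatedLater = true;
    if (DuplicatedLater)
      continue; // only the surviving copy is costed
    Cost += static_cast<int>(TE.Scalars.size()); // stand-in for getEntryCost()
  }
  return Cost;
}

int main() {
  // Two identical gathers {1,2} plus one vectorized bundle {3,4}: cost 4.
  return treeCost({{true, {1, 2}}, {false, {3, 4}}, {true, {1, 2}}}) == 4 ? 0
                                                                          : 1;
}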
- SmallSet<Value *, 16> ExtractCostCalculated;
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -2386,7 +2599,7 @@ int BoUpSLP::getTreeCost() {
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
- DEBUG(dbgs() << Str);
+ LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -2394,10 +2607,14 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty) {
+int BoUpSLP::getGatherCost(Type *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.count(i))
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
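
The new overload effectively charges one insert per non-repeated lane plus a single permute when any lane repeats; a minimal model, with flat per-operation costs standing in for the TTI queries:

    #include <set>

    int gatherCost(unsigned NumElts, const std::set<unsigned> &ShuffledIdxs,
                   int InsertCost, int PermuteCost) {
      int Cost = 0;
      for (unsigned I = 0; I < NumElts; ++I)
        if (!ShuffledIdxs.count(I))
          Cost += InsertCost; // one insertelement per unique lane
      if (!ShuffledIdxs.empty())
        Cost += PermuteCost; // one SK_PermuteSingleSrc covers repeated lanes
      return Cost;
    }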
@@ -2408,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
- return getGatherCost(VecTy);
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+ // Iterate in reverse order so that, among duplicate scalars, the insert
+ // at the highest (typically most expensive) lane is the one that is kept.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
}
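
The reverse scan marks every lane whose value reappears at a higher lane; a toy version using ints in place of scalars:

    #include <unordered_set>
    #include <vector>

    std::vector<unsigned> shuffledLanes(const std::vector<int> &VL) {
      std::unordered_set<int> Unique;
      std::vector<unsigned> Shuffled;
      for (unsigned I = VL.size(); I > 0; --I) {
        unsigned Idx = I - 1;
        if (!Unique.insert(VL[Idx]).second) // value already seen higher up
          Shuffled.push_back(Idx);
      }
      return Shuffled;
    }
    // shuffledLanes({7, 9, 7, 9}) == {1, 0}: lanes 2 and 3 keep their
    // inserts, lanes 0 and 1 become shuffle candidates.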
// Reorder commutative operations in alternate shuffle if the resulting vectors
@@ -2420,16 +2647,14 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the second pair as "load b[1], load a[1]" would allow us to
// vectorize this code.
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- unsigned AltOpcode = getAltOpcode(Opcode);
- (void)AltOpcode;
for (Value *V : VL) {
auto *I = cast<Instruction>(V);
- assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
- "Incorrect instruction in vector");
+ assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
@@ -2609,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
- for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
@@ -2630,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(OpValue);
+ auto *Front = cast<Instruction>(S.OpValue);
auto *BB = Front->getParent();
- const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
- const unsigned AltOpcode = getAltOpcode(Opcode);
assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
- return !sameOpcodeOrAlt(Opcode, AltOpcode,
- cast<Instruction>(V)->getOpcode()) ||
- cast<Instruction>(V)->getParent() == BB;
+ auto *I = cast<Instruction>(V);
+ return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2652,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
+ BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -2680,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
+ if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
@@ -2706,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
- for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
@@ -2714,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
+ if (!E->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(E->ReuseShuffleIndices.begin(),
+ llvm::find(E->ReuseShuffleIndices, FoundLane));
+ }
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
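
When the bundle was emitted through a reuse shuffle, the external user's lane must be looked up in the shuffle mask; a sketch of that remapping:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    unsigned remapLane(unsigned FoundLane,
                       const std::vector<unsigned> &ReuseMask) {
      if (ReuseMask.empty())
        return FoundLane; // no reuse shuffle, lane is already correct
      // The user reads the first mask position that selects FoundLane.
      return std::distance(ReuseMask.begin(),
                           std::find(ReuseMask.begin(), ReuseMask.end(),
                                     FoundLane));
    }
    // remapLane(1, {0, 1, 0, 1}) == 1; remapLane(0, {1, 0}) == 1.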
@@ -2722,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
- if (const TreeEntry *En = getTreeEntry(OpValue)) {
- if (En->isSame(VL) && En->VectorizedValue)
- return En->VectorizedValue;
- }
- return nullptr;
-}
-
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
- if (S.Opcode) {
+ if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL))
- return vectorizeTree(E);
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value but without shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<unsigned, 4> UniqueIdxs;
+ SmallSet<unsigned, 4> UsedIdxs;
+ for (unsigned Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ UniqueIdxs);
+ }
+ }
+ return V;
+ }
}
}
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+ // Do not shuffle single element or if number of unique values is not power
+ // of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
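
A self-contained model of this uniquing step (C++17 for try_emplace; ints stand in for Value*, and the real code additionally re-appends duplicate constants):

    #include <unordered_map>
    #include <vector>

    struct Uniqued {
      std::vector<int> UniqueValues;
      std::vector<unsigned> ReuseIndices; // empty => vectorize VL as-is
    };

    Uniqued uniquify(const std::vector<int> &VL) {
      Uniqued R;
      std::unordered_map<int, unsigned> Pos;
      for (int V : VL) {
        auto Res = Pos.try_emplace(V, R.UniqueValues.size());
        R.ReuseIndices.push_back(Res.first->second);
        if (Res.second)
          R.UniqueValues.push_back(V);
      }
      unsigned N = R.UniqueValues.size();
      bool PowerOf2 = N > 1 && (N & (N - 1)) == 0;
      if (N == VL.size() || !PowerOf2)
        R.ReuseIndices.clear(); // single element or non-power-of-2: give up
      return R;
    }
    // uniquify({7, 9, 7, 9}) -> UniqueValues {7, 9}, ReuseIndices {0, 1, 0, 1}.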
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
- return Gather(VL, VecTy);
+ Value *V = Gather(VL, VecTy);
+ if (!ReuseShuffleIndicies.empty()) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ return V;
+}
+
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<unsigned> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
}
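
A worked example of the helper: if lane I of the reordered vector came from source position Indices[I], the inverse mask sends every lane back to program order.

    #include <vector>

    std::vector<unsigned> inverse(const std::vector<unsigned> &Indices) {
      std::vector<unsigned> Mask(Indices.size());
      for (unsigned I = 0; I < Indices.size(); ++I)
        Mask[Indices[I]] = I;
      return Mask;
    }
    // inverse({2, 0, 1}) == {1, 2, 0}: a shuffle with this mask undoes
    // the {2, 0, 1} reordering.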
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
- DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
InstructionsState S = getSameOpcode(E->Scalars);
- Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- E->VectorizedValue = NewPhi;
+ Value *V = NewPhi;
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -2804,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return NewPhi;
+ return V;
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
Value *V = VL0->getOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (!E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
- E->VectorizedValue = V;
- return propagateMetadata(V, E->Scalars);
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(
+ NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
@@ -2849,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *InVec = vectorizeTree(INVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2870,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (S.Opcode == Instruction::FCmp)
+ if (S.getOpcode() == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2898,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2932,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
RHSVL);
else
for (Value *V : E->Scalars) {
@@ -2941,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars, VL0);
+ bool IsReorder = !E->ReorderIndices.empty();
+ if (IsReorder) {
+ S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
+ VL0 = cast<Instruction>(S.OpValue);
+ }
+ setInsertPointAfterBundle(E->Scalars, S);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
@@ -2985,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
- E->VectorizedValue = LI;
+ Value *V = propagateMetadata(LI, E->Scalars);
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ Mask, "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ return V;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@@ -2998,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *VecValue = vectorizeTree(ScalarStoreValues);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
- StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -3014,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- S->setAlignment(Alignment);
- E->VectorizedValue = S;
+ ST->setAlignment(Alignment);
+ Value *V = propagateMetadata(ST, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(S, E->Scalars);
+ return V;
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
ValueList Op0VL;
for (Value *V : E->Scalars)
@@ -3041,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -3075,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *OpVec = vectorizeTree(OpVL);
- DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3093,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(Instruction::isBinaryOp(S.Opcode) &&
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
-
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ Value *LHS, *RHS;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(LHSVL);
+ RHS = vectorizeTree(RHSVL);
+ } else {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(INVL);
+ }
- // Create a vector of LHS op1 RHS
- Value *V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
- unsigned AltOpcode = getAltOpcode(S.Opcode);
- // Create a vector of LHS op2 RHS
- Value *V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
+ }
// Create shuffle to take alternate operations from the vector.
- // Also, gather up odd and even scalar ops to propagate IR flags to
+ // Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == S.getAltOpcode()) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
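
The mask picks each lane from V0 (index i) or from V1 (index e + i, the second shuffle operand) depending on which opcode the scalar lane used; a toy version with opcodes as plain unsigned values:

    #include <vector>

    std::vector<unsigned> altShuffleMask(const std::vector<unsigned> &LaneOps,
                                         unsigned AltOpcode) {
      unsigned E = LaneOps.size();
      std::vector<unsigned> Mask(E);
      for (unsigned I = 0; I < E; ++I)
        Mask[I] = (LaneOps[I] == AltOpcode) ? E + I : I;
      return Mask;
    }
    // For lanes <add, sub, add, sub> with AltOpcode == sub:
    //   altShuffleMask(...) == {0, 5, 2, 7}.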
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
return V;
}
@@ -3183,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
VectorizableTree[0].VectorizedValue = Trunc;
}
- DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
@@ -3260,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
- }
+ }
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
@@ -3269,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
User->replaceUsesOfWith(Scalar, Ex);
}
- DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
@@ -3290,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to replace users in the ignorelist by undef.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
@@ -3300,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
- DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
@@ -3310,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence(Function &F) {
- DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *it : GatherSeq) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
-
- if (!Insert)
+ for (Instruction *I : GatherSeq) {
+ if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
continue;
// Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(Insert->getParent());
+ Loop *L = LI->getLoopFor(I->getParent());
if (!L)
continue;
@@ -3333,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
- Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
- Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
- if (CurrVec && L->contains(CurrVec))
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
continue;
- if (NewElem && L->contains(NewElem))
+ if (Op1 && L->contains(Op1))
continue;
// We can hoist this instruction. Move it to the pre-header.
- Insert->moveBefore(PreHeader->getTerminator());
+ I->moveBefore(PreHeader->getTerminator());
}
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
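
The comparator only needs some total order consistent with proper domination. One way such an order can be realized is by dominator-tree preorder (DFS-in) numbers, assuming each node carries one; a toy stand-in:

    #include <algorithm>
    #include <vector>

    struct DomNode { unsigned DFSIn; }; // hypothetical stand-in for DomTreeNode

    void sortByDomination(std::vector<DomNode *> &Work) {
      // A proper dominator is entered earlier in the dominator-tree DFS,
      // so ascending DFS-in visits dominators before dominated blocks.
      std::stable_sort(Work.begin(), Work.end(),
                       [](const DomNode *A, const DomNode *B) {
                         return A->DFSIn < B->DFSIn;
                       });
    }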
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (auto BB : RPOT) {
- // Traverse CSEBlocks by RPOT order.
- if (!CSEBlocks.count(BB))
- continue;
-
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
@@ -3384,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP, Value *OpValue) {
- if (isa<PHINode>(OpValue))
+ BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
return true;
// Initialize the instruction bundle.
@@ -3393,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
- DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V, OpValue))
+ if (!extendSchedulingRegion(V, S))
return false;
}
@@ -3410,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
- DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
@@ -3446,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
initialFillReadyList(ReadyInsts);
}
- DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
@@ -3465,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
}
if (!Bundle->isReady()) {
- cancelScheduling(VL, OpValue);
+ cancelScheduling(VL, S.OpValue);
return false;
}
return true;
@@ -3477,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
return;
ScheduleData *Bundle = getScheduleData(OpValue);
- DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
@@ -3508,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- Value *OpValue) {
- if (getScheduleData(V, isOneOf(OpValue, V)))
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -3522,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
- SD->init(SchedulingRegionID, OpValue);
- ExtraScheduleDataMap[I][OpValue] = SD;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
if (CheckSheduleForI(I))
@@ -3533,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
@@ -3548,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
@@ -3556,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
- DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
return true;
}
UpIter++;
@@ -3568,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
return true;
}
DownIter++;
@@ -3635,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
- DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
@@ -3727,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
+ break;
DistToSrc++;
}
}
@@ -3736,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
- DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
}
}
}
@@ -3759,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
- DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
@@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
@@ -4188,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
@@ -4203,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at stores.
if (!Stores.empty()) {
- DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
@@ -4215,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
- DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
- R.optimizeGatherSequence(F);
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
-/// \brief Check that the Values in the slice in VL array are still existent in
+ /// Check that the Values in the given slice of the VL array still exist in
/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakTrackingVH
@@ -4244,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
- unsigned ChainLen = Chain.size();
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
- << "\n");
- unsigned Sz = R.getVectorElementSize(Chain[0]);
- unsigned VF = VecRegSize / Sz;
+ const unsigned ChainLen = Chain.size();
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+ const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i < e; ++i) {
- if (i + VF > e)
- break;
+ for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
@@ -4278,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
int Cost = R.getTreeCost();
- DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+ << "\n");
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
@@ -4417,66 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, None, true);
+ return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool AllowReorder,
- bool NeedExtraction) {
+ int UserCost, bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type.
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
+ // Check that all of the parts are scalar instructions of the same type,
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
return false;
- unsigned Opcode0 = I0->getOpcode();
-
+ Instruction *I0 = cast<Instruction>(S.OpValue);
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
}
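
A quick numeric check of the VF bounds above (powerOf2Floor written out; the register and element sizes are just example values):

    unsigned powerOf2Floor(unsigned N) {
      unsigned P = 1;
      while (P * 2 <= N)
        P *= 2;
      return P;
    }
    // With MinVecRegSize = 128 bits, element size Sz = 32 bits, VL.size() = 6:
    //   MinVF = max(2, 128 / 32)              = 4
    //   MaxVF = max(powerOf2Floor(6), MinVF)  = max(4, 4) = 4
    // so the pass looks for bundles of width 4 among the 6 candidates.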
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
- // NOTE: the following will give user internal llvm type name, which may not be useful
+ // NOTE: the following will give user internal llvm type name, which may
+ // not be useful.
R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(
- SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- Instruction *Inst = dyn_cast<Instruction>(V);
-
- if (!Inst)
- return false;
- if (Inst->getOpcode() != Opcode0) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "InequableTypes", I0)
- << "Cannot SLP vectorize list: not all of the "
- << "parts of scalar instructions are of the same type: "
- << ore::NV("Instruction1Opcode", I0) << " and "
- << ore::NV("Instruction2Opcode", Inst);
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
});
return false;
}
@@ -4513,24 +4927,20 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- ArrayRef<Value *> EmptyArray;
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
- R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
- if (AllowReorder && R.shouldReorder()) {
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
@@ -4538,43 +4948,19 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ int Cost = R.getTreeCost() - UserCost;
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
cast<Instruction>(Ops[0]))
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are
- // undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The
- // vectorized root will precede it. This guarantees that we get an
- // instruction. The vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract =
- cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->moveAfter(Extract);
- InsertAfter = I;
- }
- }
+ R.vectorizeTree();
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
@@ -4585,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
});
}
return Changed;
@@ -4645,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// \brief Generate a shuffle mask to be used in a reduction tree.
+/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
@@ -5128,6 +5512,77 @@ class HorizontalReduction {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it's very common to have
+ // pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
}
}
return OperationData(V);
@@ -5136,7 +5591,7 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
- /// \brief Try to find a reduction tree.
+ /// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
@@ -5164,6 +5619,8 @@ public:
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
+ if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ return false;
ReducedValueData.clear();
ReductionRoot = B;
@@ -5262,7 +5719,7 @@ public:
return true;
}
- /// \brief Attempt to vectorize the tree found by
+ /// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
@@ -5295,9 +5752,14 @@ public:
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- if (V.shouldReorder()) {
- SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == VL.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
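
Applying an order yields ReorderedOps[I] = VL[Order[I]]; for the two-element case this reproduces the old shouldReorder() reversal:

    #include <algorithm>
    #include <vector>

    std::vector<int> applyOrder(const std::vector<int> &VL,
                                const std::vector<unsigned> &Order) {
      std::vector<int> Reordered(VL.size());
      std::transform(Order.begin(), Order.end(), Reordered.begin(),
                     [&VL](unsigned Idx) { return VL[Idx]; });
      return Reordered;
    }
    // applyOrder({10, 11, 12, 13}, {3, 2, 1, 0}) == {13, 12, 11, 10}.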
if (V.isTreeTinyAndNotFullyVectorizable())
break;
@@ -5305,8 +5767,9 @@ public:
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost =
- V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int TreeCost = V.getTreeCost();
+ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
@@ -5319,8 +5782,8 @@ public:
break;
}
- DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
- << ". (HorRdx)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
@@ -5382,7 +5845,7 @@ public:
}
private:
- /// \brief Calculate the cost of a reduction.
+ /// Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
@@ -5441,16 +5904,16 @@ private:
}
ScalarReduxCost *= (ReduxWidth - 1);
- DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
- << " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
- /// \brief Emit a horizontal reduction of the vectorized value.
+ /// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
@@ -5486,7 +5949,7 @@ private:
} // end anonymous namespace
-/// \brief Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
@@ -5495,11 +5958,17 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
- SmallVectorImpl<Value *> &BuildVector,
- SmallVectorImpl<Value *> &BuildVectorOpds) {
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ UserCost = 0;
Value *V = nullptr;
do {
- BuildVector.push_back(LastInsertElem);
+ if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ LastInsertElem->getType(),
+ CI->getZExtValue());
+ }
BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
V = LastInsertElem->getOperand(0);
if (isa<UndefValue>(V))
@@ -5508,20 +5977,17 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
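
Stripped of the LLVM types, the chain walk is: follow operand 0 from the last insertelement back to the initial undef, collect operand 1 at each step, and sum the per-lane insert cost. A toy model (InsertElem and the flat cost are hypothetical; the real code also bails out on multi-use links):

    #include <algorithm>
    #include <vector>

    struct InsertElem {      // toy stand-in for InsertElementInst
      InsertElem *Vec;       // operand 0; nullptr plays the role of undef
      int Scalar;            // operand 1: the inserted scalar
      unsigned Lane;         // operand 2: constant lane index
    };

    bool findBuildVectorSketch(InsertElem *Last, std::vector<int> &Opds,
                               int &UserCost) {
      UserCost = 0;
      for (; Last; Last = Last->Vec) {
        UserCost += 1; // stand-in for the TTI insert cost at Last->Lane
        Opds.push_back(Last->Scalar);
      }
      std::reverse(Opds.begin(), Opds.end()); // restore program order
      return true;
    }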
-/// \brief Like findBuildVector, but looks for construction of aggregate.
+/// Like findBuildVector, but looks for construction of aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V;
do {
- BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
@@ -5530,7 +5996,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
if (!IV || !IV->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -5539,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
-/// \brief Try and get a reduction value from a phi node.
+/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
@@ -5552,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
- return (
- dyn_cast<Instruction>(R) &&
- DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
@@ -5624,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
- SmallSet<Value *, 8> VisitedInstrs;
+ SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Value *V;
@@ -5706,27 +6170,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType(), DL))
return false;
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
- DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // An aggregate value is unlikely to be processed in a vector register; we
  // need to extract the scalars into scalar registers.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVector;
+ int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
}
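For illustration, a hypothetical IR fragment (names invented, in the style of the insertelement examples in the comments above) of the kind the new bail-out skips: every build-vector operand is an extractelement and the chain as a whole is a single-source permute of %v, so it is presumably better left to shuffle matching than SLP-vectorized:

  %e0 = extractelement <4 x float> %v, i32 2
  %e1 = extractelement <4 x float> %v, i32 3
  %e2 = extractelement <4 x float> %v, i32 0
  %e3 = extractelement <4 x float> %v, i32 1
  %r0 = insertelement <4 x float> undef, float %e0, i32 0
  %r1 = insertelement <4 x float> %r0, float %e1, i32 1
  %r2 = insertelement <4 x float> %r1, float %e2, i32 2
  %r3 = insertelement <4 x float> %r2, float %e3, i32 3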
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5763,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- SmallSet<Value *, 16> VisitedInstrs;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -5798,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
- DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter.
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- None, AllowReorder)) {
+ /*UserCost=*/0, AllowReorder)) {
  // Success. Start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -5885,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
isa<InsertValueInst>(it))
PostProcessInstructions.push_back(&*it);
-
}
return Changed;
@@ -5899,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
if (Entry.second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
@@ -5982,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (it->second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
new file mode 100644
index 000000000000..f43a8bb123b1
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -0,0 +1,131 @@
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetTransformInfo;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+public:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF \p Range.
+ /// If \p I is the primary member of an interleave group for \p Range.Start,
+ /// build and \return a new VPInterleaveRecipe for the whole group; otherwise
+ /// \return null. In particular, an adjunct member of a group is handled by
+ /// the recipe built for its primary member. \p Range.End may be decreased to
+ /// ensure the same decision from \p Range.Start to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked. Such instructions are handled by a recipe that takes
+ /// an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the
+ /// given VF \p Range. If so, build and return it; if not, return null.
+ /// \p Range.End may be decreased to ensure the same decision from
+ /// \p Range.Start to \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for
+ /// \p Range.Start, false otherwise. \p Range.End may be decreased to ensure
+ /// the same decision from \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p Instr within the given VF
+ /// \p Range. If a recipe can be created, it is added to \p VPBB.
+ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
+ VPBasicBlock *VPBB);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. \p Range.End may be decreased to ensure the same recipe
+ /// behavior from \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
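A minimal sketch (not part of the patch) of how a planner loop is expected to drive VPRecipeBuilder; the analyses, BB, Range, Plan and VPBB are assumed to be in scope at the call site:

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
  for (Instruction &I : *BB) {
    // Specialized recipes first: interleave groups, memory ops, inductions,
    // blends, or extending the last VPWidenRecipe.
    if (RecipeBuilder.tryToCreateRecipe(&I, Range, Plan, VPBB))
      continue;
    // Fall back to per-lane replication, possibly inside a predicated region.
    VPBB = RecipeBuilder.handleReplication(&I, Range, VPBB, PredInst2Recipe,
                                           Plan);
  }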
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 4e54fc6db2a5..f7b07b722bb1 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -116,7 +116,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PrevBB = CFG.PrevBB;
BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
PrevBB->getParent(), CFG.LastBB);
- DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -125,7 +125,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
- DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
@@ -175,8 +175,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
}
// 2. Fill the IR basic block with IR instructions.
- DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
State->CFG.VPBB2IRBB[this] = NewBB;
State->CFG.PrevVPBB = this;
@@ -184,7 +184,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
for (VPRecipeBase &Recipe : Recipes)
Recipe.execute(*State);
- DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}
void VPRegionBlock::execute(VPTransformState *State) {
@@ -193,7 +193,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
if (!isReplicator()) {
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
return;
@@ -210,7 +210,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance->Lane = Lane;
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
}
@@ -220,6 +220,15 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance.reset();
}
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
@@ -356,7 +365,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
"One successor of a basic block does not lead to the other.");
assert(InterimSucc->getSinglePredecessor() &&
"Interim successor has more than one predecessor.");
- assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 &&
+ assert(pred_size(PostDomSucc) == 2 &&
"PostDom successor has more than two predecessors.");
DT->addNewBlock(InterimSucc, BB);
DT->addNewBlock(PostDomSucc, BB);
@@ -448,6 +457,18 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
bumpIndent(1);
for (const VPRecipeBase &Recipe : *BasicBlock)
Recipe.print(OS, Indent);
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else
+ CBV->printAsOperand(OS);
+ }
+
bumpIndent(-2);
OS << "\n" << Indent << "]\n";
dumpEdges(BasicBlock);
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 2ccabfd6af25..866951cb79a4 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
@@ -42,15 +43,10 @@
#include <map>
#include <string>
-// The (re)use of existing LoopVectorize classes is subject to future VPlan
-// refactoring.
-namespace {
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-} // namespace
-
namespace llvm {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
@@ -60,6 +56,20 @@ class raw_ostream;
class Value;
class VPBasicBlock;
class VPRegionBlock;
+class VPlan;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
+ const unsigned Start;
+
+ // Need not be a power of 2. If End <= Start, the range is empty.
+ unsigned End;
+};
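A sketch of how such ranges are meant to be consumed: build for Range.Start, let recipe construction clamp Range.End down to the largest VF that takes the same decisions, then resume from there. buildVPlan is a hypothetical helper standing in for the planner's entry point:

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    buildVPlan(SubRange);  // hypothetical; may decrease SubRange.End
    VF = SubRange.End;     // continue with the first VF not yet covered
  }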
+
+using VPlanPtr = std::unique_ptr<VPlan>;
/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
@@ -311,6 +321,8 @@ struct VPTransformState {
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
+ friend class VPBlockUtils;
+
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -327,6 +339,9 @@ private:
/// List of successor blocks.
SmallVector<VPBlockBase *, 1> Successors;
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
/// Add \p Successor as the last successor to this block.
void appendSuccessor(VPBlockBase *Successor) {
assert(Successor && "Cannot add nullptr successor!");
@@ -377,6 +392,7 @@ public:
/// for any other purpose, as the values may change as LLVM evolves.
unsigned getVPBlockID() const { return SubclassID; }
+ VPRegionBlock *getParent() { return Parent; }
const VPRegionBlock *getParent() const { return Parent; }
void setParent(VPRegionBlock *P) { Parent = P; }
@@ -411,6 +427,9 @@ public:
return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
}
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -454,34 +473,41 @@ public:
return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
}
- /// Sets a given VPBlockBase \p Successor as the single successor and \return
- /// \p Successor. The parent of this Block is copied to be the parent of
- /// \p Successor.
- VPBlockBase *setOneSuccessor(VPBlockBase *Successor) {
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
assert(Successors.empty() && "Setting one successor when others exist.");
appendSuccessor(Successor);
- Successor->appendPredecessor(this);
- Successor->Parent = Parent;
- return Successor;
}
- /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors. The parent of this Block is copied to be the parent of both
- /// \p IfTrue and \p IfFalse.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
appendSuccessor(IfTrue);
appendSuccessor(IfFalse);
- IfTrue->appendPredecessor(this);
- IfFalse->appendPredecessor(this);
- IfTrue->Parent = Parent;
- IfFalse->Parent = Parent;
}
- void disconnectSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Successor to disconnect is null.");
- removeSuccessor(Successor);
- Successor->removePredecessor(this);
+ /// Set each VPBlockBase in \p NewPreds as a predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as a successor of any VPBlockBase in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
}
  /// The method which generates the output IR that corresponds to this
@@ -539,6 +565,15 @@ public:
/// Each recipe prints itself.
virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
};
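These two hooks are what VPlanHCFGTransforms (below) relies on to splice recipes in place; a minimal sketch, with Ingredient assumed to be a recipe already linked into a VPBasicBlock:

  VPRecipeBase *NewRecipe = new VPWidenRecipe(Inst); // replacement recipe
  NewRecipe->insertBefore(Ingredient); // link in right before Ingredient
  Ingredient->eraseFromParent();       // unlink and delete the old recipe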
/// This is a concrete Recipe that models a single VPlan-level instruction.
@@ -546,6 +581,8 @@ public:
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPUser, public VPRecipeBase {
+ friend class VPlanHCFGTransforms;
+
public:
  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
enum { Not = Instruction::OtherOpsEnd + 1 };
@@ -559,10 +596,13 @@ private:
void generateInstruction(VPTransformState &State, unsigned Part);
public:
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPUser(VPValue::VPInstructionSC, Operands),
VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPInstructionSC;
@@ -907,7 +947,10 @@ public:
inline const VPRecipeBase &back() const { return Recipes.back(); }
inline VPRecipeBase &back() { return Recipes.back(); }
- /// \brief Returns a pointer to a member of the recipe list.
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
return &VPBasicBlock::Recipes;
}
@@ -968,6 +1011,9 @@ public:
Entry->setParent(this);
Exit->setParent(this);
}
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
if (Entry)
@@ -982,9 +1028,27 @@ public:
const VPBlockBase *getEntry() const { return Entry; }
VPBlockBase *getEntry() { return Entry; }
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
const VPBlockBase *getExit() const { return Exit; }
VPBlockBase *getExit() { return Exit; }
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
bool isReplicator() const { return IsReplicator; }
@@ -1012,6 +1076,13 @@ private:
/// Holds the name of the VPlan, for printing.
std::string Name;
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+ // VPlan. External definitions must be immutable and hold a pointer to their
+ // underlying IR that will be used to implement their structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -1024,6 +1095,8 @@ public:
VPBlockBase::deleteCFG(Entry);
for (auto &MapEntry : Value2VPValue)
delete MapEntry.second;
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
}
/// Generate the IR code for this VPlan.
@@ -1042,6 +1115,12 @@ public:
void setName(const Twine &newName) { Name = newName.str(); }
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
@@ -1189,6 +1268,72 @@ template <> struct GraphTraits<Inverse<VPBlockBase *>> {
}
};
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate the parent of \p BlockPtr to \p NewBlock. If
+ /// \p BlockPtr has more than one successor, its conditional bit is propagated to \p
+ /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+ // TODO: move successors from BlockPtr to NewBlock when this functionality
+ // is necessary. For now, setOneSuccessor will assert if BlockPtr
+ // already has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+ /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases can be already connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+ "Can't connect two block with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+};
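A small sketch of composing these utilities into an if-then-else diamond, assuming Cond is a VPValue the caller already owns (the name-taking VPBasicBlock constructor is the one PlainCFGBuilder uses below):

  VPBasicBlock *Entry = new VPBasicBlock("entry");
  VPBasicBlock *IfTrue = new VPBasicBlock("if.true");
  VPBasicBlock *IfFalse = new VPBasicBlock("if.false");
  VPBasicBlock *Merge = new VPBasicBlock("merge");
  VPBlockUtils::insertTwoBlocksAfter(IfTrue, IfFalse, Cond, Entry);
  VPBlockUtils::connectBlocks(IfTrue, Merge);  // both arms rejoin at Merge
  VPBlockUtils::connectBlocks(IfFalse, Merge);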
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h
deleted file mode 100644
index d6eb3397d044..000000000000
--- a/lib/Transforms/Vectorize/VPlanBuilder.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides a VPlan-based builder utility analogous to IRBuilder.
-/// It provides an instruction-level API for generating VPInstructions while
-/// abstracting away the Recipe manipulation details.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-
-#include "VPlan.h"
-
-namespace llvm {
-
-class VPBuilder {
-private:
- VPBasicBlock *BB = nullptr;
- VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
-
- VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
- BB->insert(Instr, InsertPt);
- return Instr;
- }
-
-public:
- VPBuilder() {}
-
- /// \brief This specifies that created VPInstructions should be appended to
- /// the end of the specified block.
- void setInsertPoint(VPBasicBlock *TheBB) {
- assert(TheBB && "Attempting to set a null insert point");
- BB = TheBB;
- InsertPt = BB->end();
- }
-
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
- }
-
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
- }
-
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
- }
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
new file mode 100644
index 000000000000..08129b74cddf
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -0,0 +1,336 @@
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+///
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+ // construction because subsequent VPlan-to-VPlan transformations may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Holds phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+ bool isExternalDef(Value *Val);
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) outermost loop PH, and c)
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please, look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node's operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+ // Build VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2. Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+ // predecessors must be set in the same order as they are in the incoming IR.
+ // Otherwise, there might be problems with existing phi nodes and algorithms
+ // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+ // Header's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+ // exist already. Recipes will be created when the successor is visited
+ // during the RPO traversal.
+ TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process the outermost loop exit. We created an empty VPBB for the
+ // loop's single exit BB during the RPO traversal of the loop body, but its
+ // Instructions weren't visited because the exit is not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+ // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
+ // VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
+
+// Public interface to build a H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG(VPlan &Plan) {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+}
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
new file mode 100644
index 000000000000..c4e69843615a
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -0,0 +1,55 @@
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// An H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI) : TheLoop(Lp), LI(LI) {}
+
+ /// Build H-CFG for TheLoop and update \p Plan accordingly.
+ void buildHierarchicalCFG(VPlan &Plan);
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
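The expected call site in the vectorizer, sketched under the assumption that VPlan is default-constructible and that OrigLoop and LI are the caller's Loop and LoopInfo:

  auto Plan = llvm::make_unique<VPlan>();   // assumption: default ctor
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
  HCFGBuilder.buildHierarchicalCFG(*Plan);  // sets the Top Region as entry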
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
new file mode 100644
index 000000000000..e3cbab077e61
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -0,0 +1,73 @@
+//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+ VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ VPRecipeBase *LastRecipe = nullptr;
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else {
+ // If the last recipe is a VPWidenRecipe, add Inst to it instead of
+ // creating a new recipe.
+ if (VPWidenRecipe *WidenRecipe =
+ dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
+ WidenRecipe->appendInstruction(Inst);
+ Ingredient->eraseFromParent();
+ continue;
+ }
+ NewRecipe = new VPWidenRecipe(Inst);
+ }
+
+ NewRecipe->insertBefore(Ingredient);
+ LastRecipe = NewRecipe;
+ Ingredient->eraseFromParent();
+ }
+ }
+}
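A hedged sketch of invoking the transformation, assuming Legal->getInductionVars() exposes the InductionList collected by the legality analysis and Plan is the VPlanPtr produced by the H-CFG builder:

  SmallPtrSet<Instruction *, 4> DeadInsts; // instructions the caller wants dropped
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInsts);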
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
new file mode 100644
index 000000000000..ae549c6871b3
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
@@ -0,0 +1,36 @@
+//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+class VPlanHCFGTransforms {
+
+public:
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h
index 50966891e0eb..08f142915b49 100644
--- a/lib/Transforms/Vectorize/VPlanValue.h
+++ b/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,13 +37,34 @@ class VPUser;
// coming from the input IR, instructions which VPlan will generate if executed
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
+ friend class VPBuilder;
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
SmallVector<VPUser *, 1> Users;
protected:
- VPValue(const unsigned char SC) : SubclassID(SC) {}
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
+ VPValue(const unsigned char SC, Value *UV = nullptr)
+ : SubclassID(SC), UnderlyingVal(UV) {}
+
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ /// Return the underlying Value attached to this VPValue.
+ Value *getUnderlyingValue() { return UnderlyingVal; }
+
+ /// Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
public:
/// An enumeration for keeping track of the concrete subclass of VPValue that
@@ -52,7 +73,7 @@ public:
/// type identification.
enum { VPValueSC, VPUserSC, VPInstructionSC };
- VPValue() : SubclassID(VPValueSC) {}
+ VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {}
VPValue(const VPValue &) = delete;
VPValue &operator=(const VPValue &) = delete;
@@ -94,11 +115,6 @@ class VPUser : public VPValue {
private:
SmallVector<VPValue *, 2> Operands;
- void addOperand(VPValue *Operand) {
- Operands.push_back(Operand);
- Operand->addUser(*this);
- }
-
protected:
VPUser(const unsigned char SC) : VPValue(SC) {}
VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
@@ -120,6 +136,11 @@ public:
V->getVPValueID() <= VPInstructionSC;
}
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
unsigned getNumOperands() const { return Operands.size(); }
inline VPValue *getOperand(unsigned N) const {
assert(N < Operands.size() && "Operand index out of bounds");
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.cpp b/lib/Transforms/Vectorize/VPlanVerifier.cpp
new file mode 100644
index 000000000000..054bed4e177f
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -0,0 +1,133 @@
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
+ assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
+ SuccPreds.end() &&
+ "Missing predecessor link.");
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
+ assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
+ PredSuccs.end() &&
+ "Missing successor link.");
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.h b/lib/Transforms/Vectorize/VPlanVerifier.h
new file mode 100644
index 000000000000..d2f99d006a66
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -0,0 +1,44 @@
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+/// Class with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+class VPlanVerifier {
+public:
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index b04905bfc6fa..f62a88558328 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -34,10 +34,6 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
-// DEPRECATED: Remove after the LLVM 5 release.
-void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
-}
-
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopVectorizePass());
}