author | Dimitry Andric <dim@FreeBSD.org> | 2024-07-27 23:34:35 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-10-23 18:26:01 +0000
commit | 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583 (patch)
tree | 6cf5ab1f05330c6773b1f3f64799d56a9c7a1faa /contrib/llvm-project/llvm/lib/Transforms/Scalar
parent | 6b9f7133aba44189d9625c352bc2c2a59baf18ef (diff)
parent | ac9a064cb179f3425b310fa2847f8764ac970a4d (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar')
67 files changed, 3836 insertions, 3602 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 90b544c89226..5f0a9b22c3ee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -350,7 +350,7 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) { // TODO -- move this test into llvm::isInstructionTriviallyDead if (CallInst *CI = dyn_cast<CallInst>(&I)) if (Function *Callee = CI->getCalledFunction()) - if (Callee->getName().equals(getInstrProfValueProfFuncName())) + if (Callee->getName() == getInstrProfValueProfFuncName()) if (isa<Constant>(CI->getArgOperand(0))) return true; return false; @@ -544,19 +544,20 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() { // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. for (Instruction &I : llvm::reverse(instructions(F))) { - // With "RemoveDIs" debug-info stored in DPValue objects, debug-info - // attached to this instruction, and drop any for scopes that aren't alive, - // like the rest of this loop does. Extending support to assignment tracking - // is future work. - for (DPValue &DPV : make_early_inc_range(I.getDbgValueRange())) { - // Avoid removing a DPV that is linked to instructions because it holds + // With "RemoveDIs" debug-info stored in DbgVariableRecord objects, + // debug-info attached to this instruction, and drop any for scopes that + // aren't alive, like the rest of this loop does. Extending support to + // assignment tracking is future work. + for (DbgRecord &DR : make_early_inc_range(I.getDbgRecordRange())) { + // Avoid removing a DVR that is linked to instructions because it holds // information about an existing store. - if (DPV.isDbgAssign()) - if (!at::getAssignmentInsts(&DPV).empty()) + if (DbgVariableRecord *DVR = dyn_cast<DbgVariableRecord>(&DR); + DVR && DVR->isDbgAssign()) + if (!at::getAssignmentInsts(DVR).empty()) continue; - if (AliveScopes.count(DPV.getDebugLoc()->getScope())) + if (AliveScopes.count(DR.getDebugLoc()->getScope())) continue; - I.dropOneDbgValue(&DPV); + I.dropOneDbgRecord(&DR); } // Check if the instruction is alive. 
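The ADCE.cpp hunk above captures the migration from the old DPValue debug-info objects to the DbgRecord/DbgVariableRecord API. As a reading aid only, here is a minimal C++ sketch of the resulting iteration pattern, written as if it lived inside ADCE.cpp (so it assumes that file's existing includes); the helper name and the `AliveScopes` parameter are illustrative stand-ins for the pass's own state, not part of the commit.

```cpp
// Illustrative sketch only (not from the commit): drop dead debug records
// attached to I, keeping dbg.assign records that still describe an existing
// store and records whose lexical scope is known to be alive.
// `AliveScopes` is a hypothetical stand-in for ADCE's alive-scope set.
static void dropDeadDbgRecords(Instruction &I,
                               const SmallPtrSetImpl<const MDNode *> &AliveScopes) {
  for (DbgRecord &DR : llvm::make_early_inc_range(I.getDbgRecordRange())) {
    // A dbg.assign record linked to an existing store still carries
    // information and must be kept.
    if (auto *DVR = dyn_cast<DbgVariableRecord>(&DR);
        DVR && DVR->isDbgAssign() && !at::getAssignmentInsts(DVR).empty())
      continue;
    // Keep records whose scope is still alive.
    if (AliveScopes.count(DR.getDebugLoc()->getScope()))
      continue;
    // Otherwise the record is dead: detach and delete it.
    I.dropOneDbgRecord(&DR);
  }
}
```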
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp index b182f46cc515..5d9a7bca7efe 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp @@ -33,7 +33,7 @@ static void tryEmitAutoInitRemark(ArrayRef<Instruction *> Instructions, continue; Function &F = *I->getParent()->getParent(); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); AutoInitRemark Remark(ORE, REMARK_PASS, DL, TLI); Remark.visit(I); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp index 1fa2c75b0f42..d96dbca30fdb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -23,10 +23,13 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" + using namespace llvm; +using namespace PatternMatch; #define DEBUG_TYPE "bdce" @@ -42,15 +45,17 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { assert(I->getType()->isIntOrIntVectorTy() && "Trivializing a non-integer value?"); + // If all bits of a user are demanded, then we know that nothing below that + // in the def-use chain needs to be changed. + if (DB.getDemandedBits(I).isAllOnes()) + return; + // Initialize the worklist with eligible direct users. SmallPtrSet<Instruction *, 16> Visited; SmallVector<Instruction *, 16> WorkList; for (User *JU : I->users()) { - // If all bits of a user are demanded, then we know that nothing below that - // in the def-use chain needs to be changed. - auto *J = dyn_cast<Instruction>(JU); - if (J && J->getType()->isIntOrIntVectorTy() && - !DB.getDemandedBits(J).isAllOnes()) { + auto *J = cast<Instruction>(JU); + if (J->getType()->isIntOrIntVectorTy()) { Visited.insert(J); WorkList.push_back(J); } @@ -70,18 +75,19 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { Instruction *J = WorkList.pop_back_val(); // NSW, NUW, and exact are based on operands that might have changed. - J->dropPoisonGeneratingFlags(); + J->dropPoisonGeneratingAnnotations(); - // We do not have to worry about llvm.assume or range metadata: - // 1. llvm.assume demands its operand, so trivializing can't change it. - // 2. range metadata only applies to memory accesses which demand all bits. + // We do not have to worry about llvm.assume, because it demands its + // operand, so trivializing can't change it. + + // If all bits of a user are demanded, then we know that nothing below + // that in the def-use chain needs to be changed. + if (DB.getDemandedBits(J).isAllOnes()) + continue; for (User *KU : J->users()) { - // If all bits of a user are demanded, then we know that nothing below - // that in the def-use chain needs to be changed. 
- auto *K = dyn_cast<Instruction>(KU); - if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() && - !DB.getDemandedBits(K).isAllOnes()) + auto *K = cast<Instruction>(KU); + if (Visited.insert(K).second && K->getType()->isIntOrIntVectorTy()) WorkList.push_back(K); } } @@ -125,6 +131,38 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { } } + // Simplify and, or, xor when their mask does not affect the demanded bits. + if (auto *BO = dyn_cast<BinaryOperator>(&I)) { + APInt Demanded = DB.getDemandedBits(BO); + if (!Demanded.isAllOnes()) { + const APInt *Mask; + if (match(BO->getOperand(1), m_APInt(Mask))) { + bool CanBeSimplified = false; + switch (BO->getOpcode()) { + case Instruction::Or: + case Instruction::Xor: + CanBeSimplified = !Demanded.intersects(*Mask); + break; + case Instruction::And: + CanBeSimplified = Demanded.isSubsetOf(*Mask); + break; + default: + // TODO: Handle more cases here. + break; + } + + if (CanBeSimplified) { + clearAssumptionsOfUsers(BO, DB); + BO->replaceAllUsesWith(BO->getOperand(0)); + Worklist.push_back(BO); + ++NumSimplified; + Changed = true; + continue; + } + } + } + } + for (Use &U : I.operands()) { // DemandedBits only detects dead integer uses. if (!U->getType()->isIntOrIntVectorTy()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 47f663fa0cf0..b8571ba07489 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -403,7 +403,7 @@ static void splitCallSite(CallBase &CB, NewPN->insertBefore(*TailBB, TailBB->begin()); CurrentI->replaceAllUsesWith(NewPN); } - CurrentI->dropDbgValues(); + CurrentI->dropDbgRecords(); CurrentI->eraseFromParent(); // We are done once we handled the first original instruction in TailBB. if (CurrentI == OriginalBeginInst) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 49f8761a1392..4a6dedc93d30 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -162,27 +163,27 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { void ConstantHoistingPass::collectMatInsertPts( const RebasedConstantListType &RebasedConstants, - SmallVectorImpl<Instruction *> &MatInsertPts) const { + SmallVectorImpl<BasicBlock::iterator> &MatInsertPts) const { for (const RebasedConstantInfo &RCI : RebasedConstants) for (const ConstantUser &U : RCI.Uses) MatInsertPts.emplace_back(findMatInsertPt(U.Inst, U.OpndIdx)); } /// Find the constant materialization insertion point. -Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, - unsigned Idx) const { +BasicBlock::iterator ConstantHoistingPass::findMatInsertPt(Instruction *Inst, + unsigned Idx) const { // If the operand is a cast instruction, then we have to materialize the // constant before the cast instruction. 
if (Idx != ~0U) { Value *Opnd = Inst->getOperand(Idx); if (auto CastInst = dyn_cast<Instruction>(Opnd)) if (CastInst->isCast()) - return CastInst; + return CastInst->getIterator(); } // The simple and common case. This also includes constant expressions. if (!isa<PHINode>(Inst) && !Inst->isEHPad()) - return Inst; + return Inst->getIterator(); // We can't insert directly before a phi node or an eh pad. Insert before // the terminator of the incoming or dominating block. @@ -191,7 +192,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, if (Idx != ~0U && isa<PHINode>(Inst)) { InsertionBlock = cast<PHINode>(Inst)->getIncomingBlock(Idx); if (!InsertionBlock->isEHPad()) { - return InsertionBlock->getTerminator(); + return InsertionBlock->getTerminator()->getIterator(); } } else { InsertionBlock = Inst->getParent(); @@ -206,7 +207,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, IDom = IDom->getIDom(); } - return IDom->getBlock()->getTerminator(); + return IDom->getBlock()->getTerminator()->getIterator(); } /// Given \p BBs as input, find another set of BBs which collectively @@ -314,26 +315,27 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } /// Find an insertion point that dominates all uses. -SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( +SetVector<BasicBlock::iterator> +ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo, - const ArrayRef<Instruction *> MatInsertPts) const { + const ArrayRef<BasicBlock::iterator> MatInsertPts) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. SetVector<BasicBlock *> BBs; - SetVector<Instruction *> InsertPts; + SetVector<BasicBlock::iterator> InsertPts; - for (Instruction *MatInsertPt : MatInsertPts) + for (BasicBlock::iterator MatInsertPt : MatInsertPts) BBs.insert(MatInsertPt->getParent()); if (BBs.count(Entry)) { - InsertPts.insert(&Entry->front()); + InsertPts.insert(Entry->begin()); return InsertPts; } if (BFI) { findBestInsertionSet(*DT, *BFI, Entry, BBs); for (BasicBlock *BB : BBs) - InsertPts.insert(&*BB->getFirstInsertionPt()); + InsertPts.insert(BB->getFirstInsertionPt()); return InsertPts; } @@ -343,7 +345,7 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( BB2 = BBs.pop_back_val(); BB = DT->findNearestCommonDominator(BB1, BB2); if (BB == Entry) { - InsertPts.insert(&Entry->front()); + InsertPts.insert(Entry->begin()); return InsertPts; } BBs.insert(BB); @@ -363,6 +365,9 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( void ConstantHoistingPass::collectConstantCandidates( ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx, ConstantInt *ConstInt) { + if (ConstInt->getType()->isVectorTy()) + return; + InstructionCost Cost; // Ask the target about the cost of materializing the constant for the given // instruction and operand index. @@ -761,11 +766,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, Mat = GetElementPtrInst::Create(Type::getInt8Ty(*Ctx), Base, Adj->Offset, "mat_gep", Adj->MatInsertPt); // Hide it behind a bitcast. - Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast", Adj->MatInsertPt); + Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast", + Adj->MatInsertPt->getIterator()); } else // Constant being rebased is a ConstantInt. 
- Mat = BinaryOperator::Create(Instruction::Add, Base, Adj->Offset, - "const_mat", Adj->MatInsertPt); + Mat = + BinaryOperator::Create(Instruction::Add, Base, Adj->Offset, + "const_mat", Adj->MatInsertPt->getIterator()); LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0) << " + " << *Adj->Offset << ") in BB " @@ -816,7 +823,8 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, // Aside from constant GEPs, only constant cast expressions are collected. assert(ConstExpr->isCast() && "ConstExpr should be a cast"); - Instruction *ConstExprInst = ConstExpr->getAsInstruction(Adj->MatInsertPt); + Instruction *ConstExprInst = ConstExpr->getAsInstruction(); + ConstExprInst->insertBefore(Adj->MatInsertPt); ConstExprInst->setOperand(0, Mat); // Use the same debug location as the instruction we are about to update. @@ -842,9 +850,9 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec = BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec; for (const consthoist::ConstantInfo &ConstInfo : ConstInfoVec) { - SmallVector<Instruction *, 4> MatInsertPts; + SmallVector<BasicBlock::iterator, 4> MatInsertPts; collectMatInsertPts(ConstInfo.RebasedConstants, MatInsertPts); - SetVector<Instruction *> IPSet = + SetVector<BasicBlock::iterator> IPSet = findConstantInsertionPoint(ConstInfo, MatInsertPts); // We can have an empty set if the function contains unreachable blocks. if (IPSet.empty()) @@ -853,7 +861,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { unsigned UsesNum = 0; unsigned ReBasesNum = 0; unsigned NotRebasedNum = 0; - for (Instruction *IP : IPSet) { + for (const BasicBlock::iterator &IP : IPSet) { // First, collect constants depending on this IP of the base. UsesNum = 0; SmallVector<UserAdjustment, 4> ToBeRebased; @@ -861,7 +869,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { for (auto const &RCI : ConstInfo.RebasedConstants) { UsesNum += RCI.Uses.size(); for (auto const &U : RCI.Uses) { - Instruction *MatInsertPt = MatInsertPts[MatCtr++]; + const BasicBlock::iterator &MatInsertPt = MatInsertPts[MatCtr++]; BasicBlock *OrigMatInsertBB = MatInsertPt->getParent(); // If Base constant is to be inserted in multiple places, // generate rebase for U using the Base dominating U. 
@@ -941,7 +949,7 @@ bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, this->TTI = &TTI; this->DT = &DT; this->BFI = BFI; - this->DL = &Fn.getParent()->getDataLayout(); + this->DL = &Fn.getDataLayout(); this->Ctx = &Fn.getContext(); this->Entry = &Entry; this->PSI = PSI; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 7b672e89b67a..c31173879af1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" @@ -231,8 +232,8 @@ struct ConstraintTy { ConstraintTy(SmallVector<int64_t, 8> Coefficients, bool IsSigned, bool IsEq, bool IsNe) - : Coefficients(Coefficients), IsSigned(IsSigned), IsEq(IsEq), IsNe(IsNe) { - } + : Coefficients(std::move(Coefficients)), IsSigned(IsSigned), IsEq(IsEq), + IsNe(IsNe) {} unsigned size() const { return Coefficients.size(); } @@ -461,7 +462,7 @@ static Decomposition decomposeGEP(GEPOperator &GEP, // If Op0 is signed non-negative, the GEP is increasing monotonically and // can be de-composed. - if (!isKnownNonNegative(Index, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Index, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Index, ConstantInt::get(Index->getType(), 0)); } @@ -499,6 +500,8 @@ static Decomposition decompose(Value *V, if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64) return V; + bool IsKnownNonNegative = false; + // Decompose \p V used with a signed predicate. 
if (IsSigned) { if (auto *CI = dyn_cast<ConstantInt>(V)) { @@ -507,6 +510,14 @@ static Decomposition decompose(Value *V, } Value *Op0; Value *Op1; + + if (match(V, m_SExt(m_Value(Op0)))) + V = Op0; + else if (match(V, m_NNegZExt(m_Value(Op0)))) { + V = Op0; + IsKnownNonNegative = true; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) return MergeResults(Op0, Op1, IsSigned); @@ -529,7 +540,7 @@ static Decomposition decompose(Value *V, } } - return V; + return {V, IsKnownNonNegative}; } if (auto *CI = dyn_cast<ConstantInt>(V)) { @@ -539,22 +550,27 @@ static Decomposition decompose(Value *V, } Value *Op0; - bool IsKnownNonNegative = false; if (match(V, m_ZExt(m_Value(Op0)))) { IsKnownNonNegative = true; V = Op0; } + if (match(V, m_SExt(m_Value(Op0)))) { + V = Op0; + Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, + ConstantInt::get(Op0->getType(), 0)); + } + Value *Op1; ConstantInt *CI; if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1)))) { return MergeResults(Op0, Op1, IsSigned); } if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) { - if (!isKnownNonNegative(Op0, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Op0, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, ConstantInt::get(Op0->getType(), 0)); - if (!isKnownNonNegative(Op1, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Op1, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op1, ConstantInt::get(Op1->getType(), 0)); @@ -1016,6 +1032,20 @@ void State::addInfoForInductions(BasicBlock &BB) { WorkList.push_back(FactOrCheck::getConditionFact( DTN, CmpInst::ICMP_SLT, PN, B, ConditionTy(CmpInst::ICMP_SLE, StartValue, B))); + + // Try to add condition from header to the exit blocks. When exiting either + // with EQ or NE in the header, we know that the induction value must be u<= + // B, as other exits may only exit earlier. + assert(!StepOffset.isNegative() && "induction must be increasing"); + assert((Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && + "unsupported predicate"); + ConditionTy Precond = {CmpInst::ICMP_ULE, StartValue, B}; + SmallVector<BasicBlock *> ExitBBs; + L->getExitBlocks(ExitBBs); + for (BasicBlock *EB : ExitBBs) { + WorkList.emplace_back(FactOrCheck::getConditionFact( + DT.getNode(EB), CmpInst::ICMP_ULE, A, B, Precond)); + } } void State::addInfoFor(BasicBlock &BB) { @@ -1057,6 +1087,8 @@ void State::addInfoFor(BasicBlock &BB) { } // Enqueue ssub_with_overflow for simplification. case Intrinsic::ssub_with_overflow: + case Intrinsic::ucmp: + case Intrinsic::scmp: WorkList.push_back( FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I))); break; @@ -1065,6 +1097,9 @@ void State::addInfoFor(BasicBlock &BB) { case Intrinsic::umax: case Intrinsic::smin: case Intrinsic::smax: + // TODO: handle llvm.abs as well + WorkList.push_back( + FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I))); // TODO: Check if it is possible to instead only added the min/max facts // when simplifying uses of the min/max intrinsics. if (!isGuaranteedNotToBePoison(&I)) @@ -1395,6 +1430,48 @@ static bool checkAndReplaceCondition( return false; } +static bool checkAndReplaceMinMax(MinMaxIntrinsic *MinMax, ConstraintInfo &Info, + SmallVectorImpl<Instruction *> &ToRemove) { + auto ReplaceMinMaxWithOperand = [&](MinMaxIntrinsic *MinMax, bool UseLHS) { + // TODO: generate reproducer for min/max. + MinMax->replaceAllUsesWith(MinMax->getOperand(UseLHS ? 
0 : 1)); + ToRemove.push_back(MinMax); + return true; + }; + + ICmpInst::Predicate Pred = + ICmpInst::getNonStrictPredicate(MinMax->getPredicate()); + if (auto ImpliedCondition = checkCondition( + Pred, MinMax->getOperand(0), MinMax->getOperand(1), MinMax, Info)) + return ReplaceMinMaxWithOperand(MinMax, *ImpliedCondition); + if (auto ImpliedCondition = checkCondition( + Pred, MinMax->getOperand(1), MinMax->getOperand(0), MinMax, Info)) + return ReplaceMinMaxWithOperand(MinMax, !*ImpliedCondition); + return false; +} + +static bool checkAndReplaceCmp(CmpIntrinsic *I, ConstraintInfo &Info, + SmallVectorImpl<Instruction *> &ToRemove) { + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + if (checkCondition(I->getGTPredicate(), LHS, RHS, I, Info).value_or(false)) { + I->replaceAllUsesWith(ConstantInt::get(I->getType(), 1)); + ToRemove.push_back(I); + return true; + } + if (checkCondition(I->getLTPredicate(), LHS, RHS, I, Info).value_or(false)) { + I->replaceAllUsesWith(ConstantInt::getSigned(I->getType(), -1)); + ToRemove.push_back(I); + return true; + } + if (checkCondition(ICmpInst::ICMP_EQ, LHS, RHS, I, Info)) { + I->replaceAllUsesWith(ConstantInt::get(I->getType(), 0)); + ToRemove.push_back(I); + return true; + } + return false; +} + static void removeEntryFromStack(const StackEntry &E, ConstraintInfo &Info, Module *ReproducerModule, @@ -1602,7 +1679,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, SmallVector<Value *> FunctionArgs; for (Value &Arg : F.args()) FunctionArgs.push_back(&Arg); - ConstraintInfo Info(F.getParent()->getDataLayout(), FunctionArgs); + ConstraintInfo Info(F.getDataLayout(), FunctionArgs); State S(DT, LI, SE); std::unique_ptr<Module> ReproducerModule( DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr); @@ -1695,6 +1772,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ReproducerCondStack, DFSInStack); } Changed |= Simplified; + } else if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Inst)) { + Changed |= checkAndReplaceMinMax(MinMax, Info, ToRemove); + } else if (auto *CmpIntr = dyn_cast<CmpIntrinsic>(Inst)) { + Changed |= checkAndReplaceCmp(CmpIntr, Info, ToRemove); } continue; } @@ -1730,7 +1811,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, if (!CB.isConditionFact()) { Value *X; if (match(CB.Inst, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) { - // TODO: Add CB.Inst >= 0 fact. + // If is_int_min_poison is true then we may assume llvm.abs >= 0. 
+ if (cast<ConstantInt>(CB.Inst->getOperand(1))->isOne()) + AddFact(CmpInst::ICMP_SGE, CB.Inst, + ConstantInt::get(CB.Inst->getType(), 0)); AddFact(CmpInst::ICMP_SGE, CB.Inst, X); continue; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9235850de92f..95de8eceb6be 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -47,11 +48,6 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" -static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned( - "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, - cl::desc("Enables canonicalization of signed relational predicates to " - "unsigned (e.g. sgt => ugt)")); - STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -67,6 +63,7 @@ STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr"); STATISTIC(NumAShrsRemoved, "Number of ashr removed"); STATISTIC(NumSRems, "Number of srem converted to urem"); STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumSIToFP, "Number of sitofp converted to uitofp"); STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned"); STATISTIC(NumAnd, "Number of ands removed"); STATISTIC(NumNW, "Number of no-wrap deductions"); @@ -89,10 +86,13 @@ STATISTIC(NumOverflows, "Number of overflow checks removed"); STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null"); +STATISTIC(NumCmpIntr, "Number of llvm.[us]cmp intrinsics removed"); STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed"); +STATISTIC(NumSMinMax, + "Number of llvm.s{min,max} intrinsics simplified to unsigned"); STATISTIC(NumUDivURemsNarrowedExpanded, "Number of bound udiv's/urem's expanded"); -STATISTIC(NumZExt, "Number of non-negative deductions"); +STATISTIC(NumNNeg, "Number of zext/uitofp non-negative deductions"); static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At)) @@ -109,14 +109,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (!Op1) return nullptr; - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false); - if (Result == LazyValueInfo::Unknown) - return nullptr; - - return (Result == LazyValueInfo::True) - ? 
ConstantInt::getTrue(C->getContext()) - : ConstantInt::getFalse(C->getContext()); + return LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At, + /*UseBlockValue=*/false); } static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { @@ -243,15 +237,17 @@ static Value *getValueOnEdge(LazyValueInfo *LVI, Value *Incoming, // The "false" case if (auto *C = dyn_cast<Constant>(SI->getFalseValue())) - if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == - LazyValueInfo::False) + if (auto *Res = dyn_cast_or_null<ConstantInt>( + LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI)); + Res && Res->isZero()) return SI->getTrueValue(); // The "true" case, // similar to the select "false" case, but try the select "true" value if (auto *C = dyn_cast<Constant>(SI->getTrueValue())) - if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == - LazyValueInfo::False) + if (auto *Res = dyn_cast_or_null<ConstantInt>( + LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI)); + Res && Res->isZero()) return SI->getFalseValue(); return nullptr; @@ -289,12 +285,8 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { - if (!CanonicalizeICmpPredicatesToUnsigned) - return false; - - // Only for signed relational comparisons of scalar integers. - if (Cmp->getType()->isVectorTy() || - !Cmp->getOperand(0)->getType()->isIntegerTy()) + // Only for signed relational comparisons of integers. + if (!Cmp->getOperand(0)->getType()->isIntOrIntVectorTy()) return false; if (!Cmp->isSigned()) @@ -324,16 +316,13 @@ static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { static bool constantFoldCmp(CmpInst *Cmp, LazyValueInfo *LVI) { Value *Op0 = Cmp->getOperand(0); Value *Op1 = Cmp->getOperand(1); - LazyValueInfo::Tristate Result = - LVI->getPredicateAt(Cmp->getPredicate(), Op0, Op1, Cmp, - /*UseBlockValue=*/true); - if (Result == LazyValueInfo::Unknown) + Constant *Res = LVI->getPredicateAt(Cmp->getPredicate(), Op0, Op1, Cmp, + /*UseBlockValue=*/true); + if (!Res) return false; ++NumCmps; - Constant *TorF = - ConstantInt::get(CmpInst::makeCmpResultType(Op0->getType()), Result); - Cmp->replaceAllUsesWith(TorF); + Cmp->replaceAllUsesWith(Res); Cmp->eraseFromParent(); return true; } @@ -371,14 +360,15 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, { // Scope for SwitchInstProfUpdateWrapper. It must not live during // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); + unsigned ReachableCaseCount = 0; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); - LazyValueInfo::Tristate State = + auto *Res = dyn_cast_or_null<ConstantInt>( LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, - /* UseBlockValue */ true); + /* UseBlockValue */ true)); - if (State == LazyValueInfo::False) { + if (Res && Res->isZero()) { // This case never fires - remove it. BasicBlock *Succ = CI->getCaseSuccessor(); Succ->removePredecessor(BB); @@ -395,7 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}}); continue; } - if (State == LazyValueInfo::True) { + if (Res && Res->isOne()) { // This case always fires. Arrange for the switch to be turned into an // unconditional branch by replacing the switch condition with the case // value. 
@@ -407,6 +397,31 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, // Increment the case iterator since we didn't delete it. ++CI; + ++ReachableCaseCount; + } + + BasicBlock *DefaultDest = SI->getDefaultDest(); + if (ReachableCaseCount > 1 && + !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())) { + ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0), + /*UndefAllowed*/ false); + // The default dest is unreachable if all cases are covered. + if (!CR.isSizeLargerThan(ReachableCaseCount)) { + BasicBlock *NewUnreachableBB = + BasicBlock::Create(BB->getContext(), "default.unreachable", + BB->getParent(), DefaultDest); + new UnreachableInst(BB->getContext(), NewUnreachableBB); + + DefaultDest->removePredecessor(BB); + SI->setDefaultDest(NewUnreachableBB); + + if (SuccessorsCount[DefaultDest] == 1) + DTU.applyUpdates({{DominatorTree::Delete, BB, DefaultDest}}); + DTU.applyUpdates({{DominatorTree::Insert, BB, NewUnreachableBB}}); + + ++NumDeadCases; + Changed = true; + } } } @@ -483,12 +498,8 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI); // because it is negation-invariant. static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { Value *X = II->getArgOperand(0); - Type *Ty = X->getType(); - if (!Ty->isIntegerTy()) - return false; - bool IsIntMinPoison = cast<ConstantInt>(II->getArgOperand(1))->isOne(); - APInt IntMin = APInt::getSignedMinValue(Ty->getScalarSizeInBits()); + APInt IntMin = APInt::getSignedMinValue(X->getType()->getScalarSizeInBits()); ConstantRange Range = LVI->getConstantRangeAtUse( II->getOperandUse(0), /*UndefAllowed*/ IsIntMinPoison); @@ -503,7 +514,7 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { // Is X in [IntMin, 0]? NOTE: INT_MIN is fine! if (Range.getSignedMax().isNonPositive()) { IRBuilder<> B(II); - Value *NegX = B.CreateNeg(X, II->getName(), /*HasNUW=*/false, + Value *NegX = B.CreateNeg(X, II->getName(), /*HasNSW=*/IsIntMinPoison); ++NumAbs; II->replaceAllUsesWith(NegX); @@ -527,18 +538,69 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { return false; } +static bool processCmpIntrinsic(CmpIntrinsic *CI, LazyValueInfo *LVI) { + ConstantRange LHS_CR = + LVI->getConstantRangeAtUse(CI->getOperandUse(0), /*UndefAllowed*/ false); + ConstantRange RHS_CR = + LVI->getConstantRangeAtUse(CI->getOperandUse(1), /*UndefAllowed*/ false); + + if (LHS_CR.icmp(CI->getGTPredicate(), RHS_CR)) { + ++NumCmpIntr; + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1)); + CI->eraseFromParent(); + return true; + } + if (LHS_CR.icmp(CI->getLTPredicate(), RHS_CR)) { + ++NumCmpIntr; + CI->replaceAllUsesWith(ConstantInt::getSigned(CI->getType(), -1)); + CI->eraseFromParent(); + return true; + } + if (LHS_CR.icmp(ICmpInst::ICMP_EQ, RHS_CR)) { + ++NumCmpIntr; + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + CI->eraseFromParent(); + return true; + } + + return false; +} + // See if this min/max intrinsic always picks it's one specific operand. 
+// If not, check whether we can canonicalize signed minmax into unsigned version static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true); - if (Result == LazyValueInfo::Unknown) - return false; + ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), + /*UndefAllowed*/ false); + ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1), + /*UndefAllowed*/ false); + if (LHS_CR.icmp(Pred, RHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getLHS()); + MM->eraseFromParent(); + return true; + } + if (RHS_CR.icmp(Pred, LHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getRHS()); + MM->eraseFromParent(); + return true; + } - ++NumMinMax; - MM->replaceAllUsesWith(MM->getOperand(!Result)); - MM->eraseFromParent(); - return true; + if (MM->isSigned() && + ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR, + RHS_CR)) { + ++NumSMinMax; + IRBuilder<> B(MM); + MM->replaceAllUsesWith(B.CreateBinaryIntrinsic( + MM->getIntrinsicID() == Intrinsic::smin ? Intrinsic::umin + : Intrinsic::umax, + MM->getLHS(), MM->getRHS())); + MM->eraseFromParent(); + return true; + } + + return false; } // Rewrite this with.overflow intrinsic as non-overflowing. @@ -573,7 +635,7 @@ static bool processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) { bool NSW = SI->isSigned(); bool NUW = !SI->isSigned(); BinaryOperator *BinOp = BinaryOperator::Create( - Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI); + Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI->getIterator()); BinOp->setDebugLoc(SI->getDebugLoc()); setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW); @@ -595,20 +657,22 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) { return processAbsIntrinsic(&cast<IntrinsicInst>(CB), LVI); } + if (auto *CI = dyn_cast<CmpIntrinsic>(&CB)) { + return processCmpIntrinsic(CI, LVI); + } + if (auto *MM = dyn_cast<MinMaxIntrinsic>(&CB)) { return processMinMaxIntrinsic(MM, LVI); } if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) { - if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) { + if (willNotOverflow(WO, LVI)) return processOverflowIntrinsic(WO, LVI); - } } if (auto *SI = dyn_cast<SaturatingInst>(&CB)) { - if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) { + if (willNotOverflow(SI, LVI)) return processSaturatingInst(SI, LVI); - } } bool Changed = false; @@ -643,11 +707,12 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) { // relatively expensive analysis for constants which are obviously either // null or non-null to start with. 
if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) && - !isa<Constant>(V) && - LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, - ConstantPointerNull::get(Type), &CB, - /*UseBlockValue=*/false) == LazyValueInfo::False) - ArgNos.push_back(ArgNo); + !isa<Constant>(V)) + if (auto *Res = dyn_cast_or_null<ConstantInt>(LVI->getPredicateAt( + ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), &CB, + /*UseBlockValue=*/false)); + Res && Res->isZero()) + ArgNos.push_back(ArgNo); ArgNo++; } @@ -682,11 +747,10 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR, const ConstantRange &RCR) { assert(Instr->getOpcode() == Instruction::SDiv || Instr->getOpcode() == Instruction::SRem); - assert(!Instr->getType()->isVectorTy()); // Find the smallest power of two bitwidth that's sufficient to hold Instr's // operands. - unsigned OrigWidth = Instr->getType()->getIntegerBitWidth(); + unsigned OrigWidth = Instr->getType()->getScalarSizeInBits(); // What is the smallest bit width that can accommodate the entire value ranges // of both of the operands? @@ -709,7 +773,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR, ++NumSDivSRemsNarrowed; IRBuilder<> B{Instr}; - auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth); + auto *TruncTy = Instr->getType()->getWithNewBitWidth(NewWidth); auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy, Instr->getName() + ".lhs.trunc"); auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy, @@ -730,7 +794,6 @@ static bool expandUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, Type *Ty = Instr->getType(); assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - assert(!Ty->isVectorTy()); bool IsRem = Instr->getOpcode() == Instruction::URem; Value *X = Instr->getOperand(0); @@ -788,9 +851,12 @@ static bool expandUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, Value *FrozenX = X; if (!isGuaranteedNotToBeUndef(X)) FrozenX = B.CreateFreeze(X, X->getName() + ".frozen"); - auto *AdjX = B.CreateNUWSub(FrozenX, Y, Instr->getName() + ".urem"); - auto *Cmp = - B.CreateICmp(ICmpInst::ICMP_ULT, FrozenX, Y, Instr->getName() + ".cmp"); + Value *FrozenY = Y; + if (!isGuaranteedNotToBeUndef(Y)) + FrozenY = B.CreateFreeze(Y, Y->getName() + ".frozen"); + auto *AdjX = B.CreateNUWSub(FrozenX, FrozenY, Instr->getName() + ".urem"); + auto *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, FrozenX, FrozenY, + Instr->getName() + ".cmp"); ExpandedOp = B.CreateSelect(Cmp, FrozenX, AdjX); } else { auto *Cmp = @@ -810,7 +876,6 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, const ConstantRange &YCR) { assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - assert(!Instr->getType()->isVectorTy()); // Find the smallest power of two bitwidth that's sufficient to hold Instr's // operands. @@ -823,12 +888,12 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, // NewWidth might be greater than OrigWidth if OrigWidth is not a power of // two. 
- if (NewWidth >= Instr->getType()->getIntegerBitWidth()) + if (NewWidth >= Instr->getType()->getScalarSizeInBits()) return false; ++NumUDivURemsNarrowed; IRBuilder<> B{Instr}; - auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth); + auto *TruncTy = Instr->getType()->getWithNewBitWidth(NewWidth); auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy, Instr->getName() + ".lhs.trunc"); auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy, @@ -847,9 +912,6 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) { assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - if (Instr->getType()->isVectorTy()) - return false; - ConstantRange XCR = LVI->getConstantRangeAtUse(Instr->getOperandUse(0), /*UndefAllowed*/ false); // Allow undef for RHS, as we can assume it is division by zero UB. @@ -864,7 +926,6 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) { static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR, const ConstantRange &RCR, LazyValueInfo *LVI) { assert(SDI->getOpcode() == Instruction::SRem); - assert(!SDI->getType()->isVectorTy()); if (LCR.abs().icmp(CmpInst::ICMP_ULT, RCR.abs())) { SDI->replaceAllUsesWith(SDI->getOperand(0)); @@ -888,21 +949,22 @@ static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR, for (Operand &Op : Ops) { if (Op.D == Domain::NonNegative) continue; - auto *BO = - BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI); + auto *BO = BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", + SDI->getIterator()); BO->setDebugLoc(SDI->getDebugLoc()); Op.V = BO; } - auto *URem = - BinaryOperator::CreateURem(Ops[0].V, Ops[1].V, SDI->getName(), SDI); + auto *URem = BinaryOperator::CreateURem(Ops[0].V, Ops[1].V, SDI->getName(), + SDI->getIterator()); URem->setDebugLoc(SDI->getDebugLoc()); auto *Res = URem; // If the divident was non-positive, we need to negate the result. if (Ops[0].D == Domain::NonPositive) { - Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI); + Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", + SDI->getIterator()); Res->setDebugLoc(SDI->getDebugLoc()); } @@ -923,7 +985,6 @@ static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR, static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, const ConstantRange &RCR, LazyValueInfo *LVI) { assert(SDI->getOpcode() == Instruction::SDiv); - assert(!SDI->getType()->isVectorTy()); // Check whether the division folds to a constant. ConstantRange DivCR = LCR.sdiv(RCR); @@ -949,14 +1010,14 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, for (Operand &Op : Ops) { if (Op.D == Domain::NonNegative) continue; - auto *BO = - BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI); + auto *BO = BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", + SDI->getIterator()); BO->setDebugLoc(SDI->getDebugLoc()); Op.V = BO; } - auto *UDiv = - BinaryOperator::CreateUDiv(Ops[0].V, Ops[1].V, SDI->getName(), SDI); + auto *UDiv = BinaryOperator::CreateUDiv(Ops[0].V, Ops[1].V, SDI->getName(), + SDI->getIterator()); UDiv->setDebugLoc(SDI->getDebugLoc()); UDiv->setIsExact(SDI->isExact()); @@ -964,7 +1025,8 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, // If the operands had two different domains, we need to negate the result. 
if (Ops[0].D != Ops[1].D) { - Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI); + Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", + SDI->getIterator()); Res->setDebugLoc(SDI->getDebugLoc()); } @@ -980,9 +1042,6 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { assert(Instr->getOpcode() == Instruction::SDiv || Instr->getOpcode() == Instruction::SRem); - if (Instr->getType()->isVectorTy()) - return false; - ConstantRange LCR = LVI->getConstantRangeAtUse(Instr->getOperandUse(0), /*AllowUndef*/ false); // Allow undef for RHS, as we can assume it is division by zero UB. @@ -1001,12 +1060,9 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { } static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy()) - return false; - ConstantRange LRange = LVI->getConstantRangeAtUse(SDI->getOperandUse(0), /*UndefAllowed*/ false); - unsigned OrigWidth = SDI->getType()->getIntegerBitWidth(); + unsigned OrigWidth = SDI->getType()->getScalarSizeInBits(); ConstantRange NegOneOrZero = ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1)); if (NegOneOrZero.contains(LRange)) { @@ -1022,7 +1078,7 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), - "", SDI); + "", SDI->getIterator()); BO->takeName(SDI); BO->setDebugLoc(SDI->getDebugLoc()); BO->setIsExact(SDI->isExact()); @@ -1033,16 +1089,14 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy()) - return false; - const Use &Base = SDI->getOperandUse(0); if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false) .isAllNonNegative()) return false; ++NumSExt; - auto *ZExt = CastInst::CreateZExtOrBitCast(Base, SDI->getType(), "", SDI); + auto *ZExt = CastInst::CreateZExtOrBitCast(Base, SDI->getType(), "", + SDI->getIterator()); ZExt->takeName(SDI); ZExt->setDebugLoc(SDI->getDebugLoc()); ZExt->setNonNeg(); @@ -1052,20 +1106,43 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { return true; } -static bool processZExt(ZExtInst *ZExt, LazyValueInfo *LVI) { - if (ZExt->getType()->isVectorTy()) +static bool processPossibleNonNeg(PossiblyNonNegInst *I, LazyValueInfo *LVI) { + if (I->hasNonNeg()) return false; - if (ZExt->hasNonNeg()) + const Use &Base = I->getOperandUse(0); + if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false) + .isAllNonNegative()) return false; - const Use &Base = ZExt->getOperandUse(0); + ++NumNNeg; + I->setNonNeg(); + + return true; +} + +static bool processZExt(ZExtInst *ZExt, LazyValueInfo *LVI) { + return processPossibleNonNeg(cast<PossiblyNonNegInst>(ZExt), LVI); +} + +static bool processUIToFP(UIToFPInst *UIToFP, LazyValueInfo *LVI) { + return processPossibleNonNeg(cast<PossiblyNonNegInst>(UIToFP), LVI); +} + +static bool processSIToFP(SIToFPInst *SIToFP, LazyValueInfo *LVI) { + const Use &Base = SIToFP->getOperandUse(0); if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false) .isAllNonNegative()) return false; - ++NumZExt; - ZExt->setNonNeg(); + ++NumSIToFP; + auto *UIToFP = CastInst::Create(Instruction::UIToFP, Base, SIToFP->getType(), + "", SIToFP->getIterator()); + UIToFP->takeName(SIToFP); + UIToFP->setDebugLoc(SIToFP->getDebugLoc()); + UIToFP->setNonNeg(); + 
SIToFP->replaceAllUsesWith(UIToFP); + SIToFP->eraseFromParent(); return true; } @@ -1073,22 +1150,16 @@ static bool processZExt(ZExtInst *ZExt, LazyValueInfo *LVI) { static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { using OBO = OverflowingBinaryOperator; - if (BinOp->getType()->isVectorTy()) - return false; - bool NSW = BinOp->hasNoSignedWrap(); bool NUW = BinOp->hasNoUnsignedWrap(); if (NSW && NUW) return false; Instruction::BinaryOps Opcode = BinOp->getOpcode(); - Value *LHS = BinOp->getOperand(0); - Value *RHS = BinOp->getOperand(1); - - ConstantRange LRange = - LVI->getConstantRange(LHS, BinOp, /*UndefAllowed*/ false); - ConstantRange RRange = - LVI->getConstantRange(RHS, BinOp, /*UndefAllowed*/ false); + ConstantRange LRange = LVI->getConstantRangeAtUse(BinOp->getOperandUse(0), + /*UndefAllowed=*/false); + ConstantRange RRange = LVI->getConstantRangeAtUse(BinOp->getOperandUse(1), + /*UndefAllowed=*/false); bool Changed = false; bool NewNUW = false, NewNSW = false; @@ -1111,21 +1182,20 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { } static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { - if (BinOp->getType()->isVectorTy()) - return false; + using namespace llvm::PatternMatch; // Pattern match (and lhs, C) where C includes a superset of bits which might // be set in lhs. This is a common truncation idiom created by instcombine. const Use &LHS = BinOp->getOperandUse(0); - ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1)); - if (!RHS || !RHS->getValue().isMask()) + const APInt *RHS; + if (!match(BinOp->getOperand(1), m_LowBitMask(RHS))) return false; // We can only replace the AND with LHS based on range info if the range does // not include undef. ConstantRange LRange = LVI->getConstantRangeAtUse(LHS, /*UndefAllowed=*/false); - if (!LRange.getUnsignedMax().ule(RHS->getValue())) + if (!LRange.getUnsignedMax().ule(*RHS)) return false; BinOp->replaceAllUsesWith(LHS); @@ -1177,6 +1247,12 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, case Instruction::ZExt: BBChanged |= processZExt(cast<ZExtInst>(&II), LVI); break; + case Instruction::UIToFP: + BBChanged |= processUIToFP(cast<UIToFPInst>(&II), LVI); + break; + case Instruction::SIToFP: + BBChanged |= processSIToFP(cast<SIToFPInst>(&II), LVI); + break; case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1227,6 +1303,12 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { if (!Changed) { PA = PreservedAnalyses::all(); } else { +#if defined(EXPENSIVE_CHECKS) + assert(DT->verify(DominatorTree::VerificationLevel::Full)); +#else + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif // EXPENSIVE_CHECKS + PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LazyValueAnalysis>(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 85d4065286e4..4371b821eae6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -65,6 +65,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" @@ -95,6 +96,11 @@ static cl::opt<bool> cl::desc("View the CFG before 
DFA Jump Threading"), cl::Hidden, cl::init(false)); +static cl::opt<bool> EarlyExitHeuristic( + "dfa-early-exit-heuristic", + cl::desc("Exit early if an unpredictable value come from the same loop"), + cl::Hidden, cl::init(true)); + static cl::opt<unsigned> MaxPathLength( "dfa-max-path-length", cl::desc("Max number of blocks searched to find a threading path"), @@ -125,17 +131,18 @@ public: explicit operator bool() const { return SI && SIUse; } }; -void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, +void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, std::vector<SelectInstToUnfold> *NewSIsToUnfold, std::vector<BasicBlock *> *NewBBs); class DFAJumpThreading { public: - DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, + DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), TTI(TTI), ORE(ORE) {} + : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} bool run(Function &F); + bool LoopInfoBroken; private: void @@ -151,7 +158,7 @@ private: std::vector<SelectInstToUnfold> NewSIsToUnfold; std::vector<BasicBlock *> NewBBs; - unfold(&DTU, SIToUnfold, &NewSIsToUnfold, &NewBBs); + unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); // Put newly discovered select instructions into the work list. for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold) @@ -161,6 +168,7 @@ private: AssumptionCache *AC; DominatorTree *DT; + LoopInfo *LI; TargetTransformInfo *TTI; OptimizationRemarkEmitter *ORE; }; @@ -194,7 +202,7 @@ void createBasicBlockAndSinkSelectInst( /// created basic blocks into \p NewBBs. /// /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. -void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, +void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, std::vector<SelectInstToUnfold> *NewSIsToUnfold, std::vector<BasicBlock *> *NewBBs) { SelectInst *SI = SIToUnfold.getInst(); @@ -300,6 +308,12 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, DTU->applyUpdates({{DominatorTree::Insert, StartBlock, TT}, {DominatorTree::Insert, StartBlock, FT}}); + // Preserve loop info + if (Loop *L = LI->getLoopFor(SI->getParent())) { + for (BasicBlock *NewBB : *NewBBs) + L->addBasicBlockToLoop(NewBB, *LI); + } + // The select is now dead. assert(SI->use_empty() && "Select must be dead now"); SI->eraseFromParent(); @@ -378,7 +392,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { #endif struct MainSwitch { - MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) { + MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE) + : LI(LI) { if (isCandidate(SI)) { Instr = SI; } else { @@ -402,7 +417,7 @@ private: /// /// Also, collect select instructions to unfold. bool isCandidate(const SwitchInst *SI) { - std::deque<Value *> Q; + std::deque<std::pair<Value *, BasicBlock *>> Q; SmallSet<Value *, 16> SeenValues; SelectInsts.clear(); @@ -411,22 +426,29 @@ private: if (!isa<PHINode>(SICond)) return false; - addToQueue(SICond, Q, SeenValues); + // The switch must be in a loop. 
+ const Loop *L = LI->getLoopFor(SI->getParent()); + if (!L) + return false; + + addToQueue(SICond, nullptr, Q, SeenValues); while (!Q.empty()) { - Value *Current = Q.front(); + Value *Current = Q.front().first; + BasicBlock *CurrentIncomingBB = Q.front().second; Q.pop_front(); if (auto *Phi = dyn_cast<PHINode>(Current)) { - for (Value *Incoming : Phi->incoming_values()) { - addToQueue(Incoming, Q, SeenValues); + for (BasicBlock *IncomingBB : Phi->blocks()) { + Value *Incoming = Phi->getIncomingValueForBlock(IncomingBB); + addToQueue(Incoming, IncomingBB, Q, SeenValues); } LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n"); } else if (SelectInst *SelI = dyn_cast<SelectInst>(Current)) { if (!isValidSelectInst(SelI)) return false; - addToQueue(SelI->getTrueValue(), Q, SeenValues); - addToQueue(SelI->getFalseValue(), Q, SeenValues); + addToQueue(SelI->getTrueValue(), CurrentIncomingBB, Q, SeenValues); + addToQueue(SelI->getFalseValue(), CurrentIncomingBB, Q, SeenValues); LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n"); if (auto *SelIUse = dyn_cast<PHINode>(SelI->user_back())) SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse)); @@ -439,6 +461,18 @@ private: // initial switch values that can be ignored (they will hit the // unthreaded switch) but this assumption will get checked later after // paths have been enumerated (in function getStateDefMap). + + // If the unpredictable value comes from the same inner loop it is + // likely that it will also be on the enumerated paths, causing us to + // exit after we have enumerated all the paths. This heuristic save + // compile time because a search for all the paths can become expensive. + if (EarlyExitHeuristic && + L->contains(LI->getLoopFor(CurrentIncomingBB))) { + LLVM_DEBUG(dbgs() + << "\tExiting early due to unpredictability heuristic.\n"); + return false; + } + continue; } } @@ -446,11 +480,12 @@ private: return true; } - void addToQueue(Value *Val, std::deque<Value *> &Q, + void addToQueue(Value *Val, BasicBlock *BB, + std::deque<std::pair<Value *, BasicBlock *>> &Q, SmallSet<Value *, 16> &SeenValues) { if (SeenValues.contains(Val)) return; - Q.push_back(Val); + Q.push_back({Val, BB}); SeenValues.insert(Val); } @@ -488,14 +523,16 @@ private: return true; } + LoopInfo *LI; SwitchInst *Instr = nullptr; SmallVector<SelectInstToUnfold, 4> SelectInsts; }; struct AllSwitchPaths { - AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE) - : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), - ORE(ORE) {} + AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE, + LoopInfo *LI) + : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE), + LI(LI) {} std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; } unsigned getNumThreadingPaths() { return TPaths.size(); } @@ -516,7 +553,7 @@ struct AllSwitchPaths { return; } - for (PathType Path : LoopPaths) { + for (const PathType &Path : LoopPaths) { ThreadingPath TPath; const BasicBlock *PrevBB = Path.back(); @@ -567,6 +604,12 @@ private: Visited.insert(BB); + // Stop if we have reached the BB out of loop, since its successors have no + // impact on the DFA. + // TODO: Do we need to stop exploring if BB is the outer loop of the switch? 
+ if (!LI->getLoopFor(BB)) + return Res; + // Some blocks have multiple edges to the same successor, and this set // is used to prevent a duplicate path from being generated SmallSet<BasicBlock *, 4> Successors; @@ -708,6 +751,7 @@ private: BasicBlock *SwitchBlock; OptimizationRemarkEmitter *ORE; std::vector<ThreadingPath> TPaths; + LoopInfo *LI; }; struct TransformDFA { @@ -783,7 +827,8 @@ private: return false; } - if (Metrics.convergent) { + // FIXME: Allow jump threading with controlled convergence. + if (Metrics.Convergence != ConvergenceKind::None) { LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " << "convergent instructions.\n"); ORE->emit([&]() { @@ -1254,6 +1299,7 @@ bool DFAJumpThreading::run(Function &F) { SmallVector<AllSwitchPaths, 2> ThreadableLoops; bool MadeChanges = false; + LoopInfoBroken = false; for (BasicBlock &BB : F) { auto *SI = dyn_cast<SwitchInst>(BB.getTerminator()); @@ -1262,7 +1308,7 @@ bool DFAJumpThreading::run(Function &F) { LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() << " is a candidate\n"); - MainSwitch Switch(SI, ORE); + MainSwitch Switch(SI, LI, ORE); if (!Switch.getInstr()) continue; @@ -1275,7 +1321,7 @@ bool DFAJumpThreading::run(Function &F) { if (!Switch.getSelectInsts().empty()) MadeChanges = true; - AllSwitchPaths SwitchPaths(&Switch, ORE); + AllSwitchPaths SwitchPaths(&Switch, ORE, LI); SwitchPaths.run(); if (SwitchPaths.getNumThreadingPaths() > 0) { @@ -1286,10 +1332,15 @@ bool DFAJumpThreading::run(Function &F) { // strict requirement but it can cause buggy behavior if there is an // overlap of blocks in different opportunities. There is a lot of room to // experiment with catching more opportunities here. + // NOTE: To release this contraint, we must handle LoopInfo invalidation break; } } +#ifdef NDEBUG + LI->verify(*DT); +#endif + SmallPtrSet<const Value *, 32> EphValues; if (ThreadableLoops.size() > 0) CodeMetrics::collectEphemeralValues(&F, AC, EphValues); @@ -1298,6 +1349,7 @@ bool DFAJumpThreading::run(Function &F) { TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); Transform.run(); MadeChanges = true; + LoopInfoBroken = true; } #ifdef EXPENSIVE_CHECKS @@ -1315,13 +1367,16 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F, FunctionAnalysisManager &AM) { AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); OptimizationRemarkEmitter ORE(&F); - - if (!DFAJumpThreading(&AC, &DT, &TTI, &ORE).run(F)) + DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE); + if (!ThreadImpl.run(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); + if (!ThreadImpl.LoopInfoBroken) + PA.preserve<LoopAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index f0f0f5f28025..931606c6f8fe 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -484,7 +484,7 @@ memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, static void shortenAssignment(Instruction *Inst, Value *OriginalDest, uint64_t OldOffsetInBits, uint64_t OldSizeInBits, uint64_t NewSizeInBits, bool IsOverwriteEnd) { - const DataLayout &DL = 
Inst->getModule()->getDataLayout(); + const DataLayout &DL = Inst->getDataLayout(); uint64_t DeadSliceSizeInBits = OldSizeInBits - NewSizeInBits; uint64_t DeadSliceOffsetInBits = OldOffsetInBits + (IsOverwriteEnd ? NewSizeInBits : 0); @@ -526,7 +526,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, // returned by getAssignmentMarkers so save a copy of the markers to iterate // over. auto LinkedRange = at::getAssignmentMarkers(Inst); - SmallVector<DPValue *> LinkedDPVAssigns = at::getDPVAssignmentMarkers(Inst); + SmallVector<DbgVariableRecord *> LinkedDVRAssigns = + at::getDVRAssignmentMarkers(Inst); SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(), LinkedRange.end()); auto InsertAssignForOverlap = [&](auto *Assign) { @@ -554,7 +555,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, NewAssign->setKillAddress(); }; for_each(Linked, InsertAssignForOverlap); - for_each(LinkedDPVAssigns, InsertAssignForOverlap); + for_each(LinkedDVRAssigns, InsertAssignForOverlap); } static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, @@ -634,7 +635,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, Value *Indices[1] = { ConstantInt::get(DeadWriteLength->getType(), ToRemoveSize)}; Instruction *NewDestGEP = GetElementPtrInst::CreateInBounds( - Type::getInt8Ty(DeadIntrinsic->getContext()), OrigDest, Indices, "", DeadI); + Type::getInt8Ty(DeadIntrinsic->getContext()), OrigDest, Indices, "", + DeadI->getIterator()); NewDestGEP->setDebugLoc(DeadIntrinsic->getDebugLoc()); DeadIntrinsic->setDest(NewDestGEP); } @@ -868,7 +870,7 @@ struct DSEState { PostDominatorTree &PDT, const TargetLibraryInfo &TLI, const LoopInfo &LI) : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT), - PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { + PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) { // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -900,6 +902,16 @@ struct DSEState { }); } + static void pushMemUses(MemoryAccess *Acc, + SmallVectorImpl<MemoryAccess *> &WorkList, + SmallPtrSetImpl<MemoryAccess *> &Visited) { + for (Use &U : Acc->uses()) { + auto *MA = cast<MemoryAccess>(U.getUser()); + if (Visited.insert(MA).second) + WorkList.push_back(MA); + } + }; + LocationSize strengthenLocationSize(const Instruction *I, LocationSize Size) const { if (auto *CB = dyn_cast<CallBase>(I)) { @@ -1155,26 +1167,14 @@ struct DSEState { } /// Returns true if \p Def is not read before returning from the function. - bool isWriteAtEndOfFunction(MemoryDef *Def) { + bool isWriteAtEndOfFunction(MemoryDef *Def, const MemoryLocation &DefLoc) { LLVM_DEBUG(dbgs() << " Check if def " << *Def << " (" << *Def->getMemoryInst() << ") is at the end the function \n"); - - auto MaybeLoc = getLocForWrite(Def->getMemoryInst()); - if (!MaybeLoc) { - LLVM_DEBUG(dbgs() << " ... could not get location for write.\n"); - return false; - } - SmallVector<MemoryAccess *, 4> WorkList; SmallPtrSet<MemoryAccess *, 8> Visited; - auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) { - if (!Visited.insert(Acc).second) - return; - for (Use &U : Acc->uses()) - WorkList.push_back(cast<MemoryAccess>(U.getUser())); - }; - PushMemUses(Def); + + pushMemUses(Def, WorkList, Visited); for (unsigned I = 0; I < WorkList.size(); I++) { if (WorkList.size() >= MemorySSAScanLimit) { LLVM_DEBUG(dbgs() << " ... 
hit exploration limit.\n"); @@ -1186,22 +1186,22 @@ struct DSEState { // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same // memory location. - if (!isGuaranteedLoopInvariant(MaybeLoc->Ptr)) + if (!isGuaranteedLoopInvariant(DefLoc.Ptr)) return false; - PushMemUses(cast<MemoryPhi>(UseAccess)); + pushMemUses(cast<MemoryPhi>(UseAccess), WorkList, Visited); continue; } // TODO: Checking for aliasing is expensive. Consider reducing the amount // of times this is called and/or caching it. Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst(); - if (isReadClobber(*MaybeLoc, UseInst)) { + if (isReadClobber(DefLoc, UseInst)) { LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n"); return false; } if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) - PushMemUses(UseDef); + pushMemUses(UseDef, WorkList, Visited); } return true; } @@ -1503,12 +1503,9 @@ struct DSEState { LLVM_DEBUG(dbgs() << " Checking for reads of " << *MaybeDeadAccess << " (" << *MaybeDeadI << ")\n"); - SmallSetVector<MemoryAccess *, 32> WorkList; - auto PushMemUses = [&WorkList](MemoryAccess *Acc) { - for (Use &U : Acc->uses()) - WorkList.insert(cast<MemoryAccess>(U.getUser())); - }; - PushMemUses(MaybeDeadAccess); + SmallVector<MemoryAccess *, 32> WorkList; + SmallPtrSet<MemoryAccess *, 32> Visited; + pushMemUses(MaybeDeadAccess, WorkList, Visited); // Check if DeadDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { @@ -1532,7 +1529,7 @@ struct DSEState { continue; } LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n"); - PushMemUses(UseAccess); + pushMemUses(UseAccess, WorkList, Visited); continue; } @@ -1557,7 +1554,7 @@ struct DSEState { if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) { LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n"); - PushMemUses(UseAccess); + pushMemUses(UseAccess, WorkList, Visited); continue; } @@ -1616,7 +1613,7 @@ struct DSEState { return std::nullopt; } } else - PushMemUses(UseDef); + pushMemUses(UseDef, WorkList, Visited); } } @@ -1819,8 +1816,11 @@ struct DSEState { Instruction *DefI = Def->getMemoryInst(); auto DefLoc = getLocForWrite(DefI); - if (!DefLoc || !isRemovable(DefI)) + if (!DefLoc || !isRemovable(DefI)) { + LLVM_DEBUG(dbgs() << " ... could not get location for write or " + "instruction not removable.\n"); continue; + } // NOTE: Currently eliminating writes at the end of a function is // limited to MemoryDefs with a single underlying object, to save @@ -1831,7 +1831,7 @@ struct DSEState { if (!isInvisibleToCallerAfterRet(UO)) continue; - if (isWriteAtEndOfFunction(Def)) { + if (isWriteAtEndOfFunction(Def, *DefLoc)) { // See through pointer-to-pointer bitcasts LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end " "of the function\n"); @@ -1923,6 +1923,57 @@ struct DSEState { return true; } + // Check if there is a dominating condition, that implies that the value + // being stored in a ptr is already present in the ptr. 
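Illustrative sketch (not part of this commit) of the source pattern this new check is after: the dominating branch proves that *p already holds v, so the guarded store writes back a value that is already in memory and can be removed, provided MemorySSA shows that nothing clobbers *p between the load feeding the compare and the store.

void set_once(int *p, int v) {
  if (*p == v)   // dominating icmp eq of (load p) and v
    *p = v;      // no-op store: the value is already present
}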
+ bool dominatingConditionImpliesValue(MemoryDef *Def) { + auto *StoreI = cast<StoreInst>(Def->getMemoryInst()); + BasicBlock *StoreBB = StoreI->getParent(); + Value *StorePtr = StoreI->getPointerOperand(); + Value *StoreVal = StoreI->getValueOperand(); + + DomTreeNode *IDom = DT.getNode(StoreBB)->getIDom(); + if (!IDom) + return false; + + auto *BI = dyn_cast<BranchInst>(IDom->getBlock()->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + // In case both blocks are the same, it is not possible to determine + // if optimization is possible. (We would not want to optimize a store + // in the FalseBB if condition is true and vice versa.) + if (BI->getSuccessor(0) == BI->getSuccessor(1)) + return false; + + Instruction *ICmpL; + ICmpInst::Predicate Pred; + if (!match(BI->getCondition(), + m_c_ICmp(Pred, + m_CombineAnd(m_Load(m_Specific(StorePtr)), + m_Instruction(ICmpL)), + m_Specific(StoreVal))) || + !ICmpInst::isEquality(Pred)) + return false; + + // In case the else blocks also branches to the if block or the other way + // around it is not possible to determine if the optimization is possible. + if (Pred == ICmpInst::ICMP_EQ && + !DT.dominates(BasicBlockEdge(BI->getParent(), BI->getSuccessor(0)), + StoreBB)) + return false; + + if (Pred == ICmpInst::ICMP_NE && + !DT.dominates(BasicBlockEdge(BI->getParent(), BI->getSuccessor(1)), + StoreBB)) + return false; + + MemoryAccess *LoadAcc = MSSA.getMemoryAccess(ICmpL); + MemoryAccess *ClobAcc = + MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def, BatchAA); + + return MSSA.dominates(ClobAcc, LoadAcc); + } + /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { @@ -1953,6 +2004,9 @@ struct DSEState { if (!Store) return false; + if (dominatingConditionImpliesValue(Def)) + return true; + if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) { if (LoadI->getPointerOperand() == Store->getOperand(1)) { // Get the defining access for the load. @@ -2053,10 +2107,12 @@ struct DSEState { if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { if (auto *SI = dyn_cast<StoreInst>(DefInst)) { // MemSetInst must have a write location. - MemoryLocation UpperLoc = *getLocForWrite(UpperInst); + auto UpperLoc = getLocForWrite(UpperInst); + if (!UpperLoc) + return false; int64_t InstWriteOffset = 0; int64_t DepWriteOffset = 0; - auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc, + auto OR = isOverwrite(UpperInst, DefInst, *UpperLoc, *MaybeDefLoc, InstWriteOffset, DepWriteOffset); Value *StoredByte = isBytewiseValue(SI->getValueOperand(), DL); return StoredByte && StoredByte == MemSetI->getOperand(1) && diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index 57d3f312186e..d8aea1e810e9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -215,6 +215,7 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, RemInst = RealRem; // And replace the original instruction with the new one. OrigRemInst->replaceAllUsesWith(RealRem); + RealRem->setDebugLoc(OrigRemInst->getDebugLoc()); OrigRemInst->eraseFromParent(); NumRecomposed++; // Note that we have left ((X / Y) * Y) around. 
@@ -366,7 +367,9 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, if (!DivDominates) DivInst->moveBefore(RemInst); Mul->insertAfter(RemInst); + Mul->setDebugLoc(RemInst->getDebugLoc()); Sub->insertAfter(Mul); + Sub->setDebugLoc(RemInst->getDebugLoc()); // If DivInst has the exact flag, remove it. Otherwise this optimization // may replace a well-defined value 'X % Y' with poison. @@ -381,16 +384,19 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // %mul = mul %div, 1 // %mul = undef // %rem = sub %x, %mul // %rem = undef - undef = undef // If X is not frozen, %rem becomes undef after transformation. - // TODO: We need a undef-specific checking function in ValueTracking - if (!isGuaranteedNotToBeUndefOrPoison(X, nullptr, DivInst, &DT)) { - auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst); + if (!isGuaranteedNotToBeUndef(X, nullptr, DivInst, &DT)) { + auto *FrX = + new FreezeInst(X, X->getName() + ".frozen", DivInst->getIterator()); + FrX->setDebugLoc(DivInst->getDebugLoc()); DivInst->setOperand(0, FrX); Sub->setOperand(0, FrX); } // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0, // but %rem in tgt can be one of many integer values. - if (!isGuaranteedNotToBeUndefOrPoison(Y, nullptr, DivInst, &DT)) { - auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst); + if (!isGuaranteedNotToBeUndef(Y, nullptr, DivInst, &DT)) { + auto *FrY = + new FreezeInst(Y, Y->getName() + ".frozen", DivInst->getIterator()); + FrY->setDebugLoc(DivInst->getDebugLoc()); DivInst->setOperand(1, FrY); Mul->setOperand(1, FrY); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index f736d429cb63..cf11f5bc885a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -1833,7 +1833,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto *MSSA = UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr; - EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA); + EarlyCSE CSE(F.getDataLayout(), TLI, TTI, DT, AC, MSSA); if (!CSE.run()) return PreservedAnalyses::all(); @@ -1887,7 +1887,7 @@ public: auto *MSSA = UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr; - EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA); + EarlyCSE CSE(F.getDataLayout(), TLI, TTI, DT, AC, MSSA); return CSE.run(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index ad2041cd4253..213d0f389c2e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -21,7 +21,7 @@ using namespace llvm; -#define DEBUG_TYPE "flattencfg" +#define DEBUG_TYPE "flatten-cfg" namespace { struct FlattenCFGLegacyPass : public FunctionPass { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp index ccca8bcc1a56..a4a1438dbe41 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -311,7 +311,7 @@ void Float2IntPass::walkForwards() { } // If there is a valid transform to be done, do it. 
-bool Float2IntPass::validateAndTransform() { +bool Float2IntPass::validateAndTransform(const DataLayout &DL) { bool MadeChange = false; // Iterate over every disjoint partition of the def-use graph. @@ -359,9 +359,7 @@ bool Float2IntPass::validateAndTransform() { // The number of bits required is the maximum of the upper and // lower limits, plus one so it can be signed. - unsigned MinBW = std::max(R.getLower().getSignificantBits(), - R.getUpper().getSignificantBits()) + - 1; + unsigned MinBW = R.getMinSignedBits() + 1; LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n"); // If we've run off the realms of the exactly representable integers, @@ -376,15 +374,23 @@ bool Float2IntPass::validateAndTransform() { LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n"); continue; } - if (MinBW > 64) { - LLVM_DEBUG( - dbgs() << "F2I: Value requires more than 64 bits to represent!\n"); - continue; - } - // OK, R is known to be representable. Now pick a type for it. - // FIXME: Pick the smallest legal type that will fit. - Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx); + // OK, R is known to be representable. + // Pick the smallest legal type that will fit. + Type *Ty = DL.getSmallestLegalIntType(*Ctx, MinBW); + if (!Ty) { + // Every supported target supports 64-bit and 32-bit integers, + // so fallback to a 32 or 64-bit integer if the value fits. + if (MinBW <= 32) { + Ty = Type::getInt32Ty(*Ctx); + } else if (MinBW <= 64) { + Ty = Type::getInt64Ty(*Ctx); + } else { + LLVM_DEBUG(dbgs() << "F2I: Value requires more bits to represent than " + "the target supports!\n"); + continue; + } + } for (auto MI = ECs.member_begin(It), ME = ECs.member_end(); MI != ME; ++MI) @@ -491,7 +497,8 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) { walkBackwards(); walkForwards(); - bool Modified = validateAndTransform(); + const DataLayout &DL = F.getDataLayout(); + bool Modified = validateAndTransform(DL); if (Modified) cleanup(); return Modified; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp index e36578f3de7a..db39d8621d07 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -419,7 +420,7 @@ GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { GVNPass::Expression GVNPass::ValueTable::createGEPExpr(GetElementPtrInst *GEP) { Expression E; Type *PtrTy = GEP->getType()->getScalarType(); - const DataLayout &DL = GEP->getModule()->getDataLayout(); + const DataLayout &DL = GEP->getDataLayout(); unsigned BitWidth = DL.getIndexTypeSizeInBits(PtrTy); MapVector<Value *, APInt> VariableOffsets; APInt ConstantOffset(BitWidth, 0); @@ -725,6 +726,69 @@ void GVNPass::ValueTable::verifyRemoved(const Value *V) const { } //===----------------------------------------------------------------------===// +// LeaderMap External Functions +//===----------------------------------------------------------------------===// + +/// Push a new Value to the LeaderTable onto the list for its value number. 
+void GVNPass::LeaderMap::insert(uint32_t N, Value *V, const BasicBlock *BB) { + LeaderListNode &Curr = NumToLeaders[N]; + if (!Curr.Entry.Val) { + Curr.Entry.Val = V; + Curr.Entry.BB = BB; + return; + } + + LeaderListNode *Node = TableAllocator.Allocate<LeaderListNode>(); + Node->Entry.Val = V; + Node->Entry.BB = BB; + Node->Next = Curr.Next; + Curr.Next = Node; +} + +/// Scan the list of values corresponding to a given +/// value number, and remove the given instruction if encountered. +void GVNPass::LeaderMap::erase(uint32_t N, Instruction *I, + const BasicBlock *BB) { + LeaderListNode *Prev = nullptr; + LeaderListNode *Curr = &NumToLeaders[N]; + + while (Curr && (Curr->Entry.Val != I || Curr->Entry.BB != BB)) { + Prev = Curr; + Curr = Curr->Next; + } + + if (!Curr) + return; + + if (Prev) { + Prev->Next = Curr->Next; + } else { + if (!Curr->Next) { + Curr->Entry.Val = nullptr; + Curr->Entry.BB = nullptr; + } else { + LeaderListNode *Next = Curr->Next; + Curr->Entry.Val = Next->Entry.Val; + Curr->Entry.BB = Next->Entry.BB; + Curr->Next = Next->Next; + } + } +} + +void GVNPass::LeaderMap::verifyRemoved(const Value *V) const { + // Walk through the value number scope to make sure the instruction isn't + // ferreted away in it. + for (const auto &I : NumToLeaders) { + (void)I; + assert(I.second.Entry.Val != V && "Inst still in value numbering scope!"); + assert( + std::none_of(leader_iterator(&I.second), leader_iterator(nullptr), + [=](const LeaderTableEntry &E) { return E.Val == V; }) && + "Inst still in value numbering scope!"); + } +} + +//===----------------------------------------------------------------------===// // GVN Pass //===----------------------------------------------------------------------===// @@ -1008,7 +1072,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, GVNPass &gvn) const { Value *Res; Type *LoadTy = Load->getType(); - const DataLayout &DL = Load->getModule()->getDataLayout(); + const DataLayout &DL = Load->getDataLayout(); if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { @@ -1056,7 +1120,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, // Introduce a new value select for a load from an eligible pointer select. 
SelectInst *Sel = getSelectValue(); assert(V1 && V2 && "both value operands of the select must be present"); - Res = SelectInst::Create(Sel->getCondition(), V1, V2, "", Sel); + Res = + SelectInst::Create(Sel->getCondition(), V1, V2, "", Sel->getIterator()); } else { llvm_unreachable("Should not materialize value from dead block"); } @@ -1173,7 +1238,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, Instruction *DepInst = DepInfo.getInst(); - const DataLayout &DL = Load->getModule()->getDataLayout(); + const DataLayout &DL = Load->getDataLayout(); if (DepInfo.isClobber()) { // If the dependence is to a store that writes to a superset of the bits // read by the load, we can extract the bits we need for the load from the @@ -1412,10 +1477,10 @@ void GVNPass::eliminatePartiallyRedundantLoad( BasicBlock *UnavailableBlock = AvailableLoad.first; Value *LoadPtr = AvailableLoad.second; - auto *NewLoad = - new LoadInst(Load->getType(), LoadPtr, Load->getName() + ".pre", - Load->isVolatile(), Load->getAlign(), Load->getOrdering(), - Load->getSyncScopeID(), UnavailableBlock->getTerminator()); + auto *NewLoad = new LoadInst( + Load->getType(), LoadPtr, Load->getName() + ".pre", Load->isVolatile(), + Load->getAlign(), Load->getOrdering(), Load->getSyncScopeID(), + UnavailableBlock->getTerminator()->getIterator()); NewLoad->setDebugLoc(Load->getDebugLoc()); if (MSSAU) { auto *NewAccess = MSSAU->createMemoryAccessInBB( @@ -1465,7 +1530,7 @@ void GVNPass::eliminatePartiallyRedundantLoad( OldLoad->replaceAllUsesWith(NewLoad); replaceValuesPerBlockEntry(ValuesPerBlock, OldLoad, NewLoad); if (uint32_t ValNo = VN.lookup(OldLoad, false)) - removeFromLeaderTable(ValNo, OldLoad, OldLoad->getParent()); + LeaderTable.erase(ValNo, OldLoad, OldLoad->getParent()); VN.erase(OldLoad); removeInstruction(OldLoad); } @@ -1658,7 +1723,7 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; - const DataLayout &DL = Load->getModule()->getDataLayout(); + const DataLayout &DL = Load->getDataLayout(); SmallVector<Instruction*, 8> NewInsts; for (auto &PredLoad : PredLoads) { BasicBlock *UnavailablePred = PredLoad.first; @@ -1994,8 +2059,9 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { // Insert a new store to null instruction before the load to indicate that // this code is not reachable. FIXME: We could insert unreachable // instruction directly because we can modify the CFG. - auto *NewS = new StoreInst(PoisonValue::get(Int8Ty), - Constant::getNullValue(PtrTy), IntrinsicI); + auto *NewS = + new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(PtrTy), + IntrinsicI->getIterator()); if (MSSAU) { const MemoryUseOrDef *FirstNonDom = nullptr; const auto *AL = @@ -2201,10 +2267,9 @@ GVNPass::ValueTable::assignExpNewValueNum(Expression &Exp) { /// defined in \p BB. bool GVNPass::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, GVNPass &Gvn) { - LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; - while (Vals && Vals->BB == BB) - Vals = Vals->Next; - return !Vals; + return all_of( + Gvn.LeaderTable.getLeaders(Num), + [=](const LeaderMap::LeaderTableEntry &L) { return L.BB == BB; }); } /// Wrap phiTranslateImpl to provide caching functionality. 
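Illustrative sketch (not part of this commit) of the select-based load availability handled earlier in this file (the SelectInst::Create materialization above): when the values behind both arms of a pointer select are already known, GVN can replace the load through the select with a select of the two loaded values.

int pick(bool c, int *p, int *q) {
  int a = *p, b = *q;
  int *sel = c ? p : q;
  return a + *sel;   // *sel is redundant: it folds to (c ? a : b)
}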
@@ -2226,12 +2291,11 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *PhiBlock, GVNPass &Gvn) { CallInst *Call = nullptr; - LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; - while (Vals) { - Call = dyn_cast<CallInst>(Vals->Val); + auto Leaders = Gvn.LeaderTable.getLeaders(Num); + for (const auto &Entry : Leaders) { + Call = dyn_cast<CallInst>(Entry.Val); if (Call && Call->getParent() == PhiBlock) break; - Vals = Vals->Next; } if (AA->doesNotAccessMemory(Call)) @@ -2324,23 +2388,17 @@ void GVNPass::ValueTable::eraseTranslateCacheEntry( // question. This is fast because dominator tree queries consist of only // a few comparisons of DFS numbers. Value *GVNPass::findLeader(const BasicBlock *BB, uint32_t num) { - LeaderTableEntry Vals = LeaderTable[num]; - if (!Vals.Val) return nullptr; + auto Leaders = LeaderTable.getLeaders(num); + if (Leaders.empty()) + return nullptr; Value *Val = nullptr; - if (DT->dominates(Vals.BB, BB)) { - Val = Vals.Val; - if (isa<Constant>(Val)) return Val; - } - - LeaderTableEntry* Next = Vals.Next; - while (Next) { - if (DT->dominates(Next->BB, BB)) { - if (isa<Constant>(Next->Val)) return Next->Val; - if (!Val) Val = Next->Val; + for (const auto &Entry : Leaders) { + if (DT->dominates(Entry.BB, BB)) { + Val = Entry.Val; + if (isa<Constant>(Val)) + return Val; } - - Next = Next->Next; } return Val; @@ -2417,6 +2475,10 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) std::swap(LHS, RHS); assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); + const DataLayout &DL = + isa<Argument>(LHS) + ? cast<Argument>(LHS)->getParent()->getDataLayout() + : cast<Instruction>(LHS)->getDataLayout(); // If there is no obvious reason to prefer the left-hand side over the // right-hand side, ensure the longest lived term is on the right-hand side, @@ -2443,23 +2505,32 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // using the leader table is about compiling faster, not optimizing better). // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. - if (RootDominatesEnd && !isa<Instruction>(RHS)) - addToLeaderTable(LVN, RHS, Root.getEnd()); + if (RootDominatesEnd && !isa<Instruction>(RHS) && + canReplacePointersIfEqual(LHS, RHS, DL)) + LeaderTable.insert(LVN, RHS, Root.getEnd()); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { + // Create a callback that captures the DL. + auto canReplacePointersCallBack = [&DL](const Use &U, const Value *To) { + return canReplacePointersInUseIfEqual(U, To, DL); + }; unsigned NumReplacements = DominatesByEdge - ? replaceDominatedUsesWith(LHS, RHS, *DT, Root) - : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart()); - - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; - // Cached information for anything that uses LHS will be invalid. - if (MD) - MD->invalidateCachedPointerInfo(LHS); + ? replaceDominatedUsesWithIf(LHS, RHS, *DT, Root, + canReplacePointersCallBack) + : replaceDominatedUsesWithIf(LHS, RHS, *DT, Root.getStart(), + canReplacePointersCallBack); + + if (NumReplacements > 0) { + Changed = true; + NumGVNEqProp += NumReplacements; + // Cached information for anything that uses LHS will be invalid. 
+ if (MD) + MD->invalidateCachedPointerInfo(LHS); + } } // Now try to deduce additional equalities from this one. For example, if @@ -2530,7 +2601,7 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. if (RootDominatesEnd) - addToLeaderTable(Num, NotVal, Root.getEnd()); + LeaderTable.insert(Num, NotVal, Root.getEnd()); continue; } @@ -2550,7 +2621,7 @@ bool GVNPass::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - const DataLayout &DL = I->getModule()->getDataLayout(); + const DataLayout &DL = I->getDataLayout(); if (Value *V = simplifyInstruction(I, {DL, TLI, DT, AC})) { bool Changed = false; if (!I->use_empty()) { @@ -2580,7 +2651,7 @@ bool GVNPass::processInstruction(Instruction *I) { return true; unsigned Num = VN.lookupOrAdd(Load); - addToLeaderTable(Num, Load, Load->getParent()); + LeaderTable.insert(Num, Load, Load->getParent()); return false; } @@ -2622,8 +2693,8 @@ bool GVNPass::processInstruction(Instruction *I) { // Remember how many outgoing edges there are to every successor. SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges; - for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i) - ++SwitchEdges[SI->getSuccessor(i)]; + for (BasicBlock *Succ : successors(Parent)) + ++SwitchEdges[Succ]; for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { @@ -2648,7 +2719,7 @@ bool GVNPass::processInstruction(Instruction *I) { // Allocations are always uniquely numbered, so we can save time and memory // by fast failing them. if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) { - addToLeaderTable(Num, I, I->getParent()); + LeaderTable.insert(Num, I, I->getParent()); return false; } @@ -2656,7 +2727,7 @@ bool GVNPass::processInstruction(Instruction *I) { // need to do a lookup to see if the number already exists // somewhere in the domtree: it can't! if (Num >= NextNum) { - addToLeaderTable(Num, I, I->getParent()); + LeaderTable.insert(Num, I, I->getParent()); return false; } @@ -2665,7 +2736,7 @@ bool GVNPass::processInstruction(Instruction *I) { Value *Repl = findLeader(I->getParent(), Num); if (!Repl) { // Failure, just remember this instance for future use. - addToLeaderTable(Num, I, I->getParent()); + LeaderTable.insert(Num, I, I->getParent()); return false; } @@ -2706,7 +2777,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, bool Changed = false; bool ShouldContinue = true; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (BasicBlock &BB : llvm::make_early_inc_range(F)) { @@ -2716,6 +2787,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, Changed |= removedBlock; } + DTU.flush(); unsigned Iteration = 0; while (ShouldContinue) { @@ -2859,7 +2931,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, VN.add(Instr, Num); // Update the availability map to include the new instruction. 
- addToLeaderTable(Num, Instr, Pred); + LeaderTable.insert(Num, Instr, Pred); return true; } @@ -3010,13 +3082,13 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { // After creating a new PHI for ValNo, the phi translate result for ValNo will // be changed, so erase the related stale entries in phi translate cache. VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock); - addToLeaderTable(ValNo, Phi, CurrentBlock); + LeaderTable.insert(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); if (MD && Phi->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); - removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + LeaderTable.erase(ValNo, CurInst, CurrentBlock); LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); removeInstruction(CurInst); @@ -3110,7 +3182,6 @@ void GVNPass::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); BlockRPONumber.clear(); - TableAllocator.Reset(); ICF->clear(); InvalidBlockRPONumbers = true; } @@ -3130,18 +3201,7 @@ void GVNPass::removeInstruction(Instruction *I) { /// internal data structures. void GVNPass::verifyRemoved(const Instruction *Inst) const { VN.verifyRemoved(Inst); - - // Walk through the value number scope to make sure the instruction isn't - // ferreted away in it. - for (const auto &I : LeaderTable) { - const LeaderTableEntry *Node = &I.second; - assert(Node->Val != Inst && "Inst still in value numbering scope!"); - - while (Node->Next) { - Node = Node->Next; - assert(Node->Val != Inst && "Inst still in value numbering scope!"); - } - } + LeaderTable.verifyRemoved(Inst); } /// BB is declared dead, which implied other blocks become dead as well. This @@ -3268,7 +3328,7 @@ void GVNPass::assignValNumForDeadCode() { for (BasicBlock *BB : DeadBlocks) { for (Instruction &Inst : *BB) { unsigned ValNum = VN.lookupOrAdd(&Inst); - addToLeaderTable(ValNum, &Inst, BB); + LeaderTable.insert(ValNum, &Inst, BB); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp index b564f00eb9d1..b5333c532280 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -238,18 +238,6 @@ public: const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } }; -static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { - static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; - combineMetadata(ReplInst, I, KnownIDs, true); -} - // This pass hoists common computations across branches sharing common // dominator. The primary goal is to reduce the code size, and in some // cases reduce critical path (by exposing more ILP). @@ -951,6 +939,14 @@ void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt, OtherGep = cast<GetElementPtrInst>( cast<StoreInst>(OtherInst)->getPointerOperand()); ClonedGep->andIRFlags(OtherGep); + + // Merge debug locations of GEPs, because the hoisted GEP replaces those + // in branches. When cloning, ClonedGep preserves the debug location of + // Gepd, so Gep is skipped to avoid merging it twice. 
+ if (OtherGep != Gep) { + ClonedGep->applyMergedLocation(ClonedGep->getDebugLoc(), + OtherGep->getDebugLoc()); + } } // Replace uses of Gep with ClonedGep in Repl. @@ -988,8 +984,8 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl, MSSAUpdater->removeMemoryAccess(OldMA); } + combineMetadataForCSE(Repl, I, true); Repl->andIRFlags(I); - combineKnownMetadata(Repl, I); I->replaceAllUsesWith(Repl); // Also invalidate the Alias Analysis cache. MD->removeInstruction(I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp index 2b38831139a5..3dfa2dd9df27 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -132,7 +132,7 @@ public: ActiveBlocks.remove(BB); continue; } - Insts.push_back(BB->getTerminator()->getPrevNode()); + Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction()); } if (Insts.empty()) Fail = true; @@ -168,7 +168,7 @@ public: if (Inst == &Inst->getParent()->front()) ActiveBlocks.remove(Inst->getParent()); else - NewInsts.push_back(Inst->getPrevNode()); + NewInsts.push_back(Inst->getPrevNonDebugInstruction()); } if (NewInsts.empty()) { Fail = true; @@ -226,12 +226,22 @@ class ModelledPHI { public: ModelledPHI() = default; - ModelledPHI(const PHINode *PN) { - // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order. - SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops; + ModelledPHI(const PHINode *PN, + const DenseMap<const BasicBlock *, unsigned> &BlockOrder) { + // BasicBlock comes first so we sort by basic block pointer order, + // then by value pointer order. No need to call `verifyModelledPHI` + // As the Values and Blocks are populated in a deterministic order. + using OpsType = std::pair<BasicBlock *, Value *>; + SmallVector<OpsType, 4> Ops; for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)}); - llvm::sort(Ops); + + auto ComesBefore = [BlockOrder](OpsType O1, OpsType O2) { + return BlockOrder.lookup(O1.first) < BlockOrder.lookup(O2.first); + }; + // Sort in a deterministic order. + llvm::sort(Ops, ComesBefore); + for (auto &P : Ops) { Blocks.push_back(P.first); Values.push_back(P.second); @@ -247,16 +257,38 @@ public: return M; } + void + verifyModelledPHI(const DenseMap<const BasicBlock *, unsigned> &BlockOrder) { + assert(Values.size() > 1 && Blocks.size() > 1 && + "Modelling PHI with less than 2 values"); + auto ComesBefore = [BlockOrder](const BasicBlock *BB1, + const BasicBlock *BB2) { + return BlockOrder.lookup(BB1) < BlockOrder.lookup(BB2); + }; + assert(llvm::is_sorted(Blocks, ComesBefore)); + int C = 0; + for (const Value *V : Values) { + if (!isa<UndefValue>(V)) { + assert(cast<Instruction>(V)->getParent() == Blocks[C]); + (void)C; + } + C++; + } + } /// Create a PHI from an array of incoming values and incoming blocks. - template <typename VArray, typename BArray> - ModelledPHI(const VArray &V, const BArray &B) { + ModelledPHI(SmallVectorImpl<Instruction *> &V, + SmallSetVector<BasicBlock *, 4> &B, + const DenseMap<const BasicBlock *, unsigned> &BlockOrder) { + // The order of Values and Blocks are already ordered by the caller. llvm::copy(V, std::back_inserter(Values)); llvm::copy(B, std::back_inserter(Blocks)); + verifyModelledPHI(BlockOrder); } /// Create a PHI from [I[OpNum] for I in Insts]. 
- template <typename BArray> - ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) { + /// TODO: Figure out a way to verifyModelledPHI in this constructor. + ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, + SmallSetVector<BasicBlock *, 4> &B) { llvm::copy(B, std::back_inserter(Blocks)); for (auto *I : Insts) Values.push_back(I->getOperand(OpNum)); @@ -297,7 +329,8 @@ public: // Hash functor unsigned hash() const { - return (unsigned)hash_combine_range(Values.begin(), Values.end()); + // Is deterministic because Values are saved in a specific order. + return (unsigned)hash_combine_range(Values.begin(), Values.end()); } bool operator==(const ModelledPHI &Other) const { @@ -566,7 +599,7 @@ public: class GVNSink { public: - GVNSink() = default; + GVNSink() {} bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() @@ -575,6 +608,16 @@ public: unsigned NumSunk = 0; ReversePostOrderTraversal<Function*> RPOT(&F); VN.setReachableBBs(BasicBlocksSet(RPOT.begin(), RPOT.end())); + // Populate reverse post-order to order basic blocks in deterministic + // order. Any arbitrary ordering will work in this case as long as they are + // deterministic. The node ordering of newly created basic blocks + // are irrelevant because RPOT(for computing sinkable candidates) is also + // obtained ahead of time and only their order are relevant for this pass. + unsigned NodeOrdering = 0; + RPOTOrder[*RPOT.begin()] = ++NodeOrdering; + for (auto *BB : RPOT) + if (!pred_empty(BB)) + RPOTOrder[BB] = ++NodeOrdering; for (auto *N : RPOT) NumSunk += sinkBB(N); @@ -583,6 +626,7 @@ public: private: ValueTable VN; + DenseMap<const BasicBlock *, unsigned> RPOTOrder; bool shouldAvoidSinkingInstruction(Instruction *I) { // These instructions may change or break semantics if moved. @@ -603,7 +647,7 @@ private: void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, SmallPtrSetImpl<Value *> &PHIContents) { for (PHINode &PN : BB->phis()) { - auto MPHI = ModelledPHI(&PN); + auto MPHI = ModelledPHI(&PN, RPOTOrder); PHIs.insert(MPHI); for (auto *V : MPHI.getValues()) PHIContents.insert(V); @@ -655,8 +699,7 @@ GVNSink::analyzeInstructionForSinking(LockstepReverseIterator &LRI, return std::nullopt; VNums[N]++; } - unsigned VNumToSink = - std::max_element(VNums.begin(), VNums.end(), llvm::less_second())->first; + unsigned VNumToSink = llvm::max_element(VNums, llvm::less_second())->first; if (VNums[VNumToSink] == 1) // Can't sink anything! @@ -692,7 +735,7 @@ GVNSink::analyzeInstructionForSinking(LockstepReverseIterator &LRI, } // The sunk instruction's results. - ModelledPHI NewPHI(NewInsts, ActivePreds); + ModelledPHI NewPHI(NewInsts, ActivePreds, RPOTOrder); // Does sinking this instruction render previous PHIs redundant? if (NeededPHIs.erase(NewPHI)) @@ -720,12 +763,11 @@ GVNSink::analyzeInstructionForSinking(LockstepReverseIterator &LRI, // try and continue making progress. Instruction *I0 = NewInsts[0]; - // If all instructions that are going to participate don't have the same - // number of operands, we can't do any useful PHI analysis for all operands. 
- auto hasDifferentNumOperands = [&I0](Instruction *I) { - return I->getNumOperands() != I0->getNumOperands(); + auto isNotSameOperation = [&I0](Instruction *I) { + return !I0->isSameOperationAs(I); }; - if (any_of(NewInsts, hasDifferentNumOperands)) + + if (any_of(NewInsts, isNotSameOperation)) return std::nullopt; for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) { @@ -767,6 +809,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); SmallVector<BasicBlock *, 4> Preds; for (auto *B : predecessors(BBEnd)) { + // Bailout on basic blocks without predecessor(PR42346). + if (!RPOTOrder.count(B)) + return 0; auto *T = B->getTerminator(); if (isa<BranchInst>(T) || isa<SwitchInst>(T)) Preds.push_back(B); @@ -775,7 +820,11 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { } if (Preds.size() < 2) return 0; - llvm::sort(Preds); + auto ComesBefore = [this](const BasicBlock *BB1, const BasicBlock *BB2) { + return RPOTOrder.lookup(BB1) < RPOTOrder.lookup(BB2); + }; + // Sort in a deterministic order. + llvm::sort(Preds, ComesBefore); unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. @@ -834,7 +883,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd) { SmallVector<Instruction *, 4> Insts; for (BasicBlock *BB : Blocks) - Insts.push_back(BB->getTerminator()->getPrevNode()); + Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction()); Instruction *I0 = Insts.front(); SmallVector<Value *, 4> NewOperands; @@ -872,8 +921,10 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, } for (auto *I : Insts) - if (I != I0) + if (I != I0) { I->replaceAllUsesWith(I0); + I0->applyMergedLocation(I0->getDebugLoc(), I->getDebugLoc()); + } foldPointlessPHINodes(BBEnd); // Finally nuke all instructions apart from the common instruction. @@ -890,5 +941,6 @@ PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { GVNSink G; if (!G.run(F)) return PreservedAnalyses::all(); + return PreservedAnalyses::none(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 3bbf6642a90c..e7ff2a14469c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -52,6 +52,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -121,12 +122,13 @@ static void eliminateGuard(Instruction *GuardInst, MemorySSAUpdater *MSSAU) { /// condition should stay invariant. Otherwise there can be a miscompile, like /// the one described at https://github.com/llvm/llvm-project/issues/60234. The /// safest way to do it is to expand the new condition at WC's block. 
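Illustrative sketch (not part of this commit) of the overall transformation these helpers implement; guard() below is a hypothetical stand-in for the llvm.experimental.guard / widenable-branch machinery. Guard widening folds the check of a dominated guard into a dominating one so that only a single deoptimization branch remains, and the hunks below now track the widening insertion point as a BasicBlock::iterator.

#include <cstdlib>

static void guard(bool c) { if (!c) std::abort(); }  // hypothetical stand-in

void before(int i, int j, int n, int m) {
  guard(i < n);            // dominating guard
  /* ...work... */
  guard(j < m);            // dominated guard
}

void after(int i, int j, int n, int m) {
  guard(i < n && j < m);   // widened condition at the dominating guard
  /* ...work... */         // the second guard is gone
}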
-static Instruction *findInsertionPointForWideCondition(Instruction *WCOrGuard) { +static std::optional<BasicBlock::iterator> +findInsertionPointForWideCondition(Instruction *WCOrGuard) { if (isGuard(WCOrGuard)) - return WCOrGuard; + return WCOrGuard->getIterator(); if (auto WC = extractWidenableCondition(WCOrGuard)) - return cast<Instruction>(WC); - return nullptr; + return cast<Instruction>(WC)->getIterator(); + return std::nullopt; } class GuardWideningImpl { @@ -182,30 +184,30 @@ class GuardWideningImpl { /// into \p WideningPoint. WideningScore computeWideningScore(Instruction *DominatedInstr, Instruction *ToWiden, - Instruction *WideningPoint, + BasicBlock::iterator WideningPoint, SmallVectorImpl<Value *> &ChecksToHoist, SmallVectorImpl<Value *> &ChecksToWiden); /// Helper to check if \p V can be hoisted to \p InsertPos. - bool canBeHoistedTo(const Value *V, const Instruction *InsertPos) const { + bool canBeHoistedTo(const Value *V, BasicBlock::iterator InsertPos) const { SmallPtrSet<const Instruction *, 8> Visited; return canBeHoistedTo(V, InsertPos, Visited); } - bool canBeHoistedTo(const Value *V, const Instruction *InsertPos, + bool canBeHoistedTo(const Value *V, BasicBlock::iterator InsertPos, SmallPtrSetImpl<const Instruction *> &Visited) const; bool canBeHoistedTo(const SmallVectorImpl<Value *> &Checks, - const Instruction *InsertPos) const { + BasicBlock::iterator InsertPos) const { return all_of(Checks, [&](const Value *V) { return canBeHoistedTo(V, InsertPos); }); } /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c /// canBeHoistedTo returned true. - void makeAvailableAt(Value *V, Instruction *InsertPos) const; + void makeAvailableAt(Value *V, BasicBlock::iterator InsertPos) const; void makeAvailableAt(const SmallVectorImpl<Value *> &Checks, - Instruction *InsertPos) const { + BasicBlock::iterator InsertPos) const { for (Value *V : Checks) makeAvailableAt(V, InsertPos); } @@ -217,18 +219,19 @@ class GuardWideningImpl { /// InsertPt is true then actually generate the resulting expression, make it /// available at \p InsertPt and return it in \p Result (else no change to the /// IR is made). - std::optional<Value *> mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, - SmallVectorImpl<Value *> &ChecksToWiden, - Instruction *InsertPt); + std::optional<Value *> + mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, + SmallVectorImpl<Value *> &ChecksToWiden, + std::optional<BasicBlock::iterator> InsertPt); /// Generate the logical AND of \p ChecksToHoist and \p OldCondition and make /// it available at InsertPt Value *hoistChecks(SmallVectorImpl<Value *> &ChecksToHoist, - Value *OldCondition, Instruction *InsertPt); + Value *OldCondition, BasicBlock::iterator InsertPt); /// Adds freeze to Orig and push it as far as possible very aggressively. /// Also replaces all uses of frozen instruction with frozen version. - Value *freezeAndPush(Value *Orig, Instruction *InsertPt); + Value *freezeAndPush(Value *Orig, BasicBlock::iterator InsertPt); /// Represents a range check of the form \c Base + \c Offset u< \c Length, /// with the constraint that \c Length is not negative. \c CheckInst is the @@ -294,7 +297,7 @@ class GuardWideningImpl { /// for the price of computing only one of the set of expressions? 
bool isWideningCondProfitable(SmallVectorImpl<Value *> &ChecksToHoist, SmallVectorImpl<Value *> &ChecksToWiden) { - return mergeChecks(ChecksToHoist, ChecksToWiden, /*InsertPt=*/nullptr) + return mergeChecks(ChecksToHoist, ChecksToWiden, /*InsertPt=*/std::nullopt) .has_value(); } @@ -302,11 +305,11 @@ class GuardWideningImpl { void widenGuard(SmallVectorImpl<Value *> &ChecksToHoist, SmallVectorImpl<Value *> &ChecksToWiden, Instruction *ToWiden) { - Instruction *InsertPt = findInsertionPointForWideCondition(ToWiden); + auto InsertPt = findInsertionPointForWideCondition(ToWiden); auto MergedCheck = mergeChecks(ChecksToHoist, ChecksToWiden, InsertPt); Value *Result = MergedCheck ? *MergedCheck : hoistChecks(ChecksToHoist, - getCondition(ToWiden), InsertPt); + getCondition(ToWiden), *InsertPt); if (isGuardAsWidenableBranch(ToWiden)) { setWidenableBranchCond(cast<BranchInst>(ToWiden), Result); @@ -417,12 +420,12 @@ bool GuardWideningImpl::eliminateInstrViaWidening( assert((i == (e - 1)) == (Instr->getParent() == CurBB) && "Bad DFS?"); for (auto *Candidate : make_range(I, E)) { - auto *WideningPoint = findInsertionPointForWideCondition(Candidate); + auto WideningPoint = findInsertionPointForWideCondition(Candidate); if (!WideningPoint) continue; SmallVector<Value *> CandidateChecks; parseWidenableGuard(Candidate, CandidateChecks); - auto Score = computeWideningScore(Instr, Candidate, WideningPoint, + auto Score = computeWideningScore(Instr, Candidate, *WideningPoint, ChecksToHoist, CandidateChecks); LLVM_DEBUG(dbgs() << "Score between " << *Instr << " and " << *Candidate << " is " << scoreTypeToString(Score) << "\n"); @@ -456,7 +459,7 @@ bool GuardWideningImpl::eliminateInstrViaWidening( GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( Instruction *DominatedInstr, Instruction *ToWiden, - Instruction *WideningPoint, SmallVectorImpl<Value *> &ChecksToHoist, + BasicBlock::iterator WideningPoint, SmallVectorImpl<Value *> &ChecksToHoist, SmallVectorImpl<Value *> &ChecksToWiden) { Loop *DominatedInstrLoop = LI.getLoopFor(DominatedInstr->getParent()); Loop *DominatingGuardLoop = LI.getLoopFor(WideningPoint->getParent()); @@ -559,7 +562,7 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( } bool GuardWideningImpl::canBeHoistedTo( - const Value *V, const Instruction *Loc, + const Value *V, BasicBlock::iterator Loc, SmallPtrSetImpl<const Instruction *> &Visited) const { auto *Inst = dyn_cast<Instruction>(V); if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst)) @@ -580,7 +583,8 @@ bool GuardWideningImpl::canBeHoistedTo( [&](Value *Op) { return canBeHoistedTo(Op, Loc, Visited); }); } -void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const { +void GuardWideningImpl::makeAvailableAt(Value *V, + BasicBlock::iterator Loc) const { auto *Inst = dyn_cast<Instruction>(V); if (!Inst || DT.dominates(Inst, Loc)) return; @@ -592,7 +596,7 @@ void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const { for (Value *Op : Inst->operands()) makeAvailableAt(Op, Loc); - Inst->moveBefore(Loc); + Inst->moveBefore(*Loc->getParent(), Loc); } // Return Instruction before which we can insert freeze for the value V as close @@ -621,14 +625,15 @@ getFreezeInsertPt(Value *V, const DominatorTree &DT) { return Res; } -Value *GuardWideningImpl::freezeAndPush(Value *Orig, Instruction *InsertPt) { +Value *GuardWideningImpl::freezeAndPush(Value *Orig, + BasicBlock::iterator InsertPt) { if (isGuaranteedNotToBePoison(Orig, nullptr, InsertPt, &DT)) 
return Orig; std::optional<BasicBlock::iterator> InsertPtAtDef = getFreezeInsertPt(Orig, DT); if (!InsertPtAtDef) { FreezeInst *FI = new FreezeInst(Orig, "gw.freeze"); - FI->insertBefore(InsertPt); + FI->insertBefore(*InsertPt->getParent(), InsertPt); return FI; } if (isa<Constant>(Orig) || isa<GlobalValue>(Orig)) { @@ -695,7 +700,7 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig, Instruction *InsertPt) { Worklist.push_back(U.get()); } for (Instruction *I : DropPoisonFlags) - I->dropPoisonGeneratingFlagsAndMetadata(); + I->dropPoisonGeneratingAnnotations(); Value *Result = Orig; for (Value *V : NeedFreeze) { @@ -715,7 +720,7 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig, Instruction *InsertPt) { std::optional<Value *> GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, SmallVectorImpl<Value *> &ChecksToWiden, - Instruction *InsertPt) { + std::optional<BasicBlock::iterator> InsertPt) { using namespace llvm::PatternMatch; Value *Result = nullptr; @@ -747,10 +752,10 @@ GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, if (Intersect->getEquivalentICmp(Pred, NewRHSAP)) { if (InsertPt) { ConstantInt *NewRHS = - ConstantInt::get(InsertPt->getContext(), NewRHSAP); - assert(canBeHoistedTo(LHS, InsertPt) && "must be"); - makeAvailableAt(LHS, InsertPt); - Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk"); + ConstantInt::get((*InsertPt)->getContext(), NewRHSAP); + assert(canBeHoistedTo(LHS, *InsertPt) && "must be"); + makeAvailableAt(LHS, *InsertPt); + Result = new ICmpInst(*InsertPt, Pred, LHS, NewRHS, "wide.chk"); } return Result; } @@ -765,16 +770,16 @@ GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, combineRangeChecks(Checks, CombinedChecks)) { if (InsertPt) { for (auto &RC : CombinedChecks) { - makeAvailableAt(RC.getCheckInst(), InsertPt); + makeAvailableAt(RC.getCheckInst(), *InsertPt); if (Result) Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "", - InsertPt); + *InsertPt); else Result = RC.getCheckInst(); } assert(Result && "Failed to find result value"); Result->setName("wide.chk"); - Result = freezeAndPush(Result, InsertPt); + Result = freezeAndPush(Result, *InsertPt); } return Result; } @@ -786,9 +791,9 @@ GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, Value *GuardWideningImpl::hoistChecks(SmallVectorImpl<Value *> &ChecksToHoist, Value *OldCondition, - Instruction *InsertPt) { + BasicBlock::iterator InsertPt) { assert(!ChecksToHoist.empty()); - IRBuilder<> Builder(InsertPt); + IRBuilder<> Builder(InsertPt->getParent(), InsertPt); makeAvailableAt(ChecksToHoist, InsertPt); makeAvailableAt(OldCondition, InsertPt); Value *Result = Builder.CreateAnd(ChecksToHoist); @@ -812,7 +817,7 @@ bool GuardWideningImpl::parseRangeChecks( if (IC->getPredicate() == ICmpInst::ICMP_UGT) std::swap(CmpLHS, CmpRHS); - auto &DL = IC->getModule()->getDataLayout(); + auto &DL = IC->getDataLayout(); GuardWideningImpl::RangeCheck Check( CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())), diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 41c4d6236173..5e2131b0b180 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -70,6 +70,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include 
"llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -137,6 +138,8 @@ class IndVarSimplify { SmallVector<WeakTrackingVH, 16> DeadInsts; bool WidenIndVars; + bool RunUnswitching = false; + bool handleFloatingPointIV(Loop *L, PHINode *PH); bool rewriteNonIntegerIVs(Loop *L); @@ -170,6 +173,8 @@ public: } bool run(Loop *L); + + bool runUnswitching() const { return RunUnswitching; } }; } // end anonymous namespace @@ -350,18 +355,22 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); // Insert new integer induction variable. - PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN); + PHINode *NewPHI = + PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator()); NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), PN->getIncomingBlock(IncomingEdge)); + NewPHI->setDebugLoc(PN->getDebugLoc()); - Value *NewAdd = - BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), - Incr->getName()+".int", Incr); + Instruction *NewAdd = + BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), + Incr->getName() + ".int", Incr->getIterator()); + NewAdd->setDebugLoc(Incr->getDebugLoc()); NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); - ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd, - ConstantInt::get(Int32Ty, ExitValue), - Compare->getName()); + ICmpInst *NewCompare = + new ICmpInst(TheBr->getIterator(), NewPred, NewAdd, + ConstantInt::get(Int32Ty, ExitValue), Compare->getName()); + NewCompare->setDebugLoc(Compare->getDebugLoc()); // In the following deletions, PN may become dead and may be deleted. // Use a WeakTrackingVH to observe whether this happens. @@ -385,8 +394,9 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { // We give preference to sitofp over uitofp because it is faster on most // platforms. if (WeakPH) { - Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - &*PN->getParent()->getFirstInsertionPt()); + Instruction *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", + PN->getParent()->getFirstInsertionPt()); + Conv->setDebugLoc(PN->getDebugLoc()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get()); } @@ -508,7 +518,7 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); - if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) + if (!Cast->getDataLayout().isLegalInteger(Width)) return; // Check that `Cast` actually extends the induction variable (we rely on this @@ -614,9 +624,11 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L, // Information about sign/zero extensions of CurrIV. 
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter, - &Visitor); + const auto &[C, U] = simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, + Rewriter, &Visitor); + Changed |= C; + RunUnswitching |= U; if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); } @@ -833,7 +845,7 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, const SCEV *BestInit = nullptr; BasicBlock *LatchBlock = L->getLoopLatch(); assert(LatchBlock && "Must be in simplified form"); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L->getHeader()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { PHINode *Phi = cast<PHINode>(I); @@ -1220,7 +1232,7 @@ static void replaceLoopPHINodesWithPreheaderValues( if (!L->contains(I)) continue; - Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout()); + Value *Res = simplifyInstruction(I, I->getDataLayout()); if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { for (User *U : I->users()) Worklist.push_back(cast<Instruction>(U)); @@ -1451,7 +1463,7 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) continue; - const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); + const DataLayout &DL = ExitingBB->getDataLayout(); const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); auto FullCR = ConstantRange::getFull(InnerBitWidth); @@ -1516,9 +1528,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { // loop varying work to loop-invariant work. auto doRotateTransform = [&]() { assert(ICmp->isUnsigned() && "must have proven unsigned already"); - auto *NewRHS = - CastInst::Create(Instruction::Trunc, RHS, LHSOp->getType(), "", - L->getLoopPreheader()->getTerminator()); + auto *NewRHS = CastInst::Create( + Instruction::Trunc, RHS, LHSOp->getType(), "", + L->getLoopPreheader()->getTerminator()->getIterator()); ICmp->setOperand(Swapped ? 1 : 0, LHSOp); ICmp->setOperand(Swapped ? 
0 : 1, NewRHS); if (LHS->use_empty()) @@ -1526,7 +1538,7 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { }; - const DataLayout &DL = ExitingBB->getModule()->getDataLayout(); + const DataLayout &DL = ExitingBB->getDataLayout(); const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); auto FullCR = ConstantRange::getFull(InnerBitWidth); @@ -1873,6 +1885,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { if (OldCond->use_empty()) DeadInsts.emplace_back(OldCond); Changed = true; + RunUnswitching = true; } return Changed; @@ -2049,7 +2062,7 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { Function *F = L.getHeader()->getParent(); - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = F->getDataLayout(); IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, WidenIndVars && AllowIVWidening); @@ -2058,6 +2071,11 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet<CFGAnalyses>(); + if (IVS.runUnswitching()) { + AM.getResult<ShouldRunExtraSimpleLoopUnswitch>(L, AR); + PA.preserve<ShouldRunExtraSimpleLoopUnswitch>(); + } + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 1bf50d79e533..c9be8ee00cdc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -642,6 +642,7 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( Type *NewPtrTy = getPtrOrVecOfPtrsWithNewAS(I->getType(), AS); auto *NewI = new AddrSpaceCastInst(I, NewPtrTy); NewI->insertAfter(I); + NewI->setDebugLoc(I->getDebugLoc()); return NewI; } @@ -821,7 +822,7 @@ unsigned InferAddressSpacesImpl::joinAddressSpaces(unsigned AS1, } bool InferAddressSpacesImpl::run(Function &F) { - DL = &F.getParent()->getDataLayout(); + DL = &F.getDataLayout(); if (AssumeDefaultIsFlatAddressSpace) FlatAddrSpace = 0; @@ -1221,6 +1222,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( Value::use_iterator I, E, Next; for (I = V->use_begin(), E = V->use_end(); I != E;) { Use &U = *I; + User *CurUser = U.getUser(); // Some users may see the same pointer operand in multiple operands. Skip // to the next instruction. @@ -1231,11 +1233,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. - U.set(NewV); + CurUser->replaceUsesOfWith(V, NewV); continue; } - User *CurUser = U.getUser(); // Skip if the current user is the new value itself. if (CurUser == NewV) continue; @@ -1311,10 +1312,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( while (isa<PHINode>(InsertPos)) ++InsertPos; - U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + // This instruction may contain multiple uses of V, update them all. 
+ CurUser->replaceUsesOfWith( + V, new AddrSpaceCastInst(NewV, V->getType(), "", InsertPos)); } else { - U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), - V->getType())); + CurUser->replaceUsesOfWith( + V, ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), + V->getType())); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAlignment.cpp index b75b8d486fbb..6e0c206bd198 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -48,7 +48,7 @@ static bool tryToImproveAlign( } bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); bool Changed = false; // Enforce preferred type alignment if possible. We do this as a separate diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index ee9452ce1c7d..326849a4eb39 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -99,7 +99,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); AssumptionCache *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); const SimplifyQuery SQ(DL, TLI, DT, AC); return runImpl(F, SQ); } @@ -125,7 +125,7 @@ PreservedAnalyses InstSimplifyPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); const SimplifyQuery SQ(DL, &TLI, &DT, &AC); bool Changed = runImpl(F, SQ); if (!Changed) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp new file mode 100644 index 000000000000..2a4f68e12525 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -0,0 +1,190 @@ +//===- JumpTableToSwitch.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/JumpTableToSwitch.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +static cl::opt<unsigned> + JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden, + cl::desc("Only split jump tables with size less or " + "equal than JumpTableSizeThreshold."), + cl::init(10)); + +// TODO: Consider adding a cost model for profitability analysis of this +// transformation. 
Currently we replace a jump table with a switch if all the +// functions in the jump table are smaller than the provided threshold. +static cl::opt<unsigned> FunctionSizeThreshold( + "jump-table-to-switch-function-size-threshold", cl::Hidden, + cl::desc("Only split jump tables containing functions whose sizes are less " + "or equal than this threshold."), + cl::init(50)); + +#define DEBUG_TYPE "jump-table-to-switch" + +namespace { +struct JumpTableTy { + Value *Index; + SmallVector<Function *, 10> Funcs; +}; +} // anonymous namespace + +static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP, + PointerType *PtrTy) { + Constant *Ptr = dyn_cast<Constant>(GEP->getPointerOperand()); + if (!Ptr) + return std::nullopt; + + GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr); + if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) + return std::nullopt; + + Function &F = *GEP->getParent()->getParent(); + const DataLayout &DL = F.getDataLayout(); + const unsigned BitWidth = + DL.getIndexSizeInBits(GEP->getPointerAddressSpace()); + MapVector<Value *, APInt> VariableOffsets; + APInt ConstantOffset(BitWidth, 0); + if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) + return std::nullopt; + if (VariableOffsets.size() != 1) + return std::nullopt; + // TODO: consider supporting more general patterns + if (!ConstantOffset.isZero()) + return std::nullopt; + APInt StrideBytes = VariableOffsets.front().second; + const uint64_t JumpTableSizeBytes = DL.getTypeAllocSize(GV->getValueType()); + if (JumpTableSizeBytes % StrideBytes.getZExtValue() != 0) + return std::nullopt; + const uint64_t N = JumpTableSizeBytes / StrideBytes.getZExtValue(); + if (N > JumpTableSizeThreshold) + return std::nullopt; + + JumpTableTy JumpTable; + JumpTable.Index = VariableOffsets.front().first; + JumpTable.Funcs.reserve(N); + for (uint64_t Index = 0; Index < N; ++Index) { + // ConstantOffset is zero. + APInt Offset = Index * StrideBytes; + Constant *C = + ConstantFoldLoadFromConst(GV->getInitializer(), PtrTy, Offset, DL); + auto *Func = dyn_cast_or_null<Function>(C); + if (!Func || Func->isDeclaration() || + Func->getInstructionCount() > FunctionSizeThreshold) + return std::nullopt; + JumpTable.Funcs.push_back(Func); + } + return JumpTable; +} + +static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, + DomTreeUpdater &DTU, + OptimizationRemarkEmitter &ORE) { + const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext()); + + SmallVector<DominatorTree::UpdateType, 8> DTUpdates; + BasicBlock *BB = CB->getParent(); + BasicBlock *Tail = SplitBlock(BB, CB, &DTU, nullptr, nullptr, + BB->getName() + Twine(".tail")); + DTUpdates.push_back({DominatorTree::Delete, BB, Tail}); + BB->getTerminator()->eraseFromParent(); + + Function &F = *BB->getParent(); + BasicBlock *BBUnreachable = BasicBlock::Create( + F.getContext(), "default.switch.case.unreachable", &F, Tail); + IRBuilder<> BuilderUnreachable(BBUnreachable); + BuilderUnreachable.CreateUnreachable(); + + IRBuilder<> Builder(BB); + SwitchInst *Switch = Builder.CreateSwitch(JT.Index, BBUnreachable); + DTUpdates.push_back({DominatorTree::Insert, BB, BBUnreachable}); + + IRBuilder<> BuilderTail(CB); + PHINode *PHI = + IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size()); + + for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) { + BasicBlock *B = BasicBlock::Create(Func->getContext(), + "call." 
+ Twine(Index), &F, Tail); + DTUpdates.push_back({DominatorTree::Insert, BB, B}); + DTUpdates.push_back({DominatorTree::Insert, B, Tail}); + + CallBase *Call = cast<CallBase>(CB->clone()); + Call->setCalledFunction(Func); + Call->insertInto(B, B->end()); + Switch->addCase( + cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B); + BranchInst::Create(Tail, B); + if (PHI) + PHI->addIncoming(Call, B); + } + DTU.applyUpdates(DTUpdates); + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB) + << "expanded indirect call into switch"; + }); + if (PHI) + CB->replaceAllUsesWith(PHI); + CB->eraseFromParent(); + return Tail; +} + +PreservedAnalyses JumpTableToSwitchPass::run(Function &F, + FunctionAnalysisManager &AM) { + OptimizationRemarkEmitter &ORE = + AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + DominatorTree *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); + PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F); + DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); + bool Changed = false; + for (BasicBlock &BB : make_early_inc_range(F)) { + BasicBlock *CurrentBB = &BB; + while (CurrentBB) { + BasicBlock *SplittedOutTail = nullptr; + for (Instruction &I : make_early_inc_range(*CurrentBB)) { + auto *Call = dyn_cast<CallInst>(&I); + if (!Call || Call->getCalledFunction() || Call->isMustTailCall()) + continue; + auto *L = dyn_cast<LoadInst>(Call->getCalledOperand()); + // Skip atomic or volatile loads. + if (!L || !L->isSimple()) + continue; + auto *GEP = dyn_cast<GetElementPtrInst>(L->getPointerOperand()); + if (!GEP) + continue; + auto *PtrTy = dyn_cast<PointerType>(L->getType()); + assert(PtrTy && "call operand must be a pointer"); + std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy); + if (!JumpTable) + continue; + SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE); + Changed = true; + break; + } + CurrentBB = SplittedOutTail ? SplittedOutTail : nullptr; + } + } + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + if (DT) + PA.preserve<DominatorTreeAnalysis>(); + if (PDT) + PA.preserve<PostDominatorTreeAnalysis>(); + return PA; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 87c01ead634f..7a0b661a0779 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -231,7 +231,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { Weights[0] = BP.getCompl().getNumerator(); Weights[1] = BP.getNumerator(); } - setBranchWeights(*PredBr, Weights); + setBranchWeights(*PredBr, Weights, hasBranchWeightOrigin(*PredBr)); } } @@ -401,8 +401,8 @@ static bool replaceFoldableUses(Instruction *Cond, Value *ToVal, Changed |= replaceNonLocalUsesWith(Cond, ToVal); for (Instruction &I : reverse(*KnownAtEndOfBB)) { // Replace any debug-info record users of Cond with ToVal. - for (DPValue &DPV : I.getDbgValueRange()) - DPV.replaceVariableLocationOp(Cond, ToVal, true); + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + DVR.replaceVariableLocationOp(Cond, ToVal, true); // Reached the Cond whose uses we are trying to replace, so there are no // more uses. @@ -558,9 +558,9 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// This returns true if there were any known values. 
bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( Value *V, BasicBlock *BB, PredValueInfo &Result, - ConstantPreference Preference, DenseSet<Value *> &RecursionSet, + ConstantPreference Preference, SmallPtrSet<Value *, 4> &RecursionSet, Instruction *CxtI) { - const DataLayout &DL = BB->getModule()->getDataLayout(); + const DataLayout &DL = BB->getDataLayout(); // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. To @@ -596,11 +596,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( CmpInst::Predicate Pred; Value *Val; Constant *Cst; - if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst)))) { - auto Res = LVI->getPredicateOnEdge(Pred, Val, Cst, P, BB, CxtI); - if (Res != LazyValueInfo::Unknown) - PredCst = ConstantInt::getBool(V->getContext(), Res); - } + if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst)))) + PredCst = LVI->getPredicateOnEdge(Pred, Val, Cst, P, BB, CxtI); if (Constant *KC = getKnownConstant(PredCst, Preference)) Result.emplace_back(KC, P); } @@ -757,7 +754,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // may result in comparison of values from two different loop iterations. // FIXME: This check is broken if LoopHeaders is not populated. if (PN && PN->getParent() == BB && !LoopHeaders.contains(BB)) { - const DataLayout &DL = PN->getModule()->getDataLayout(); + const DataLayout &DL = PN->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. // See if any do. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -780,13 +777,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( if (LHSInst && LHSInst->getParent() == BB) continue; - LazyValueInfo::Tristate - ResT = LVI->getPredicateOnEdge(Pred, LHS, - cast<Constant>(RHS), PredBB, BB, - CxtI ? CxtI : Cmp); - if (ResT == LazyValueInfo::Unknown) - continue; - Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); + Res = LVI->getPredicateOnEdge(Pred, LHS, cast<Constant>(RHS), PredBB, + BB, CxtI ? CxtI : Cmp); } if (Constant *KC = getKnownConstant(Res, WantInteger)) @@ -806,14 +798,10 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. - LazyValueInfo::Tristate Res = - LVI->getPredicateOnEdge(Pred, CmpLHS, - CmpConst, P, BB, CxtI ? CxtI : Cmp); - if (Res == LazyValueInfo::Unknown) - continue; - - Constant *ResC = ConstantInt::get(CmpType, Res); - Result.emplace_back(ResC, P); + Constant *Res = LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB, + CxtI ? CxtI : Cmp); + if (Constant *KC = getKnownConstant(Res, WantInteger)) + Result.emplace_back(KC, P); } return !Result.empty(); @@ -868,7 +856,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); + Constant *Folded = + ConstantFoldCompareInstOperands(Pred, V, CmpConst, DL); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.emplace_back(KC, LHSVal.second); } @@ -1007,7 +996,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // constant. 
if (Instruction *I = dyn_cast<Instruction>(Condition)) { Value *SimpleVal = - ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); + ConstantFoldInstruction(I, BB->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); if (isInstructionTriviallyDead(I, TLI)) @@ -1037,7 +1026,8 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { LLVM_DEBUG(dbgs() << " In block '" << BB->getName() << "' folding undef terminator: " << *BBTerm << '\n'); - BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); + Instruction *NewBI = BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm->getIterator()); + NewBI->setDebugLoc(BBTerm->getDebugLoc()); ++NumFolds; BBTerm->eraseFromParent(); DTU->applyUpdatesPermissive(Updates); @@ -1080,11 +1070,11 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // it's value at the branch instruction. We only handle comparisons // against a constant at this time. if (Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1))) { - LazyValueInfo::Tristate Ret = + Constant *Res = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), CondConst, BB->getTerminator(), /*UseBlockValue=*/false); - if (Ret != LazyValueInfo::Unknown) { + if (Res) { // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This @@ -1092,10 +1082,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. - auto *CI = Ret == LazyValueInfo::True ? - ConstantInt::getTrue(CondCmp->getType()) : - ConstantInt::getFalse(CondCmp->getType()); - if (replaceFoldableUses(CondCmp, CI, BB)) + if (replaceFoldableUses(CondCmp, Res, BB)) return true; } @@ -1177,7 +1164,7 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { BasicBlock *CurrentPred = BB->getSinglePredecessor(); unsigned Iter = 0; - auto &DL = BB->getModule()->getDataLayout(); + auto &DL = BB->getDataLayout(); while (CurrentPred && Iter++ < ImplicationSearchThreshold) { auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); @@ -1202,7 +1189,7 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1); BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0); RemoveSucc->removePredecessor(BB); - BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI); + BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI->getIterator()); UncondBI->setDebugLoc(BI->getDebugLoc()); ++NumFolds; BI->eraseFromParent(); @@ -1278,9 +1265,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // only happen in dead loops. 
if (AvailableVal == LoadI) AvailableVal = PoisonValue::get(LoadI->getType()); - if (AvailableVal->getType() != LoadI->getType()) + if (AvailableVal->getType() != LoadI->getType()) { AvailableVal = CastInst::CreateBitOrPointerCast( - AvailableVal, LoadI->getType(), "", LoadI); + AvailableVal, LoadI->getType(), "", LoadI->getIterator()); + cast<Instruction>(AvailableVal)->setDebugLoc(LoadI->getDebugLoc()); + } LoadI->replaceAllUsesWith(AvailableVal); LoadI->eraseFromParent(); return true; @@ -1321,7 +1310,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // If this is a load on a phi pointer, phi-translate it and search // for available load/store to the pointer in predecessors. Type *AccessTy = LoadI->getType(); - const auto &DL = LoadI->getModule()->getDataLayout(); + const auto &DL = LoadI->getDataLayout(); MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB), LocationSize::precise(DL.getTypeStoreSize(AccessTy)), AATags); @@ -1421,7 +1410,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), LoadI->getName() + ".pr", false, LoadI->getAlign(), LoadI->getOrdering(), LoadI->getSyncScopeID(), - UnavailablePred->getTerminator()); + UnavailablePred->getTerminator()->getIterator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); if (AATags) NewVal->setAAMetadata(AATags); @@ -1434,16 +1423,14 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); // Create a PHI node at the start of the block for the PRE'd load value. - pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); - PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), ""); + PHINode *PN = PHINode::Create(LoadI->getType(), pred_size(LoadBB), ""); PN->insertBefore(LoadBB->begin()); PN->takeName(LoadI); PN->setDebugLoc(LoadI->getDebugLoc()); // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. - for (pred_iterator PI = PB; PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(LoadBB)) { AvailablePredsTy::iterator I = llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr)); @@ -1456,8 +1443,8 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // predecessor use the same bitcast. Value *&PredV = I->second; if (PredV->getType() != LoadI->getType()) - PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "", - P->getTerminator()); + PredV = CastInst::CreateBitOrPointerCast( + PredV, LoadI->getType(), "", P->getTerminator()->getIterator()); PN->addIncoming(PredV, I->first); } @@ -1490,7 +1477,7 @@ findMostPopularDest(BasicBlock *BB, // Populate DestPopularity with the successors in the order they appear in the // successor list. This way, we ensure determinism by iterating it in the - // same order in std::max_element below. We map nullptr to 0 so that we can + // same order in llvm::max_element below. We map nullptr to 0 so that we can // return nullptr when PredToDestList contains nullptr only. DestPopularity[nullptr] = 0; for (auto *SuccBB : successors(BB)) @@ -1501,8 +1488,7 @@ findMostPopularDest(BasicBlock *BB, DestPopularity[PredToDest.second]++; // Find the most popular dest. 
- auto MostPopular = std::max_element( - DestPopularity.begin(), DestPopularity.end(), llvm::less_second()); + auto MostPopular = llvm::max_element(DestPopularity, llvm::less_second()); // Okay, we have finally picked the most popular destination. return MostPopular->first; @@ -1512,7 +1498,8 @@ findMostPopularDest(BasicBlock *BB, // BB->getSinglePredecessor() and then on to BB. Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, - Value *V) { + Value *V, + const DataLayout &DL) { BasicBlock *PredBB = BB->getSinglePredecessor(); assert(PredBB && "Expected a single predecessor"); @@ -1537,11 +1524,12 @@ Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB, if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) { if (CondCmp->getParent() == BB) { Constant *Op0 = - evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0)); + evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0), DL); Constant *Op1 = - evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1)); + evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1), DL); if (Op0 && Op1) { - return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1); + return ConstantFoldCompareInstOperands(CondCmp->getPredicate(), Op0, + Op1, DL); } } return nullptr; @@ -1655,7 +1643,8 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, // Finally update the terminator. Instruction *Term = BB->getTerminator(); - BranchInst::Create(OnlyDest, Term); + Instruction *NewBI = BranchInst::Create(OnlyDest, Term->getIterator()); + NewBI->setDebugLoc(Term->getDebugLoc()); ++NumFolds; Term->eraseFromParent(); DTU->applyUpdatesPermissive(Updates); @@ -1879,7 +1868,7 @@ bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) { static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, BasicBlock *OldPred, BasicBlock *NewPred, - DenseMap<Instruction*, Value*> &ValueMap) { + ValueToValueMapTy &ValueMap) { for (PHINode &PN : PHIBB->phis()) { // Ok, we have a PHI node. Figure out what the incoming value was for the // DestBlock. @@ -1887,7 +1876,7 @@ static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, // Remap the value if necessary. if (Instruction *Inst = dyn_cast<Instruction>(IV)) { - DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst); + ValueToValueMapTy::iterator I = ValueMap.find(Inst); if (I != ValueMap.end()) IV = I->second; } @@ -1948,9 +1937,8 @@ bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { /// Update the SSA form. NewBB contains instructions that are copied from BB. /// ValueMapping maps old values in BB to new ones in NewBB. -void JumpThreadingPass::updateSSA( - BasicBlock *BB, BasicBlock *NewBB, - DenseMap<Instruction *, Value *> &ValueMapping) { +void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, + ValueToValueMapTy &ValueMapping) { // If there were values defined in BB that are used outside the block, then we // now have to update all uses of the value to use either the original value, // the cloned value, or some PHI derived value. 
This can require arbitrary @@ -1958,7 +1946,7 @@ void JumpThreadingPass::updateSSA( SSAUpdater SSAUpdate; SmallVector<Use *, 16> UsesToRename; SmallVector<DbgValueInst *, 4> DbgValues; - SmallVector<DPValue *, 4> DPValues; + SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its @@ -1975,16 +1963,16 @@ void JumpThreadingPass::updateSSA( } // Find debug values outside of the block - findDbgValues(DbgValues, &I, &DPValues); + findDbgValues(DbgValues, &I, &DbgVariableRecords); llvm::erase_if(DbgValues, [&](const DbgValueInst *DbgVal) { return DbgVal->getParent() == BB; }); - llvm::erase_if(DPValues, [&](const DPValue *DPVal) { - return DPVal->getParent() == BB; + llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) { + return DbgVarRec->getParent() == BB; }); // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty() && DbgValues.empty() && DPValues.empty()) + if (UsesToRename.empty() && DbgValues.empty() && DbgVariableRecords.empty()) continue; LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); @@ -1997,11 +1985,11 @@ void JumpThreadingPass::updateSSA( while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - if (!DbgValues.empty() || !DPValues.empty()) { + if (!DbgValues.empty() || !DbgVariableRecords.empty()) { SSAUpdate.UpdateDebugValues(&I, DbgValues); - SSAUpdate.UpdateDebugValues(&I, DPValues); + SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords); DbgValues.clear(); - DPValues.clear(); + DbgVariableRecords.clear(); } LLVM_DEBUG(dbgs() << "\n"); @@ -2011,14 +1999,15 @@ void JumpThreadingPass::updateSSA( /// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone /// arguments that come from PredBB. Return the map from the variables in the /// source basic block to the variables in the newly created basic block. -DenseMap<Instruction *, Value *> -JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, - BasicBlock::iterator BE, BasicBlock *NewBB, - BasicBlock *PredBB) { + +void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping, + BasicBlock::iterator BI, + BasicBlock::iterator BE, + BasicBlock *NewBB, + BasicBlock *PredBB) { // We are going to have to map operands from the source basic block to the new // copy of the block 'NewBB'. If there are PHI nodes in the source basic // block, evaluate them to account for entry from PredBB. - DenseMap<Instruction *, Value *> ValueMapping; // Retargets llvm.dbg.value to any renamed variables. auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool { @@ -2044,11 +2033,11 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, return true; }; - // Duplicate implementation of the above dbg.value code, using DPValues - // instead. - auto RetargetDPValueIfPossible = [&](DPValue *DPV) { + // Duplicate implementation of the above dbg.value code, using + // DbgVariableRecords instead. 
+ auto RetargetDbgVariableRecordIfPossible = [&](DbgVariableRecord *DVR) { SmallSet<std::pair<Value *, Value *>, 16> OperandsToRemap; - for (auto *Op : DPV->location_ops()) { + for (auto *Op : DVR->location_ops()) { Instruction *OpInst = dyn_cast<Instruction>(Op); if (!OpInst) continue; @@ -2059,7 +2048,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, } for (auto &[OldOp, MappedOp] : OperandsToRemap) - DPV->replaceVariableLocationOp(OldOp, MappedOp); + DVR->replaceVariableLocationOp(OldOp, MappedOp); }; BasicBlock *RangeBB = BI->getParent(); @@ -2083,9 +2072,9 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context); auto CloneAndRemapDbgInfo = [&](Instruction *NewInst, Instruction *From) { - auto DPVRange = NewInst->cloneDebugInfoFrom(From); - for (DPValue &DPV : DPVRange) - RetargetDPValueIfPossible(&DPV); + auto DVRRange = NewInst->cloneDebugInfoFrom(From); + for (DbgVariableRecord &DVR : filterDbgVars(DVRRange)) + RetargetDbgVariableRecordIfPossible(&DVR); }; // Clone the non-phi instructions of the source basic block into NewBB, @@ -2106,24 +2095,24 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst); + ValueToValueMapTy::iterator I = ValueMapping.find(Inst); if (I != ValueMapping.end()) New->setOperand(i, I->second); } } - // There may be DPValues on the terminator, clone directly from marker - // to marker as there isn't an instruction there. - if (BE != RangeBB->end() && BE->hasDbgValues()) { + // There may be DbgVariableRecords on the terminator, clone directly from + // marker to marker as there isn't an instruction there. + if (BE != RangeBB->end() && BE->hasDbgRecords()) { // Dump them at the end. - DPMarker *Marker = RangeBB->getMarker(BE); - DPMarker *EndMarker = NewBB->createMarker(NewBB->end()); - auto DPVRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt); - for (DPValue &DPV : DPVRange) - RetargetDPValueIfPossible(&DPV); + DbgMarker *Marker = RangeBB->getMarker(BE); + DbgMarker *EndMarker = NewBB->createMarker(NewBB->end()); + auto DVRRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt); + for (DbgVariableRecord &DVR : filterDbgVars(DVRRange)) + RetargetDbgVariableRecordIfPossible(&DVR); } - return ValueMapping; + return; } /// Attempt to thread through two successive basic blocks. @@ -2194,12 +2183,13 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, unsigned OneCount = 0; BasicBlock *ZeroPred = nullptr; BasicBlock *OnePred = nullptr; + const DataLayout &DL = BB->getDataLayout(); for (BasicBlock *P : predecessors(PredBB)) { // If PredPred ends with IndirectBrInst, we can't handle it. if (isa<IndirectBrInst>(P->getTerminator())) continue; if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>( - evaluateOnPredecessorEdge(BB, P, Cond))) { + evaluateOnPredecessorEdge(BB, P, Cond, DL))) { if (CI->isZero()) { ZeroCount++; ZeroPred = P; @@ -2298,8 +2288,9 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB, // We are going to have to map operands from the original BB block to the new // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them // to account for entry from PredPredBB. 
- DenseMap<Instruction *, Value *> ValueMapping = - cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB); + ValueToValueMapTy ValueMapping; + cloneInstructions(ValueMapping, PredBB->begin(), PredBB->end(), NewBB, + PredPredBB); // Copy the edge probabilities from PredBB to NewBB. if (BPI) @@ -2422,8 +2413,9 @@ void JumpThreadingPass::threadEdge(BasicBlock *BB, } // Copy all the instructions from BB to NewBB except the terminator. - DenseMap<Instruction *, Value *> ValueMapping = - cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); + ValueToValueMapTy ValueMapping; + cloneInstructions(ValueMapping, BB->begin(), std::prev(BB->end()), NewBB, + PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. @@ -2555,8 +2547,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BBSuccFreq.push_back(SuccFreq.getFrequency()); } - uint64_t MaxBBSuccFreq = - *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + uint64_t MaxBBSuccFreq = *llvm::max_element(BBSuccFreq); SmallVector<BranchProbability, 4> BBSuccProbs; if (MaxBBSuccFreq == 0) @@ -2614,7 +2605,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB, Weights.push_back(Prob.getNumerator()); auto TI = BB->getTerminator(); - setBranchWeights(*TI, Weights); + setBranchWeights(*TI, Weights, hasBranchWeightOrigin(*TI)); } } @@ -2679,7 +2670,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // We are going to have to map operands from the original BB block into the // PredBB block. Evaluate PHI nodes in BB. - DenseMap<Instruction*, Value*> ValueMapping; + ValueToValueMapTy ValueMapping; BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) @@ -2693,17 +2684,20 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); + ValueToValueMapTy::iterator I = ValueMapping.find(Inst); if (I != ValueMapping.end()) New->setOperand(i, I->second); } + // Remap debug variable operands. + remapDebugVariable(ValueMapping, New); + // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. if (Value *IV = simplifyInstruction( New, - {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { + {BB->getDataLayout(), TLI, nullptr, nullptr, New})) { ValueMapping[&*BI] = IV; if (!New->mayHaveSideEffects()) { New->eraseFromParent(); @@ -2882,15 +2876,13 @@ bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { // Now check if one of the select values would allow us to constant fold the // terminator in BB. We don't do the transform if both sides fold, those // cases will be threaded in any case. 
- LazyValueInfo::Tristate LHSFolds = + Constant *LHSRes = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), CondRHS, Pred, BB, CondCmp); - LazyValueInfo::Tristate RHSFolds = + Constant *RHSRes = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), CondRHS, Pred, BB, CondCmp); - if ((LHSFolds != LazyValueInfo::Unknown || - RHSFolds != LazyValueInfo::Unknown) && - LHSFolds != RHSFolds) { + if ((LHSRes || RHSRes) && LHSRes != RHSRes) { unfoldSelectInstr(Pred, BB, SI, CondLHS, I); return true; } @@ -2973,15 +2965,16 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { // Expand the select. Value *Cond = SI->getCondition(); if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) - Cond = new FreezeInst(Cond, "cond.fr", SI); + Cond = new FreezeInst(Cond, "cond.fr", SI->getIterator()); MDNode *BranchWeights = getBranchWeightMDNode(*SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false, BranchWeights); BasicBlock *SplitBB = SI->getParent(); BasicBlock *NewBB = Term->getParent(); - PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI->getIterator()); NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); NewPN->addIncoming(SI->getFalseValue(), BB); + NewPN->setDebugLoc(SI->getDebugLoc()); SI->replaceAllUsesWith(NewPN); SI->eraseFromParent(); // NewBB and SplitBB are newly created blocks which require insertion. @@ -3063,7 +3056,7 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard, BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = BI->getSuccessor(1); - auto &DL = BB->getModule()->getDataLayout(); + auto &DL = BB->getDataLayout(); bool TrueDestIsSafe = false; bool FalseDestIsSafe = false; @@ -3119,10 +3112,11 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard, PHINode *NewPN = PHINode::Create(Inst->getType(), 2); NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); + NewPN->setDebugLoc(Inst->getDebugLoc()); NewPN->insertBefore(InsertionPoint); Inst->replaceAllUsesWith(NewPN); } - Inst->dropDbgValues(); + Inst->dropDbgRecords(); Inst->eraseFromParent(); } return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index f3e40a5cb809..fe264503dee9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -110,6 +110,11 @@ STATISTIC(NumAddSubHoisted, "Number of add/subtract expressions reassociated " "and hoisted out of the loop"); STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions " "reassociated and hoisted out of the loop"); +STATISTIC(NumIntAssociationsHoisted, + "Number of invariant int expressions " + "reassociated and hoisted out of the loop"); +STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions " + "reassociated and hoisted out of the loop"); /// Memory promotion is enabled by default. 
static cl::opt<bool> @@ -135,6 +140,12 @@ static cl::opt<unsigned> FPAssociationUpperLimit( "Set upper limit for the number of transformations performed " "during a single round of hoisting the reassociated expressions.")); +cl::opt<unsigned> IntAssociationUpperLimit( + "licm-max-num-int-reassociations", cl::init(5U), cl::Hidden, + cl::desc( + "Set upper limit for the number of transformations performed " + "during a single round of hoisting the reassociated expressions.")); + // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to // address the same issue. LICM calls MemorySSAWalker's @@ -924,12 +935,14 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent()); ReciprocalDivisor->insertBefore(&I); + ReciprocalDivisor->setDebugLoc(I.getDebugLoc()); auto Product = BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); Product->setFastMathFlags(I.getFastMathFlags()); SafetyInfo->insertInstructionTo(Product, I.getParent()); Product->insertAfter(&I); + Product->setDebugLoc(I.getDebugLoc()); I.replaceAllUsesWith(Product); eraseInstruction(I, *SafetyInfo, MSSAU); @@ -1041,7 +1054,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, Loop *CurLoop) { Value *Addr = LI->getPointerOperand(); - const DataLayout &DL = LI->getModule()->getDataLayout(); + const DataLayout &DL = LI->getDataLayout(); const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType()); // It is not currently possible for clang to generate an invariant.start @@ -1208,6 +1221,14 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (CI->isConvergent()) return false; + // FIXME: Current LLVM IR semantics don't work well with coroutines and + // thread local globals. We currently treat getting the address of a thread + // local global as not accessing memory, even though it may not be a + // constant throughout a function with coroutines. Remove this check after + // we better model semantics of thread local globals. + if (CI->getFunction()->isPresplitCoroutine()) + return false; + using namespace PatternMatch; if (match(CI, m_Intrinsic<Intrinsic::assume>())) // Assumes don't actually alias anything or throw @@ -1216,14 +1237,6 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Handle simple cases by querying alias analysis. MemoryEffects Behavior = AA->getMemoryEffects(CI); - // FIXME: we don't handle the semantics of thread local well. So that the - // address of thread locals are fake constants in coroutines. So We forbid - // to treat onlyReadsMemory call in coroutines as constants now. Note that - // it is possible to hide a thread local access in a onlyReadsMemory call. - // Remove this check after we handle the semantics of thread locals well. 
- if (Behavior.onlyReadsMemory() && CI->getFunction()->isPresplitCoroutine()) - return false; - if (Behavior.doesNotAccessMemory()) return true; if (Behavior.onlyReadsMemory()) { @@ -1442,6 +1455,7 @@ static Instruction *cloneInstructionInExitBlock( } New = CallInst::Create(CI, OpBundles); + New->copyMetadata(*CI); } else { New = I.clone(); } @@ -2031,7 +2045,7 @@ bool llvm::promoteLoopAccessesToScalars( bool SawNotAtomic = false; AAMDNodes AATags; - const DataLayout &MDL = Preheader->getModule()->getDataLayout(); + const DataLayout &MDL = Preheader->getDataLayout(); // If there are reads outside the promoted set, then promoting stores is // definitely not safe. @@ -2225,7 +2239,7 @@ bool llvm::promoteLoopAccessesToScalars( if (FoundLoadToPromote || !StoreIsGuanteedToExecute) { PreheaderLoad = new LoadInst(AccessTy, SomePtr, SomePtr->getName() + ".promoted", - Preheader->getTerminator()); + Preheader->getTerminator()->getIterator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); @@ -2494,7 +2508,7 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, // The swapped GEPs are inbounds if both original GEPs are inbounds // and the sign of the offsets is the same. For simplicity, only // handle both offsets being non-negative. - const DataLayout &DL = GEP->getModule()->getDataLayout(); + const DataLayout &DL = GEP->getDataLayout(); auto NonNegative = [&](Value *V) { return isKnownNonNegative(V, SimplifyQuery(DL, DT, AC, GEP)); }; @@ -2544,7 +2558,7 @@ static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, // freely move values from left side of inequality to right side (just as in // normal linear arithmetics). Overflows make things much more complicated, so // we want to avoid this. - auto &DL = L.getHeader()->getModule()->getDataLayout(); + auto &DL = L.getHeader()->getDataLayout(); bool ProvedNoOverflowAfterReassociate = computeOverflowForSignedSub(InvariantRHS, InvariantOp, SimplifyQuery(DL, DT, AC, &ICmp)) == @@ -2597,7 +2611,7 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, // normal linear arithmetics). Overflows make things much more complicated, so // we want to avoid this. Likewise, for "C1 - LV < C2" we need to prove that // "C1 - C2" does not overflow. - auto &DL = L.getHeader()->getModule()->getDataLayout(); + auto &DL = L.getHeader()->getDataLayout(); SimplifyQuery SQ(DL, DT, AC, &ICmp); if (VariantSubtracted) { // C1 - LV < C2 --> LV > C1 - C2 @@ -2661,21 +2675,29 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, return false; } +static bool isReassociableOp(Instruction *I, unsigned IntOpcode, + unsigned FPOpcode) { + if (I->getOpcode() == IntOpcode) + return true; + if (I->getOpcode() == FPOpcode && I->hasAllowReassoc() && + I->hasNoSignedZeros()) + return true; + return false; +} + /// Try to reassociate expressions like ((A1 * B1) + (A2 * B2) + ...) * C where /// A1, A2, ... and C are loop invariants into expressions like /// ((A1 * C * B1) + (A2 * C * B2) + ...) and hoist the (A1 * C), (A2 * C), ... /// invariant expressions. This functions returns true only if any hoisting has /// actually occured. 
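The helper described by the comment just above is renamed from hoistFPAssociation to hoistMulAddAssociation in the hunk that follows and is extended from fast-math FP to integer mul/add chains, dropping poison-generating flags on the adds it looks through and on the rewritten multiplies. A rough C++ illustration of the integer shape it now handles, with invented names:

// ((A1 * B1[I]) + (A2 * B2[I])) * C with A1, A2 and C loop-invariant: the
// products A1*C and A2*C can be hoisted to the preheader and the body
// rewritten as (A1*C)*B1[I] + (A2*C)*B2[I].
long dot_scaled(const long *B1, const long *B2, long A1, long A2, long C,
                int N) {
  long Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += (A1 * B1[I] + A2 * B2[I]) * C;
  return Sum;
}

The number of rewrites per expression is capped by the new licm-max-num-int-reassociations option (default 5), mirroring the existing FP limit, so a run along the lines of opt -passes='loop-mssa(licm)' -licm-max-num-int-reassociations=8 would raise it.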
-static bool hoistFPAssociation(Instruction &I, Loop &L, - ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, AssumptionCache *AC, - DominatorTree *DT) { - using namespace PatternMatch; - Value *VariantOp = nullptr, *InvariantOp = nullptr; - - if (!match(&I, m_FMul(m_Value(VariantOp), m_Value(InvariantOp))) || - !I.hasAllowReassoc() || !I.hasNoSignedZeros()) +static bool hoistMulAddAssociation(Instruction &I, Loop &L, + ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, AssumptionCache *AC, + DominatorTree *DT) { + if (!isReassociableOp(&I, Instruction::Mul, Instruction::FMul)) return false; + Value *VariantOp = I.getOperand(0); + Value *InvariantOp = I.getOperand(1); if (L.isLoopInvariant(VariantOp)) std::swap(VariantOp, InvariantOp); if (L.isLoopInvariant(VariantOp) || !L.isLoopInvariant(InvariantOp)) @@ -2684,20 +2706,24 @@ static bool hoistFPAssociation(Instruction &I, Loop &L, // First, we need to make sure we should do the transformation. SmallVector<Use *> Changes; + SmallVector<BinaryOperator *> Adds; SmallVector<BinaryOperator *> Worklist; if (BinaryOperator *VariantBinOp = dyn_cast<BinaryOperator>(VariantOp)) Worklist.push_back(VariantBinOp); while (!Worklist.empty()) { BinaryOperator *BO = Worklist.pop_back_val(); - if (!BO->hasOneUse() || !BO->hasAllowReassoc() || !BO->hasNoSignedZeros()) + if (!BO->hasOneUse()) return false; - BinaryOperator *Op0, *Op1; - if (match(BO, m_FAdd(m_BinOp(Op0), m_BinOp(Op1)))) { - Worklist.push_back(Op0); - Worklist.push_back(Op1); + if (isReassociableOp(BO, Instruction::Add, Instruction::FAdd) && + isa<BinaryOperator>(BO->getOperand(0)) && + isa<BinaryOperator>(BO->getOperand(1))) { + Worklist.push_back(cast<BinaryOperator>(BO->getOperand(0))); + Worklist.push_back(cast<BinaryOperator>(BO->getOperand(1))); + Adds.push_back(BO); continue; } - if (BO->getOpcode() != Instruction::FMul || L.isLoopInvariant(BO)) + if (!isReassociableOp(BO, Instruction::Mul, Instruction::FMul) || + L.isLoopInvariant(BO)) return false; Use &U0 = BO->getOperandUse(0); Use &U1 = BO->getOperandUse(1); @@ -2707,26 +2733,108 @@ static bool hoistFPAssociation(Instruction &I, Loop &L, Changes.push_back(&U1); else return false; - if (Changes.size() > FPAssociationUpperLimit) + unsigned Limit = I.getType()->isIntOrIntVectorTy() + ? IntAssociationUpperLimit + : FPAssociationUpperLimit; + if (Changes.size() > Limit) return false; } if (Changes.empty()) return false; + // Drop the poison flags for any adds we looked through. + if (I.getType()->isIntOrIntVectorTy()) { + for (auto *Add : Adds) + Add->dropPoisonGeneratingFlags(); + } + // We know we should do it so let's do the transformation. auto *Preheader = L.getLoopPreheader(); assert(Preheader && "Loop is not in simplify form?"); IRBuilder<> Builder(Preheader->getTerminator()); for (auto *U : Changes) { assert(L.isLoopInvariant(U->get())); - Instruction *Ins = cast<Instruction>(U->getUser()); - U->set(Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul")); + auto *Ins = cast<BinaryOperator>(U->getUser()); + Value *Mul; + if (I.getType()->isIntOrIntVectorTy()) { + Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul"); + // Drop the poison flags on the original multiply. + Ins->dropPoisonGeneratingFlags(); + } else + Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul"); + + // Rewrite the reassociable instruction. + unsigned OpIdx = U->getOperandNo(); + auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0); + auto *RHS = OpIdx == 1 ? 
Mul : Ins->getOperand(1); + auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, + Ins->getName() + ".reass", Ins); + NewBO->copyIRFlags(Ins); + if (VariantOp == Ins) + VariantOp = NewBO; + Ins->replaceAllUsesWith(NewBO); + eraseInstruction(*Ins, SafetyInfo, MSSAU); } + I.replaceAllUsesWith(VariantOp); eraseInstruction(I, SafetyInfo, MSSAU); return true; } +/// Reassociate general associative binary expressions of the form +/// +/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" +/// +/// where op is an associative binary op, LV is a loop variant, and C1 and C2 +/// are loop invariants that we want to hoist. +/// +/// TODO: This can be extended to more cases such as +/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" +/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative +/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative +static bool hoistBOAssociation(Instruction &I, Loop &L, + ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, AssumptionCache *AC, + DominatorTree *DT) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(&I); + if (!BO || !BO->isAssociative()) + return false; + + Instruction::BinaryOps Opcode = BO->getOpcode(); + BinaryOperator *Op0 = dyn_cast<BinaryOperator>(BO->getOperand(0)); + + // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)" + if (Op0 && Op0->getOpcode() == Opcode) { + Value *LV = Op0->getOperand(0); + Value *C1 = Op0->getOperand(1); + Value *C2 = BO->getOperand(1); + + if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || + !L.isLoopInvariant(C2)) + return false; + + auto *Preheader = L.getLoopPreheader(); + assert(Preheader && "Loop is not in simplify form?"); + IRBuilder<> Builder(Preheader->getTerminator()); + Value *Inv = Builder.CreateBinOp(Opcode, C1, C2, "invariant.op"); + + auto *NewBO = + BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO); + NewBO->copyIRFlags(BO); + BO->replaceAllUsesWith(NewBO); + eraseInstruction(*BO, SafetyInfo, MSSAU); + + // Note: (LV op C1) might not be erased if it has more uses than the one we + // just replaced. 
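The new hoistBOAssociation helper above handles the simpler pattern "(LV op C1) op C2" for any associative binary operator, materializing "C1 op C2" as invariant.op in the preheader and rebuilding the loop-variant operation under a ".reass" name. A small, hypothetical source-level example of the effect:

// (X + C1) + C2 with C1 and C2 loop-invariant: LICM now emits
//   invariant.op = C1 + C2    (in the preheader)
// and rewrites the loop body to X + invariant.op.
long accumulate(long X, long C1, long C2, int N) {
  for (int I = 0; I < N; ++I)
    X = (X + C1) + C2;
  return X;
}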
+ if (Op0->use_empty()) + eraseInstruction(*Op0, SafetyInfo, MSSAU); + + return true; + } + + return false; +} + static bool hoistArithmetics(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, @@ -2754,9 +2862,19 @@ static bool hoistArithmetics(Instruction &I, Loop &L, return true; } - if (hoistFPAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) { + bool IsInt = I.getType()->isIntOrIntVectorTy(); + if (hoistMulAddAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) { + ++NumHoisted; + if (IsInt) + ++NumIntAssociationsHoisted; + else + ++NumFPAssociationsHoisted; + return true; + } + + if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) { ++NumHoisted; - ++NumFPAssociationsHoisted; + ++NumBOAssociationsHoisted; return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index 9a27a08c86eb..6092cd1bc08b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -405,7 +405,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, : SE.getUMinExpr(NewBoundSCEV, SplitBoundSCEV); SCEVExpander Expander( - SE, L.getHeader()->getParent()->getParent()->getDataLayout(), "split"); + SE, L.getHeader()->getDataLayout(), "split"); Instruction *InsertPt = SplitLoopPH->getTerminator(); Value *NewBoundValue = Expander.expandCodeFor(NewBoundSCEV, NewBoundSCEV->getType(), InsertPt); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index cc1f56014eee..d85166e518f1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -391,7 +391,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { continue; BasicBlock *BB = P.InsertPt->getParent(); - SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); + SCEVExpander SCEVE(*SE, BB->getDataLayout(), "prefaddr"); const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), P.LSCEVAddRec->getStepRecurrence(*SE))); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index bfe9374cf2f8..b0b7ae60da98 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -273,9 +273,9 @@ static bool canProveExitOnFirstIteration(Loop *L, DominatorTree &DT, if (LiveEdges.count({ Pred, BB })) { HasLivePreds = true; Value *Incoming = PN.getIncomingValueForBlock(Pred); - // Skip undefs. If they are present, we can assume they are equal to - // the non-undef input. - if (isa<UndefValue>(Incoming)) + // Skip poison. If they are present, we can assume they are equal to + // the non-poison input. + if (isa<PoisonValue>(Incoming)) continue; // Two inputs. if (OnlyInput && OnlyInput != Incoming) @@ -284,8 +284,8 @@ static bool canProveExitOnFirstIteration(Loop *L, DominatorTree &DT, } assert(HasLivePreds && "No live predecessors?"); - // If all incoming live value were undefs, return undef. - return OnlyInput ? OnlyInput : UndefValue::get(PN.getType()); + // If all incoming live value were poison, return poison. + return OnlyInput ? 
OnlyInput : PoisonValue::get(PN.getType()); }; DenseMap<Value *, Value *> FirstIterValue; @@ -299,7 +299,7 @@ static bool canProveExitOnFirstIteration(Loop *L, DominatorTree &DT, // iteration, mark this successor live. // 3b. If we cannot prove it, conservatively assume that all successors are // live. - auto &DL = Header->getModule()->getDataLayout(); + auto &DL = Header->getDataLayout(); const SimplifyQuery SQ(DL); for (auto *BB : RPOT) { Visited.insert(BB); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 626888c74bad..c84e419c2a24 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -26,7 +26,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -120,7 +120,7 @@ namespace { /// Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. class InstPartition { - using InstructionSet = SmallPtrSet<Instruction *, 8>; + using InstructionSet = SmallSetVector<Instruction *, 8>; public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) @@ -166,7 +166,7 @@ public: // Insert instructions from the loop that we depend on. for (Value *V : I->operand_values()) { auto *I = dyn_cast<Instruction>(V); - if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) + if (I && OrigLoop->contains(I->getParent()) && Set.insert(I)) Worklist.push_back(I); } } @@ -231,17 +231,16 @@ public: } } - void print() const { - if (DepCycle) - dbgs() << " (cycle)\n"; + void print(raw_ostream &OS) const { + OS << (DepCycle ? " (cycle)\n" : "\n"); for (auto *I : Set) // Prefix with the block name. 
- dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; + OS << " " << I->getParent()->getName() << ":" << *I << "\n"; } - void printBlocks() const { + void printBlocks(raw_ostream &OS) const { for (auto *BB : getDistributedLoop()->getBlocks()) - dbgs() << *BB; + OS << *BB; } private: @@ -368,11 +367,11 @@ public: std::tie(LoadToPart, NewElt) = LoadToPartition.insert(std::make_pair(Inst, PartI)); if (!NewElt) { - LLVM_DEBUG(dbgs() - << "Merging partitions due to this load in multiple " - << "partitions: " << PartI << ", " << LoadToPart->second - << "\n" - << *Inst << "\n"); + LLVM_DEBUG( + dbgs() + << "LDist: Merging partitions due to this load in multiple " + << "partitions: " << PartI << ", " << LoadToPart->second << "\n" + << *Inst << "\n"); auto PartJ = I; do { @@ -530,8 +529,8 @@ public: void print(raw_ostream &OS) const { unsigned Index = 0; for (const auto &P : PartitionContainer) { - OS << "Partition " << Index++ << " (" << &P << "):\n"; - P.print(); + OS << "LDist: Partition " << Index++ << ":"; + P.print(OS); } } @@ -545,11 +544,11 @@ public: } #endif - void printBlocks() const { + void printBlocks(raw_ostream &OS) const { unsigned Index = 0; for (const auto &P : PartitionContainer) { - dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; - P.printBlocks(); + OS << "LDist: Partition " << Index++ << ":"; + P.printBlocks(OS); } } @@ -628,7 +627,7 @@ public: const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); - LLVM_DEBUG(dbgs() << "Backward dependences:\n"); + LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n"); for (const auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program @@ -659,9 +658,9 @@ public: bool processLoop() { assert(L->isInnermost() && "Only process inner loops."); - LLVM_DEBUG(dbgs() << "\nLDist: In \"" - << L->getHeader()->getParent()->getName() - << "\" checking " << *L << "\n"); + LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '" + << L->getHeader()->getParent()->getName() << "' from " + << L->getLocStr() << "\n"); // Having a single exit block implies there's also one exiting block. if (!L->getExitBlock()) @@ -686,6 +685,9 @@ public: if (!Dependences || Dependences->empty()) return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); + LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: " + << L->getHeader()->getName() << "\n"); + InstPartitionContainer Partitions(L, LI, DT); // First, go through each memory operation and assign them to consecutive @@ -735,7 +737,7 @@ public: for (auto *Inst : DefsUsedOutside) Partitions.addToNewNonCyclicPartition(Inst); - LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); @@ -743,19 +745,19 @@ public: // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); - LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); // Now, populate the partitions with non-memory operations. 
Partitions.populateUsedSet(); - LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions); // In order to preserve original lexical order for loads, keep them in the // partition that we set up in the MemoryInstructionDependences loop. if (Partitions.mergeToAvoidDuplicatedLoads()) { - LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" + LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", @@ -779,7 +781,8 @@ public: if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L)) return fail("HeuristicDisabled", "distribution heuristic disabled"); - LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); + LLVM_DEBUG(dbgs() << "LDist: Distributing loop: " + << L->getHeader()->getName() << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. Partitions.setupPartitionIdOnInstructions(); @@ -807,7 +810,7 @@ public: MDNode *OrigLoopID = L->getLoopID(); - LLVM_DEBUG(dbgs() << "\nPointers:\n"); + LLVM_DEBUG(dbgs() << "LDist: Pointers:\n"); LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE); LVer.versionLoop(DefsUsedOutside); @@ -830,8 +833,8 @@ public: // Now, we remove the instruction from each loop that don't belong to that // partition. Partitions.removeUnusedInsts(); - LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); - LLVM_DEBUG(Partitions.printBlocks()); + LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n"); + LLVM_DEBUG(Partitions.printBlocks(dbgs())); if (LDistVerify) { LI->verify(*DT); @@ -853,7 +856,7 @@ public: LLVMContext &Ctx = F->getContext(); bool Forced = isForced().value_or(false); - LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); + LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n"); // With Rpass-missed report that distribution failed. ORE->emit([&]() { @@ -962,11 +965,10 @@ private: } // end anonymous namespace -/// Shared implementation between new and old PMs. static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, LoopAccessInfoManager &LAIs) { - // Build up a worklist of inner-loops to vectorize. This is necessary as the + // Build up a worklist of inner-loops to distribute. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators // across the loops. 
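An illustrative source-level sketch (not part of the patch; the function and array names are made up) of the transformation that the partition bookkeeping above supports: a loop whose only cyclic dependence is a recurrence on A is split so that the independent work on B can be vectorized on its own.

    // Before: a single loop; the recurrence on A blocks vectorization of the
    // independent work on B.
    void before(int *A, int *B, const int *C, const int *D, int n) {
      for (int i = 0; i < n - 1; ++i) {
        A[i + 1] = A[i] + 1;   // cyclic partition (loop-carried dependence)
        B[i] = C[i] * D[i];    // non-cyclic partition
      }
    }

    // After distribution (conceptually): the second loop is vectorizable.
    void after(int *A, int *B, const int *C, const int *D, int n) {
      for (int i = 0; i < n - 1; ++i)
        A[i + 1] = A[i] + 1;
      for (int i = 0; i < n - 1; ++i)
        B[i] = C[i] * D[i];
    }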
SmallVector<Loop *, 8> Worklist; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 533cefaf1061..d5e91d3c1dec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -70,6 +70,7 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <optional> @@ -97,6 +98,10 @@ static cl::opt<bool> cl::desc("Widen the loop induction variables, if possible, so " "overflow checks won't reject flattening")); +static cl::opt<bool> + VersionLoops("loop-flatten-version-loops", cl::Hidden, cl::init(true), + cl::desc("Version loops if flattened loop could overflow")); + namespace { // We require all uses of both induction variables to match this pattern: // @@ -141,6 +146,8 @@ struct FlattenInfo { // has been applied. Used to skip // checks on phi nodes. + Value *NewTripCount = nullptr; // The tripcount of the flattened loop. + FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){}; bool isNarrowInductionPhi(PHINode *Phi) { @@ -637,7 +644,7 @@ static bool checkIVUsers(FlattenInfo &FI) { static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT, AssumptionCache *AC) { Function *F = FI.OuterLoop->getHeader()->getParent(); - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = F->getDataLayout(); // For debugging/testing. if (AssumeNoOverflow) @@ -752,11 +759,13 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ORE.emit(Remark); } - Value *NewTripCount = BinaryOperator::CreateMul( - FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount", - FI.OuterLoop->getLoopPreheader()->getTerminator()); - LLVM_DEBUG(dbgs() << "Created new trip count in preheader: "; - NewTripCount->dump()); + if (!FI.NewTripCount) { + FI.NewTripCount = BinaryOperator::CreateMul( + FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount", + FI.OuterLoop->getLoopPreheader()->getTerminator()->getIterator()); + LLVM_DEBUG(dbgs() << "Created new trip count in preheader: "; + FI.NewTripCount->dump()); + } // Fix up PHI nodes that take values from the inner loop back-edge, which // we are about to remove. @@ -769,13 +778,15 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // Modify the trip count of the outer loop to be the product of the two // trip counts. - cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount); + cast<User>(FI.OuterBranch->getCondition())->setOperand(1, FI.NewTripCount); // Replace the inner loop backedge with an unconditional branch to the exit. BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock(); BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock(); - InnerExitingBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(InnerExitBlock, InnerExitingBlock); + Instruction *Term = InnerExitingBlock->getTerminator(); + Instruction *BI = BranchInst::Create(InnerExitBlock, InnerExitingBlock); + BI->setDebugLoc(Term->getDebugLoc()); + Term->eraseFromParent(); // Update the DomTree and MemorySSA. 
DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader()); @@ -799,8 +810,10 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // we need to insert the new GEP where the old GEP was. if (!DT->dominates(Base, &*Builder.GetInsertPoint())) Builder.SetInsertPoint(cast<Instruction>(V)); - OuterValue = Builder.CreateGEP(GEP->getSourceElementType(), Base, - OuterValue, "flatten." + V->getName()); + OuterValue = + Builder.CreateGEP(GEP->getSourceElementType(), Base, OuterValue, + "flatten." + V->getName(), + GEP->isInBounds() && InnerGEP->isInBounds()); } LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); dbgs() << "with: "; @@ -891,7 +904,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, const TargetTransformInfo *TTI, LPMUpdater *U, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, + const LoopAccessInfo &LAI) { LLVM_DEBUG( dbgs() << "Loop flattening running on outer loop " << FI.OuterLoop->getHeader()->getName() << " and inner loop " @@ -926,18 +940,55 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // variable might overflow. In this case, we need to version the loop, and // select the original version at runtime if the iteration space is too // large. - // TODO: We currently don't version the loop. OverflowResult OR = checkOverflow(FI, DT, AC); if (OR == OverflowResult::AlwaysOverflowsHigh || OR == OverflowResult::AlwaysOverflowsLow) { LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n"); return false; } else if (OR == OverflowResult::MayOverflow) { - LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n"); - return false; + Module *M = FI.OuterLoop->getHeader()->getParent()->getParent(); + const DataLayout &DL = M->getDataLayout(); + if (!VersionLoops) { + LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n"); + return false; + } else if (!DL.isLegalInteger( + FI.OuterTripCount->getType()->getScalarSizeInBits())) { + // If the trip count type isn't legal then it won't be possible to check + // for overflow using only a single multiply instruction, so don't + // flatten. + LLVM_DEBUG( + dbgs() << "Can't check overflow efficiently, not flattening\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n"); + + // Version the loop. The overflow check isn't a runtime pointer check, so we + // pass an empty list of runtime pointer checks, causing LoopVersioning to + // emit 'false' as the branch condition, and add our own check afterwards. + BasicBlock *CheckBlock = FI.OuterLoop->getLoopPreheader(); + ArrayRef<RuntimePointerCheck> Checks(nullptr, nullptr); + LoopVersioning LVer(LAI, Checks, FI.OuterLoop, LI, DT, SE); + LVer.versionLoop(); + + // Check for overflow by calculating the new tripcount using + // umul_with_overflow and then checking if it overflowed. 
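At the source level, the versioned structure being built here is roughly the following (illustrative sketch only, not part of the patch; __builtin_umull_overflow is the GCC/Clang builtin corresponding to the llvm.umul.with.overflow intrinsic created just below, and the loop body is assumed to index only through the linear expression i * N + j):

    // Original nested loops.
    void before(int *A, unsigned long M, unsigned long N) {
      for (unsigned long i = 0; i < M; ++i)
        for (unsigned long j = 0; j < N; ++j)
          A[i * N + j] = 0;
    }

    // Flattened with a runtime overflow guard (conceptually).
    void after(int *A, unsigned long M, unsigned long N) {
      unsigned long Flat;
      if (__builtin_umull_overflow(M, N, &Flat)) {
        // Trip-count multiply overflows: run the original (versioned) loops.
        for (unsigned long i = 0; i < M; ++i)
          for (unsigned long j = 0; j < N; ++j)
            A[i * N + j] = 0;
      } else {
        // No overflow: a single flattened loop with trip count M * N.
        for (unsigned long k = 0; k < Flat; ++k)
          A[k] = 0;
      }
    }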
+ BranchInst *Br = cast<BranchInst>(CheckBlock->getTerminator()); + assert(Br->isConditional() && + "Expected LoopVersioning to generate a conditional branch"); + assert(match(Br->getCondition(), m_Zero()) && + "Expected branch condition to be false"); + IRBuilder<> Builder(Br); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, + FI.OuterTripCount->getType()); + Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount}, + "flatten.mul"); + FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); + Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow"); + Br->setCondition(Overflow); + } else { + LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); } - LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); } @@ -958,13 +1009,15 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. + LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr); for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) continue; FlattenInfo FI(OuterLoop, InnerLoop); - Changed |= FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, - MSSAU ? &*MSSAU : nullptr); + Changed |= + FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, + MSSAU ? &*MSSAU : nullptr, LAIM.getInfo(*OuterLoop)); } if (!Changed) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp index e0b224d5ef73..8512b2accbe7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1684,7 +1684,7 @@ private: PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0"); L1HeaderPHI->insertBefore(L1HeaderIP); L1HeaderPHI->addIncoming(LCV, FC0.Latch); - L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + L1HeaderPHI->addIncoming(PoisonValue::get(LCV->getType()), FC0.ExitingBlock); LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); @@ -2072,7 +2072,7 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); // Ensure loops are in simplifed form which is a pre-requisite for loop fusion // pass. Added only for new PM since the legacy PM has already added diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 3721564890dd..0ee1afa76a82 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -22,8 +22,6 @@ // // Future loop memory idioms to recognize: // memcmp, strlen, etc. -// Future floating point idioms to recognize in -ffast-math mode: -// fpowi // // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). 
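The LoopIdiomRecognize hunks below add recognizeShiftUntilLessThan, which recognizes loops that shift a value right until it drops below a small constant. A source-level sketch of the floor-log2 shape being targeted and of the ctlz form it becomes (illustrative only; the names are made up, __builtin_clz is the GCC/Clang counterpart of llvm.ctlz, and a 32-bit unsigned is assumed):

    // Recognizable shape: a precondition "x < 2" skips the loop; the body
    // counts iterations while shifting x right and exits once the pre-shift
    // value drops below 4.  For x >= 2 this computes floor(log2(x)).
    unsigned floor_log2_loop(unsigned x) {
      unsigned cnt = 0;
      if (x < 2)
        return cnt;
      for (;;) {
        ++cnt;             // cnt.next = cnt + 1
        unsigned prev = x;
        x >>= 1;           // x.next = x >> 1
        if (prev < 4)      // unsigned less-than test on the pre-shift value
          break;
      }
      return cnt;
    }

    // After the rewrite: cnt = bitwidth - 1 - ctlz(x), i.e. the
    // "log2(x) = w - 1 - clz(x)" form with the extra subtract (InsertSub).
    unsigned floor_log2_ctlz(unsigned x) {
      return x < 2 ? 0 : 31u - (unsigned)__builtin_clz(x);
    }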
@@ -233,12 +231,19 @@ private: bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); + bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX, + bool ZeroCheck, size_t CanonicalSize); + bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX, + Instruction *DefX, PHINode *CntPhi, + Instruction *CntInst); bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz + bool recognizeShiftUntilLessThan(); void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, - bool IsCntPhiUsedOutsideLoop); + bool IsCntPhiUsedOutsideLoop, + bool InsertSub = false); bool recognizeShiftUntilBitTest(); bool recognizeShiftUntilZero(); @@ -253,7 +258,7 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, if (DisableLIRP::All) return PreservedAnalyses::all(); - const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + const auto *DL = &L.getHeader()->getDataLayout(); // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations @@ -1107,7 +1112,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = GV; NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - + // Set the TBAA info if present. if (AATags.TBAA) NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA); @@ -1117,7 +1122,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( if (AATags.NoAlias) NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias); - } + } NewCall->setDebugLoc(TheStore->getDebugLoc()); @@ -1484,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << CurLoop->getHeader()->getName() << "\n"); return recognizePopcount() || recognizeAndInsertFFS() || - recognizeShiftUntilBitTest() || recognizeShiftUntilZero(); + recognizeShiftUntilBitTest() || recognizeShiftUntilZero() || + recognizeShiftUntilLessThan(); } /// Check if the given conditional branch is based on the comparison between @@ -1519,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, return nullptr; } +/// Check if the given conditional branch is based on an unsigned less-than +/// comparison between a variable and a constant, and if the comparison is false +/// the control yields to the loop entry. If the branch matches the behaviour, +/// the variable involved in the comparison is returned. +static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry, + APInt &Threshold) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1)); + if (!CmpConst) + return nullptr; + + BasicBlock *FalseSucc = BI->getSuccessor(1); + ICmpInst::Predicate Pred = Cond->getPredicate(); + + if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) { + Threshold = CmpConst->getValue(); + return Cond->getOperand(0); + } + + return nullptr; +} + // Check if the recurrence variable `VarX` is in the right form to create // the idiom. Returns the value coerced to a PHINode if so. 
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, @@ -1530,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, return nullptr; } +/// Return true if the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) +/// or nullptr if there is no such. +/// 2) \p CntPhi is set to the corresponding phi node +/// or nullptr if there is no such. +/// 3) \p InitX is set to the value whose CTLZ could be used. +/// 4) \p DefX is set to the instruction calculating Loop exit condition. +/// 5) \p Threshold is set to the constant involved in the unsigned less-than +/// comparison. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 < 2) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val +/// do { +/// x = phi (x0, x.next); //PhiX +/// cnt = phi (cnt0, cnt.next) +/// +/// cnt.next = cnt + 1; +/// ... +/// x.next = x >> 1; // DefX +/// } while (x >= 4) +/// loop-exit: +/// \endcode +static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, + Intrinsic::ID &IntrinID, + Value *&InitX, Instruction *&CntInst, + PHINode *&CntPhi, Instruction *&DefX, + APInt &Threshold) { + BasicBlock *LoopEntry; + + DefX = nullptr; + CntInst = nullptr; + CntPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. + if (Value *T = matchShiftULTCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry, + Threshold)) + DefX = dyn_cast<Instruction>(T); + else + return false; + + // step 2: Check the recurrence of variable X + if (!DefX || !isa<PHINode>(DefX)) + return false; + + PHINode *VarPhi = cast<PHINode>(DefX); + int Idx = VarPhi->getBasicBlockIndex(LoopEntry); + if (Idx == -1) + return false; + + DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx)); + if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi) + return false; + + // step 3: detect instructions corresponding to "x.next = x >> 1" + if (DefX->getOpcode() != Instruction::LShr) + return false; + + IntrinID = Intrinsic::ctlz; + ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; + + InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader()); + + // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 + // or cnt.next = cnt + -1. + // TODO: We can skip the step. If loop trip count is known (CTLZ), + // then all uses of "cnt.next" could be optimized to the trip count + // plus "cnt0". Currently it is not optimized. + // This step could be used to detect POPCNT instruction: + // cnt.next = cnt + (x.next & 1) + for (Instruction &Inst : llvm::make_range( + LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + if (Inst.getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); + if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) + continue; + + PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); + if (!Phi) + continue; + + CntInst = &Inst; + CntPhi = Phi; + break; + } + if (!CntInst) + return false; + + return true; +} + /// Return true iff the idiom is detected in the loop. 
/// /// Additionally: @@ -1758,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, return true; } -/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop -/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new -/// trip count returns true; otherwise, returns false. -bool LoopIdiomRecognize::recognizeAndInsertFFS() { - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; +// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always +// profitable if we delete the loop. +bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID, + Value *InitX, bool ZeroCheck, + size_t CanonicalSize) { + const Value *Args[] = {InitX, + ConstantInt::getBool(InitX->getContext(), ZeroCheck)}; - Intrinsic::ID IntrinID; - Value *InitX; - Instruction *DefX = nullptr; - PHINode *CntPhi = nullptr; - Instruction *CntInst = nullptr; - // Help decide if transformation is profitable. For ShiftUntilZero idiom, - // this is always 6. - size_t IdiomCanonicalSize = 6; + // @llvm.dbg doesn't count as they have no semantic effect. + auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); + uint32_t HeaderSize = + std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); - if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, - CntInst, CntPhi, DefX)) + IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args); + InstructionCost Cost = TTI->getIntrinsicInstrCost( + Attrs, TargetTransformInfo::TCK_SizeAndLatency); + if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic) return false; + return true; +} + +/// Convert CTLZ / CTTZ idiom loop into countable loop. +/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise, +/// returns false. +bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID, + Value *InitX, Instruction *DefX, + PHINode *CntPhi, + Instruction *CntInst) { bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) if (!CurLoop->contains(cast<Instruction>(U))) { @@ -1820,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() { ZeroCheck = true; } - // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always - // profitable if we delete the loop. - - // the loop has only 6 instructions: + // FFS idiom loop has only 6 instructions: // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] // %shr = ashr %n.addr.0, 1 // %tobool = icmp eq %shr, 0 // %inc = add nsw %i.0, 1 // br i1 %tobool + size_t IdiomCanonicalSize = 6; + if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) + return false; - const Value *Args[] = {InitX, - ConstantInt::getBool(InitX->getContext(), ZeroCheck)}; + transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, + DefX->getDebugLoc(), ZeroCheck, + IsCntPhiUsedOutsideLoop); + return true; +} - // @llvm.dbg doesn't count as they have no semantic effect. - auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); - uint32_t HeaderSize = - std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); +/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop +/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new +/// trip count returns true; otherwise, returns false. 
+bool LoopIdiomRecognize::recognizeAndInsertFFS() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; - IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args); - InstructionCost Cost = - TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency); - if (HeaderSize != IdiomCanonicalSize && - Cost > TargetTransformInfo::TCC_Basic) + Intrinsic::ID IntrinID; + Value *InitX; + Instruction *DefX = nullptr; + PHINode *CntPhi = nullptr; + Instruction *CntInst = nullptr; + + if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi, + DefX)) + return false; + + return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); +} + +bool LoopIdiomRecognize::recognizeShiftUntilLessThan() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + Intrinsic::ID IntrinID; + Value *InitX; + Instruction *DefX = nullptr; + PHINode *CntPhi = nullptr; + Instruction *CntInst = nullptr; + + APInt LoopThreshold; + if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, + CntPhi, DefX, LoopThreshold)) + return false; + + if (LoopThreshold == 2) { + // Treat as regular FFS. + return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); + } + + // Look for Floor Log2 Idiom. + if (LoopThreshold != 4) + return false; + + // Abort if CntPhi is used outside of the loop. + for (User *U : CntPhi->users()) + if (!CurLoop->contains(cast<Instruction>(U))) + return false; + + // It is safe to assume Preheader exist as it was checked in + // parent function RunOnLoop. + BasicBlock *PH = CurLoop->getLoopPreheader(); + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + if (!PreCondBI) + return false; + + APInt PreLoopThreshold; + if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX || + PreLoopThreshold != 2) return false; + bool ZeroCheck = true; + + // the loop has only 6 instructions: + // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] + // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] + // %shr = ashr %n.addr.0, 1 + // %tobool = icmp ult %n.addr.0, C + // %inc = add nsw %i.0, 1 + // br i1 %tobool + size_t IdiomCanonicalSize = 6; + if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) + return false; + + // log2(x) = w − 1 − clz(x) transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, DefX->getDebugLoc(), ZeroCheck, - IsCntPhiUsedOutsideLoop); + /*IsCntPhiUsedOutsideLoop=*/false, + /*InsertSub=*/true); return true; } @@ -1963,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, void LoopIdiomRecognize::transformLoopToCountable( Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) { BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator()); // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block @@ -1993,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable( Type *CountTy = Count->getType(); Count = Builder.CreateSub( ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count); + if (InsertSub) + Count = 
Builder.CreateSub(Count, ConstantInt::get(CountTy, 1)); Value *NewCount = Count; if (IsCntPhiUsedOutsideLoop) Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1)); @@ -2409,15 +2626,15 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) { // BitMask may be computed from BitPos, Freeze BitPos so we can increase // it's use count. - Instruction *InsertPt = nullptr; + std::optional<BasicBlock::iterator> InsertPt = std::nullopt; if (auto *BitPosI = dyn_cast<Instruction>(BitPos)) - InsertPt = &**BitPosI->getInsertionPointAfterDef(); + InsertPt = BitPosI->getInsertionPointAfterDef(); else - InsertPt = &*DT->getRoot()->getFirstNonPHIOrDbgOrAlloca(); + InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca(); if (!InsertPt) return false; FreezeInst *BitPosFrozen = - new FreezeInst(BitPos, BitPos->getName() + ".fr", InsertPt); + new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt); BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) { return U.getUser() != BitPosFrozen; }); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index cfe069d00bce..270c2120365c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -45,7 +45,7 @@ STATISTIC(NumSimplified, "Number of redundant instructions simplified"); static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, const TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU) { - const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L.getHeader()->getDataLayout(); SimplifyQuery SQ(DL, &TLI, &DT, &AC); // On the first pass over the loop body we try to simplify every instruction. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 277f530ee25f..400973fd9fc9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -976,7 +976,7 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, } if (!findInductions(InnerLoop, InnerLoopInductions)) { - LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n"); + LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n"); return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 5ec387300aac..489f12e689d3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -96,7 +96,7 @@ struct StoreToLoadForwardingCandidate { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); - auto &DL = Load->getParent()->getModule()->getDataLayout(); + auto &DL = Load->getDataLayout(); assert(LoadPtr->getType()->getPointerAddressSpace() == StorePtr->getType()->getPointerAddressSpace() && @@ -126,8 +126,10 @@ struct StoreToLoadForwardingCandidate { // We don't need to check non-wrapping here because forward/backward // dependence wouldn't be valid if these weren't monotonic accesses. 
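For context, an illustrative sketch (not part of the patch; use() and the other names are made up, and A and B are assumed not to alias) of the kind of candidate the constant-distance test below accepts: the store in iteration i writes the value that iteration i + 1 loads, at a distance of sizeof(int) * stride = 4 bytes, so the load can be replaced by a value carried across iterations.

    void use(int);

    // Before: every iteration reloads A[i], which the previous iteration stored.
    void before(int *A, const int *B, int n) {
      for (int i = 0; i < n - 1; ++i) {
        A[i + 1] = B[i];   // store, one iteration ahead of the load
        use(A[i]);         // load fed by the previous iteration's store
      }
    }

    // After forwarding (conceptually): the load becomes a recurrence, seeded by
    // an initial load before the loop ("load_initial") and carried by a PHI
    // ("store_forwarded").
    void after(int *A, const int *B, int n) {
      if (n - 1 <= 0)
        return;
      int carried = A[0];  // initial load, hoisted out of the loop
      for (int i = 0; i < n - 1; ++i) {
        A[i + 1] = B[i];
        use(carried);
        carried = B[i];    // the next iteration observes the stored value
      }
    }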
- auto *Dist = cast<SCEVConstant>( + auto *Dist = dyn_cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + if (!Dist) + return false; const APInt &Val = Dist->getAPInt(); return Val == TypeByteSize * StrideLoad; } @@ -181,7 +183,8 @@ public: findStoreToLoadDependences(const LoopAccessInfo &LAI) { std::forward_list<StoreToLoadForwardingCandidate> Candidates; - const auto *Deps = LAI.getDepChecker().getDependences(); + const auto &DepChecker = LAI.getDepChecker(); + const auto *Deps = DepChecker.getDependences(); if (!Deps) return Candidates; @@ -192,8 +195,8 @@ public: SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence; for (const auto &Dep : *Deps) { - Instruction *Source = Dep.getSource(LAI); - Instruction *Destination = Dep.getDestination(LAI); + Instruction *Source = Dep.getSource(DepChecker); + Instruction *Destination = Dep.getDestination(DepChecker); if (Dep.Type == MemoryDepChecker::Dependence::Unknown || Dep.Type == MemoryDepChecker::Dependence::IndirectUnsafe) { @@ -222,7 +225,7 @@ public: // Only propagate if the stored values are bit/pointer castable. if (!CastInst::isBitOrNoopPointerCastable( getLoadStoreType(Store), getLoadStoreType(Load), - Store->getParent()->getModule()->getDataLayout())) + Store->getDataLayout())) continue; Candidates.emplace_front(Load, Store); @@ -349,19 +352,20 @@ public: // ld0. LoadInst *LastLoad = - std::max_element(Candidates.begin(), Candidates.end(), - [&](const StoreToLoadForwardingCandidate &A, - const StoreToLoadForwardingCandidate &B) { - return getInstrIndex(A.Load) < getInstrIndex(B.Load); - }) + llvm::max_element(Candidates, + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Load) < + getInstrIndex(B.Load); + }) ->Load; StoreInst *FirstStore = - std::min_element(Candidates.begin(), Candidates.end(), - [&](const StoreToLoadForwardingCandidate &A, - const StoreToLoadForwardingCandidate &B) { - return getInstrIndex(A.Store) < - getInstrIndex(B.Store); - }) + llvm::min_element(Candidates, + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Store) < + getInstrIndex(B.Store); + }) ->Store; // We're looking for stores after the first forwarding store until the end @@ -440,9 +444,14 @@ public: assert(PH && "Preheader should exist!"); Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); - Value *Initial = new LoadInst( - Cand.Load->getType(), InitialPtr, "load_initial", - /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator()); + Value *Initial = + new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial", + /* isVolatile */ false, Cand.Load->getAlign(), + PH->getTerminator()->getIterator()); + // We don't give any debug location to Initial, because it is inserted + // into the loop's preheader. A debug location inside the loop will cause + // a misleading stepping when debugging. The test update-debugloc-store + // -forwarded.ll checks this. 
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded"); PHI->insertBefore(L->getHeader()->begin()); @@ -450,20 +459,27 @@ public: Type *LoadType = Initial->getType(); Type *StoreType = Cand.Store->getValueOperand()->getType(); - auto &DL = Cand.Load->getParent()->getModule()->getDataLayout(); + auto &DL = Cand.Load->getDataLayout(); (void)DL; assert(DL.getTypeSizeInBits(LoadType) == DL.getTypeSizeInBits(StoreType) && "The type sizes should match!"); Value *StoreValue = Cand.Store->getValueOperand(); - if (LoadType != StoreType) - StoreValue = CastInst::CreateBitOrPointerCast( - StoreValue, LoadType, "store_forward_cast", Cand.Store); + if (LoadType != StoreType) { + StoreValue = CastInst::CreateBitOrPointerCast(StoreValue, LoadType, + "store_forward_cast", + Cand.Store->getIterator()); + // Because it casts the old `load` value and is used by the new `phi` + // which replaces the old `load`, we give the `load`'s debug location + // to it. + cast<Instruction>(StoreValue)->setDebugLoc(Cand.Load->getDebugLoc()); + } PHI->addIncoming(StoreValue, L->getLoopLatch()); Cand.Load->replaceAllUsesWith(PHI); + PHI->setDebugLoc(Cand.Load->getDebugLoc()); } /// Top-level driver for each loop: find store->load forwarding @@ -601,7 +617,7 @@ public: // Next, propagate the value stored by the store to the users of the load. // Also for the first iteration, generate the initial value of the load. - SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), + SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getDataLayout(), "storeforward"); for (const auto &Cand : Candidates) propagateStoredValueToLoadUsers(Cand, SEE); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp deleted file mode 100644 index 7f62526a4f6d..000000000000 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ /dev/null @@ -1,1679 +0,0 @@ -//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass implements a simple loop reroller. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopReroll.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include <cassert> -#include <cstddef> -#include <cstdint> -#include <iterator> -#include <map> -#include <utility> - -using namespace llvm; - -#define DEBUG_TYPE "loop-reroll" - -STATISTIC(NumRerolledLoops, "Number of rerolled loops"); - -static cl::opt<unsigned> -NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), - cl::Hidden, - cl::desc("The maximum number of failures to tolerate" - " during fuzzy matching. (default: 400)")); - -// This loop re-rolling transformation aims to transform loops like this: -// -// int foo(int a); -// void bar(int *x) { -// for (int i = 0; i < 500; i += 3) { -// foo(i); -// foo(i+1); -// foo(i+2); -// } -// } -// -// into a loop like this: -// -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) -// foo(i); -// } -// -// It does this by looking for loops that, besides the latch code, are composed -// of isomorphic DAGs of instructions, with each DAG rooted at some increment -// to the induction variable, and where each DAG is isomorphic to the DAG -// rooted at the induction variable (excepting the sub-DAGs which root the -// other induction-variable increments). In other words, we're looking for loop -// bodies of the form: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// where each f(i) is a set of instructions that, collectively, are a function -// only of i (and other loop-invariant values). 
-// -// As a special case, we can also reroll loops like this: -// -// int foo(int); -// void bar(int *x) { -// for (int i = 0; i < 500; ++i) { -// x[3*i] = foo(0); -// x[3*i+1] = foo(0); -// x[3*i+2] = foo(0); -// } -// } -// -// into this: -// -// void bar(int *x) { -// for (int i = 0; i < 1500; ++i) -// x[i] = foo(0); -// } -// -// in which case, we're looking for inputs like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit - -namespace { - - enum IterationLimits { - /// The maximum number of iterations that we'll try and reroll. - IL_MaxRerollIterations = 32, - /// The bitvector index used by loop induction variables and other - /// instructions that belong to all iterations. - IL_All, - IL_End - }; - - class LoopReroll { - public: - LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE, - TargetLibraryInfo *TLI, DominatorTree *DT, bool PreserveLCSSA) - : AA(AA), LI(LI), SE(SE), TLI(TLI), DT(DT), - PreserveLCSSA(PreserveLCSSA) {} - bool runOnLoop(Loop *L); - - protected: - AliasAnalysis *AA; - LoopInfo *LI; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - DominatorTree *DT; - bool PreserveLCSSA; - - using SmallInstructionVector = SmallVector<Instruction *, 16>; - using SmallInstructionSet = SmallPtrSet<Instruction *, 16>; - using TinyInstructionVector = SmallVector<Instruction *, 1>; - - // Map between induction variable and its increment - DenseMap<Instruction *, int64_t> IVToIncMap; - - // For loop with multiple induction variables, remember the ones used only to - // control the loop. - TinyInstructionVector LoopControlIVs; - - // A chain of isomorphic instructions, identified by a single-use PHI - // representing a reduction. Only the last value may be used outside the - // loop. - struct SimpleLoopReduction { - SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { - assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); - add(L); - } - - bool valid() const { - return Valid; - } - - Instruction *getPHI() const { - assert(Valid && "Using invalid reduction"); - return Instructions.front(); - } - - Instruction *getReducedValue() const { - assert(Valid && "Using invalid reduction"); - return Instructions.back(); - } - - Instruction *get(size_t i) const { - assert(Valid && "Using invalid reduction"); - return Instructions[i+1]; - } - - Instruction *operator [] (size_t i) const { return get(i); } - - // The size, ignoring the initial PHI. 
- size_t size() const { - assert(Valid && "Using invalid reduction"); - return Instructions.size()-1; - } - - using iterator = SmallInstructionVector::iterator; - using const_iterator = SmallInstructionVector::const_iterator; - - iterator begin() { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - const_iterator begin() const { - assert(Valid && "Using invalid reduction"); - return std::next(Instructions.begin()); - } - - iterator end() { return Instructions.end(); } - const_iterator end() const { return Instructions.end(); } - - protected: - bool Valid = false; - SmallInstructionVector Instructions; - - void add(Loop *L); - }; - - // The set of all reductions, and state tracking of possible reductions - // during loop instruction processing. - struct ReductionTracker { - using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>; - - // Add a new possible reduction. - void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } - - // Setup to track possible reductions corresponding to the provided - // rerolling scale. Only reductions with a number of non-PHI instructions - // that is divisible by the scale are considered. Three instructions sets - // are filled in: - // - A set of all possible instructions in eligible reductions. - // - A set of all PHIs in eligible reductions - // - A set of all reduced values (last instructions) in eligible - // reductions. - void restrictToScale(uint64_t Scale, - SmallInstructionSet &PossibleRedSet, - SmallInstructionSet &PossibleRedPHISet, - SmallInstructionSet &PossibleRedLastSet) { - PossibleRedIdx.clear(); - PossibleRedIter.clear(); - Reds.clear(); - - for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) - if (PossibleReds[i].size() % Scale == 0) { - PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); - PossibleRedPHISet.insert(PossibleReds[i].getPHI()); - - PossibleRedSet.insert(PossibleReds[i].getPHI()); - PossibleRedIdx[PossibleReds[i].getPHI()] = i; - for (Instruction *J : PossibleReds[i]) { - PossibleRedSet.insert(J); - PossibleRedIdx[J] = i; - } - } - } - - // The functions below are used while processing the loop instructions. - - // Are the two instructions both from reductions, and furthermore, from - // the same reduction? - bool isPairInSame(Instruction *J1, Instruction *J2) { - DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); - if (J1I != PossibleRedIdx.end()) { - DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); - if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) - return true; - } - - return false; - } - - // The two provided instructions, the first from the base iteration, and - // the second from iteration i, form a matched pair. If these are part of - // a reduction, record that fact. - void recordPair(Instruction *J1, Instruction *J2, unsigned i) { - if (PossibleRedIdx.count(J1)) { - assert(PossibleRedIdx.count(J2) && - "Recording reduction vs. non-reduction instruction?"); - - PossibleRedIter[J1] = 0; - PossibleRedIter[J2] = i; - - int Idx = PossibleRedIdx[J1]; - assert(Idx == PossibleRedIdx[J2] && - "Recording pair from different reductions?"); - Reds.insert(Idx); - } - } - - // The functions below can be called after we've finished processing all - // instructions in the loop, and we know which reductions were selected. - - bool validateSelected(); - void replaceSelected(); - - protected: - // The vector of all possible reductions (for any scale). 
- SmallReductionVector PossibleReds; - - DenseMap<Instruction *, int> PossibleRedIdx; - DenseMap<Instruction *, int> PossibleRedIter; - DenseSet<int> Reds; - }; - - // A DAGRootSet models an induction variable being used in a rerollable - // loop. For example, - // - // x[i*3+0] = y1 - // x[i*3+1] = y2 - // x[i*3+2] = y3 - // - // Base instruction -> i*3 - // +---+----+ - // / | \ - // ST[y1] +1 +2 <-- Roots - // | | - // ST[y2] ST[y3] - // - // There may be multiple DAGRoots, for example: - // - // x[i*2+0] = ... (1) - // x[i*2+1] = ... (1) - // x[i*2+4] = ... (2) - // x[i*2+5] = ... (2) - // x[(i+1234)*2+5678] = ... (3) - // x[(i+1234)*2+5679] = ... (3) - // - // The loop will be rerolled by adding a new loop induction variable, - // one for the Base instruction in each DAGRootSet. - // - struct DAGRootSet { - Instruction *BaseInst; - SmallInstructionVector Roots; - - // The instructions between IV and BaseInst (but not including BaseInst). - SmallInstructionSet SubsumedInsts; - }; - - // The set of all DAG roots, and state tracking of all roots - // for a particular induction variable. - struct DAGRootTracker { - DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, - ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA, - DenseMap<Instruction *, int64_t> &IncrMap, - TinyInstructionVector LoopCtrlIVs) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), - PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap), - LoopControlIVs(LoopCtrlIVs) {} - - /// Stage 1: Find all the DAG roots for the induction variable. - bool findRoots(); - - /// Stage 2: Validate if the found roots are valid. - bool validate(ReductionTracker &Reductions); - - /// Stage 3: Assuming validate() returned true, perform the - /// replacement. - /// @param BackedgeTakenCount The backedge-taken count of L. - void replace(const SCEV *BackedgeTakenCount); - - protected: - using UsesTy = MapVector<Instruction *, BitVector>; - - void findRootsRecursive(Instruction *IVU, - SmallInstructionSet SubsumedInsts); - bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); - bool collectPossibleRoots(Instruction *Base, - std::map<int64_t,Instruction*> &Roots); - bool validateRootSet(DAGRootSet &DRS); - - bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); - void collectInLoopUserSet(const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - void collectInLoopUserSet(Instruction *Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - - UsesTy::iterator nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI=nullptr); - bool isBaseInst(Instruction *I); - bool isRootInst(Instruction *I); - bool instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End); - void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr); - - LoopReroll *Parent; - - // Members of Parent, replicated here for brevity. - Loop *L; - ScalarEvolution *SE; - AliasAnalysis *AA; - TargetLibraryInfo *TLI; - DominatorTree *DT; - LoopInfo *LI; - bool PreserveLCSSA; - - // The loop induction variable. - Instruction *IV; - - // Loop step amount. - int64_t Inc; - - // Loop reroll count; if Inc == 1, this records the scaling applied - // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; - // If Inc is not 1, Scale = Inc. 
- uint64_t Scale; - - // The roots themselves. - SmallVector<DAGRootSet,16> RootSets; - - // All increment instructions for IV. - SmallInstructionVector LoopIncs; - - // Map of all instructions in the loop (in order) to the iterations - // they are used in (or specially, IL_All for instructions - // used in the loop increment mechanism). - UsesTy Uses; - - // Map between induction variable and its increment - DenseMap<Instruction *, int64_t> &IVToIncMap; - - TinyInstructionVector LoopControlIVs; - }; - - // Check if it is a compare-like instruction whose user is a branch - bool isCompareUsedByBranch(Instruction *I) { - auto *TI = I->getParent()->getTerminator(); - if (!isa<BranchInst>(TI) || !isa<CmpInst>(I)) - return false; - return I->hasOneUse() && TI->getOperand(0) == I; - }; - - bool isLoopControlIV(Loop *L, Instruction *IV); - void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); - void collectPossibleReductions(Loop *L, - ReductionTracker &Reductions); - bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *BackedgeTakenCount, ReductionTracker &Reductions); - }; - -} // end anonymous namespace - -// Returns true if the provided instruction is used outside the given loop. -// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in -// non-loop blocks to be outside the loop. -static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) { - if (!L->contains(cast<Instruction>(U))) - return true; - } - return false; -} - -// Check if an IV is only used to control the loop. There are two cases: -// 1. It only has one use which is loop increment, and the increment is only -// used by comparison and the PHI (could has sext with nsw in between), and the -// comparison is only used by branch. -// 2. It is used by loop increment and the comparison, the loop increment is -// only used by the PHI, and the comparison is used only by the branch. -bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { - unsigned IVUses = IV->getNumUses(); - if (IVUses != 2 && IVUses != 1) - return false; - - for (auto *User : IV->users()) { - int32_t IncOrCmpUses = User->getNumUses(); - bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User)); - - // User can only have one or two uses. - if (IncOrCmpUses != 2 && IncOrCmpUses != 1) - return false; - - // Case 1 - if (IVUses == 1) { - // The only user must be the loop increment. - // The loop increment must have two uses. 
- if (IsCompInst || IncOrCmpUses != 2) - return false; - } - - // Case 2 - if (IVUses == 2 && IncOrCmpUses != 1) - return false; - - // The users of the IV must be a binary operation or a comparison - if (auto *BO = dyn_cast<BinaryOperator>(User)) { - if (BO->getOpcode() == Instruction::Add) { - // Loop Increment - // User of Loop Increment should be either PHI or CMP - for (auto *UU : User->users()) { - if (PHINode *PN = dyn_cast<PHINode>(UU)) { - if (PN != IV) - return false; - } - // Must be a CMP or an ext (of a value with nsw) then CMP - else { - auto *UUser = cast<Instruction>(UU); - // Skip SExt if we are extending an nsw value - // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser->hasOneUse() && - isa<SExtInst>(UUser)) - UUser = cast<Instruction>(*(UUser->user_begin())); - if (!isCompareUsedByBranch(UUser)) - return false; - } - } - } else - return false; - // Compare : can only have one use, and must be branch - } else if (!IsCompInst) - return false; - } - return true; -} - -// Collect the list of loop induction variables with respect to which it might -// be possible to reroll the loop. -void LoopReroll::collectPossibleIVs(Loop *L, - SmallInstructionVector &PossibleIVs) { - for (Instruction &IV : L->getHeader()->phis()) { - if (!IV.getType()->isIntegerTy() && !IV.getType()->isPointerTy()) - continue; - - if (const SCEVAddRecExpr *PHISCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&IV))) { - if (PHISCEV->getLoop() != L) - continue; - if (!PHISCEV->isAffine()) - continue; - const auto *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE)); - if (IncSCEV) { - IVToIncMap[&IV] = IncSCEV->getValue()->getSExtValue(); - LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << IV << " = " << *PHISCEV - << "\n"); - - if (isLoopControlIV(L, &IV)) { - LoopControlIVs.push_back(&IV); - LLVM_DEBUG(dbgs() << "LRR: Loop control only IV: " << IV - << " = " << *PHISCEV << "\n"); - } else - PossibleIVs.push_back(&IV); - } - } - } -} - -// Add the remainder of the reduction-variable chain to the instruction vector -// (the initial PHINode has already been added). If successful, the object is -// marked as valid. -void LoopReroll::SimpleLoopReduction::add(Loop *L) { - assert(!Valid && "Cannot add to an already-valid chain"); - - // The reduction variable must be a chain of single-use instructions - // (including the PHI), except for the last value (which is used by the PHI - // and also outside the loop). - Instruction *C = Instructions.front(); - if (C->user_empty()) - return; - - do { - C = cast<Instruction>(*C->user_begin()); - if (C->hasOneUse()) { - if (!C->isBinaryOp()) - return; - - if (!(isa<PHINode>(Instructions.back()) || - C->isSameOperationAs(Instructions.back()))) - return; - - Instructions.push_back(C); - } - } while (C->hasOneUse()); - - if (Instructions.size() < 2 || - !C->isSameOperationAs(Instructions.back()) || - C->use_empty()) - return; - - // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) { - // The only in-loop user can be the initial PHI. - if (L->contains(cast<Instruction>(U))) - if (cast<Instruction>(U) != Instructions.front()) - return; - } - - Instructions.push_back(C); - Valid = true; -} - -// Collect the vector of possible reduction variables. 
-void LoopReroll::collectPossibleReductions(Loop *L, - ReductionTracker &Reductions) { - BasicBlock *Header = L->getHeader(); - for (BasicBlock::iterator I = Header->begin(), - IE = Header->getFirstInsertionPt(); I != IE; ++I) { - if (!isa<PHINode>(I)) - continue; - if (!I->getType()->isSingleValueType()) - continue; - - SimpleLoopReduction SLR(&*I, L); - if (!SLR.valid()) - continue; - - LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " - << SLR.size() << " chained instructions)\n"); - Reductions.addSLR(SLR); - } -} - -// Collect the set of all users of the provided root instruction. This set of -// users contains not only the direct users of the root instruction, but also -// all users of those users, and so on. There are two exceptions: -// -// 1. Instructions in the set of excluded instructions are never added to the -// use set (even if they are users). This is used, for example, to exclude -// including root increments in the use set of the primary IV. -// -// 2. Instructions in the set of final instructions are added to the use set -// if they are users, but their users are not added. This is used, for -// example, to prevent a reduction update from forcing all later reduction -// updates into the use set. -void LoopReroll::DAGRootTracker::collectInLoopUserSet( - Instruction *Root, const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users) { - SmallInstructionVector Queue(1, Root); - while (!Queue.empty()) { - Instruction *I = Queue.pop_back_val(); - if (!Users.insert(I).second) - continue; - - if (!Final.count(I)) - for (Use &U : I->uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *PN = dyn_cast<PHINode>(User)) { - // Ignore "wrap-around" uses to PHIs of this loop's header. - if (PN->getIncomingBlock(U) == L->getHeader()) - continue; - } - - if (L->contains(User) && !Exclude.count(User)) { - Queue.push_back(User); - } - } - - // We also want to collect single-user "feeder" values. - for (Use &U : I->operands()) { - if (Instruction *Op = dyn_cast<Instruction>(U)) - if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && - !Final.count(Op)) - Queue.push_back(Op); - } - } -} - -// Collect all of the users of all of the provided root instructions (combined -// into a single set). -void LoopReroll::DAGRootTracker::collectInLoopUserSet( - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users) { - for (Instruction *Root : Roots) - collectInLoopUserSet(Root, Exclude, Final, Users); -} - -static bool isUnorderedLoadStore(Instruction *I) { - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->isUnordered(); - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->isUnordered(); - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) - return !MI->isVolatile(); - return false; -} - -/// Return true if IVU is a "simple" arithmetic operation. -/// This is used for narrowing the search space for DAGRoots; only arithmetic -/// and GEPs can be part of a DAGRoot. 
-static bool isSimpleArithmeticOp(User *IVU) { - if (Instruction *I = dyn_cast<Instruction>(IVU)) { - switch (I->getOpcode()) { - default: return false; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::Shl: - case Instruction::AShr: - case Instruction::LShr: - case Instruction::GetElementPtr: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - return true; - } - } - return false; -} - -static bool isLoopIncrement(User *U, Instruction *IV) { - BinaryOperator *BO = dyn_cast<BinaryOperator>(U); - - if ((BO && BO->getOpcode() != Instruction::Add) || - (!BO && !isa<GetElementPtrInst>(U))) - return false; - - for (auto *UU : U->users()) { - PHINode *PN = dyn_cast<PHINode>(UU); - if (PN && PN == IV) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker:: -collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { - SmallInstructionVector BaseUsers; - - for (auto *I : Base->users()) { - ConstantInt *CI = nullptr; - - if (isLoopIncrement(I, IV)) { - LoopIncs.push_back(cast<Instruction>(I)); - continue; - } - - // The root nodes must be either GEPs, ORs or ADDs. - if (auto *BO = dyn_cast<BinaryOperator>(I)) { - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Or) - CI = dyn_cast<ConstantInt>(BO->getOperand(1)); - } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { - Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); - CI = dyn_cast<ConstantInt>(LastOperand); - } - - if (!CI) { - if (Instruction *II = dyn_cast<Instruction>(I)) { - BaseUsers.push_back(II); - continue; - } else { - LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I - << "\n"); - return false; - } - } - - int64_t V = std::abs(CI->getValue().getSExtValue()); - if (Roots.find(V) != Roots.end()) - // No duplicates, please. - return false; - - Roots[V] = cast<Instruction>(I); - } - - // Make sure we have at least two roots. - if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty())) - return false; - - // If we found non-loop-inc, non-root users of Base, assume they are - // for the zeroth root index. This is because "add %a, 0" gets optimized - // away. - if (BaseUsers.size()) { - if (Roots.find(0) != Roots.end()) { - LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); - return false; - } - Roots[0] = Base; - } - - // Calculate the number of users of the base, or lowest indexed, iteration. - unsigned NumBaseUses = BaseUsers.size(); - if (NumBaseUses == 0) - NumBaseUses = Roots.begin()->second->getNumUses(); - - // Check that every node has the same number of users. - for (auto &KV : Roots) { - if (KV.first == 0) - continue; - if (!KV.second->hasNUses(NumBaseUses)) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " - << "#Base=" << NumBaseUses - << ", #Root=" << KV.second->getNumUses() << "\n"); - return false; - } - } - - return true; -} - -void LoopReroll::DAGRootTracker:: -findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { - // Does the user look like it could be part of a root set? - // All its users must be simple arithmetic ops. 
- if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1)) - return; - - if (I != IV && findRootsBase(I, SubsumedInsts)) - return; - - SubsumedInsts.insert(I); - - for (User *V : I->users()) { - Instruction *I = cast<Instruction>(V); - if (is_contained(LoopIncs, I)) - continue; - - if (!isSimpleArithmeticOp(I)) - continue; - - // The recursive call makes a copy of SubsumedInsts. - findRootsRecursive(I, SubsumedInsts); - } -} - -bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) { - if (DRS.Roots.empty()) - return false; - - // If the value of the base instruction is used outside the loop, we cannot - // reroll the loop. Check for other root instructions is unnecessary because - // they don't match any base instructions if their values are used outside. - if (hasUsesOutsideLoop(DRS.BaseInst, L)) - return false; - - // Consider a DAGRootSet with N-1 roots (so N different values including - // BaseInst). - // Define d = Roots[0] - BaseInst, which should be the same as - // Roots[I] - Roots[I-1] for all I in [1..N). - // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the - // loop iteration J. - // - // Now, For the loop iterations to be consecutive: - // D = d * N - const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); - if (!ADR) - return false; - - // Check that the first root is evenly spaced. - unsigned N = DRS.Roots.size() + 1; - const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR); - if (isa<SCEVCouldNotCompute>(StepSCEV) || StepSCEV->getType()->isPointerTy()) - return false; - const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); - if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) - return false; - - // Check that the remainling roots are evenly spaced. - for (unsigned i = 1; i < N - 1; ++i) { - const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]), - SE->getSCEV(DRS.Roots[i-1])); - if (NewStepSCEV != StepSCEV) - return false; - } - - return true; -} - -bool LoopReroll::DAGRootTracker:: -findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { - // The base of a RootSet must be an AddRec, so it can be erased. - const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU)); - if (!IVU_ADR || IVU_ADR->getLoop() != L) - return false; - - std::map<int64_t, Instruction*> V; - if (!collectPossibleRoots(IVU, V)) - return false; - - // If we didn't get a root for index zero, then IVU must be - // subsumed. - if (V.find(0) == V.end()) - SubsumedInsts.insert(IVU); - - // Partition the vector into monotonically increasing indexes. - DAGRootSet DRS; - DRS.BaseInst = nullptr; - - SmallVector<DAGRootSet, 16> PotentialRootSets; - - for (auto &KV : V) { - if (!DRS.BaseInst) { - DRS.BaseInst = KV.second; - DRS.SubsumedInsts = SubsumedInsts; - } else if (DRS.Roots.empty()) { - DRS.Roots.push_back(KV.second); - } else if (V.find(KV.first - 1) != V.end()) { - DRS.Roots.push_back(KV.second); - } else { - // Linear sequence terminated. - if (!validateRootSet(DRS)) - return false; - - // Construct a new DAGRootSet with the next sequence. 
- PotentialRootSets.push_back(DRS); - DRS.BaseInst = KV.second; - DRS.Roots.clear(); - } - } - - if (!validateRootSet(DRS)) - return false; - - PotentialRootSets.push_back(DRS); - - RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end()); - - return true; -} - -bool LoopReroll::DAGRootTracker::findRoots() { - Inc = IVToIncMap[IV]; - - assert(RootSets.empty() && "Unclean state!"); - if (std::abs(Inc) == 1) { - for (auto *IVU : IV->users()) { - if (isLoopIncrement(IVU, IV)) - LoopIncs.push_back(cast<Instruction>(IVU)); - } - findRootsRecursive(IV, SmallInstructionSet()); - LoopIncs.push_back(IV); - } else { - if (!findRootsBase(IV, SmallInstructionSet())) - return false; - } - - // Ensure all sets have the same size. - if (RootSets.empty()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); - return false; - } - for (auto &V : RootSets) { - if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { - LLVM_DEBUG( - dbgs() - << "LRR: Aborting because not all root sets have the same size\n"); - return false; - } - } - - Scale = RootSets[0].Roots.size() + 1; - - if (Scale > IL_MaxRerollIterations) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " - << "#Found=" << Scale - << ", #Max=" << IL_MaxRerollIterations << "\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale - << "\n"); - - return true; -} - -bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { - // Populate the MapVector with all instructions in the block, in order first, - // so we can iterate over the contents later in perfect order. - for (auto &I : *L->getHeader()) { - Uses[&I].resize(IL_End); - } - - SmallInstructionSet Exclude; - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - - for (auto &DRS : RootSets) { - DenseSet<Instruction*> VBase; - collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); - for (auto *I : VBase) { - Uses[I].set(0); - } - - unsigned Idx = 1; - for (auto *Root : DRS.Roots) { - DenseSet<Instruction*> V; - collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); - - // While we're here, check the use sets are the same size. - if (V.size() != VBase.size()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); - return false; - } - - for (auto *I : V) { - Uses[I].set(Idx); - } - ++Idx; - } - - // Make sure our subsumed instructions are remembered too. - for (auto *I : DRS.SubsumedInsts) { - Uses[I].set(IL_All); - } - } - - // Make sure the loop increments are also accounted for. - - Exclude.clear(); - for (auto &DRS : RootSets) { - Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); - Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); - Exclude.insert(DRS.BaseInst); - } - - DenseSet<Instruction*> V; - collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); - for (auto *I : V) { - if (I->mayHaveSideEffects()) { - LLVM_DEBUG(dbgs() << "LRR: Aborting - " - << "An instruction which does not belong to any root " - << "sets must not have side effects: " << *I); - return false; - } - Uses[I].set(IL_All); - } - - return true; -} - -/// Get the next instruction in "In" that is a member of set Val. -/// Start searching from StartI, and do not return anything in Exclude. -/// If StartI is not given, start from In.begin(). 
-LoopReroll::DAGRootTracker::UsesTy::iterator -LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, - const SmallInstructionSet &Exclude, - UsesTy::iterator *StartI) { - UsesTy::iterator I = StartI ? *StartI : In.begin(); - while (I != In.end() && (I->second.test(Val) == 0 || - Exclude.contains(I->first))) - ++I; - return I; -} - -bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (DRS.BaseInst == I) - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { - for (auto &DRS : RootSets) { - if (is_contained(DRS.Roots, I)) - return true; - } - return false; -} - -/// Return true if instruction I depends on any instruction between -/// Start and End. -bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, - UsesTy::iterator Start, - UsesTy::iterator End) { - for (auto *U : I->users()) { - for (auto It = Start; It != End; ++It) - if (U == It->first) - return true; - } - return false; -} - -static bool isIgnorableInst(const Instruction *I) { - if (isa<DbgInfoIntrinsic>(I)) - return true; - const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - if (!II) - return false; - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::annotation: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // TODO: the following intrinsics may also be allowed: - // lifetime_start, lifetime_end, invariant_start, invariant_end - return true; - } - return false; -} - -bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { - // We now need to check for equivalence of the use graph of each root with - // that of the primary induction variable (excluding the roots). Our goal - // here is not to solve the full graph isomorphism problem, but rather to - // catch common cases without a lot of work. As a result, we will assume - // that the relative order of the instructions in each unrolled iteration - // is the same (although we will not make an assumption about how the - // different iterations are intermixed). Note that while the order must be - // the same, the instructions may not be in the same basic block. - - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. - SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet; - SmallInstructionSet PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, - PossibleRedPHISet, PossibleRedLastSet); - - // Populate "Uses" with where each instruction is used. - if (!collectUsedInstructions(PossibleRedSet)) - return false; - - // Make sure we mark the reduction PHIs as used in all iterations. - for (auto *I : PossibleRedPHISet) { - Uses[I].set(IL_All); - } - - // Make sure we mark loop-control-only PHIs as used in all iterations. See - // comment above LoopReroll::isLoopControlIV for more information. 
- BasicBlock *Header = L->getHeader(); - for (Instruction *LoopControlIV : LoopControlIVs) { - for (auto *U : LoopControlIV->users()) { - Instruction *IVUser = dyn_cast<Instruction>(U); - // IVUser could be loop increment or compare - Uses[IVUser].set(IL_All); - for (auto *UU : IVUser->users()) { - Instruction *UUser = dyn_cast<Instruction>(UU); - // UUser could be compare, PHI or branch - Uses[UUser].set(IL_All); - // Skip SExt - if (isa<SExtInst>(UUser)) { - UUser = dyn_cast<Instruction>(*(UUser->user_begin())); - Uses[UUser].set(IL_All); - } - // Is UUser a compare instruction? - if (UU->hasOneUse()) { - Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin()); - if (BI == cast<BranchInst>(Header->getTerminator())) - Uses[BI].set(IL_All); - } - } - } - } - - // Make sure all instructions in the loop are in one and only one - // set. - for (auto &KV : Uses) { - if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { - LLVM_DEBUG( - dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " - << *KV.first << " (#uses=" << KV.second.count() << ")\n"); - return false; - } - } - - LLVM_DEBUG(for (auto &KV - : Uses) { - dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; - }); - - BatchAAResults BatchAA(*AA); - for (unsigned Iter = 1; Iter < Scale; ++Iter) { - // In addition to regular aliasing information, we need to look for - // instructions from later (future) iterations that have side effects - // preventing us from reordering them past other instructions with side - // effects. - bool FutureSideEffects = false; - AliasSetTracker AST(BatchAA); - // The map between instructions in f(%iv.(i+1)) and f(%iv). - DenseMap<Value *, Value *> BaseMap; - - // Compare iteration Iter to the base. - SmallInstructionSet Visited; - auto BaseIt = nextInstr(0, Uses, Visited); - auto RootIt = nextInstr(Iter, Uses, Visited); - auto LastRootIt = Uses.begin(); - - while (BaseIt != Uses.end() && RootIt != Uses.end()) { - Instruction *BaseInst = BaseIt->first; - Instruction *RootInst = RootIt->first; - - // Skip over the IV or root instructions; only match their users. - bool Continue = false; - if (isBaseInst(BaseInst)) { - Visited.insert(BaseInst); - BaseIt = nextInstr(0, Uses, Visited); - Continue = true; - } - if (isRootInst(RootInst)) { - LastRootIt = RootIt; - Visited.insert(RootInst); - RootIt = nextInstr(Iter, Uses, Visited); - Continue = true; - } - if (Continue) continue; - - if (!BaseInst->isSameOperationAs(RootInst)) { - // Last chance saloon. We don't try and solve the full isomorphism - // problem, but try and at least catch the case where two instructions - // *of different types* are round the wrong way. We won't be able to - // efficiently tell, given two ADD instructions, which way around we - // should match them, but given an ADD and a SUB, we can at least infer - // which one is which. - // - // This should allow us to deal with a greater subset of the isomorphism - // problem. It does however change a linear algorithm into a quadratic - // one, so limit the number of probes we do. - auto TryIt = RootIt; - unsigned N = NumToleratedFailedMatches; - while (TryIt != Uses.end() && - !BaseInst->isSameOperationAs(TryIt->first) && - N--) { - ++TryIt; - TryIt = nextInstr(Iter, Uses, Visited, &TryIt); - } - - if (TryIt == Uses.end() || TryIt == RootIt || - instrDependsOn(TryIt->first, RootIt, TryIt)) { - LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " - << *BaseInst << " vs. 
" << *RootInst << "\n"); - return false; - } - - RootIt = TryIt; - RootInst = TryIt->first; - } - - // All instructions between the last root and this root - // may belong to some other iteration. If they belong to a - // future iteration, then they're dangerous to alias with. - // - // Note that because we allow a limited amount of flexibility in the order - // that we visit nodes, LastRootIt might be *before* RootIt, in which - // case we've already checked this set of instructions so we shouldn't - // do anything. - for (; LastRootIt < RootIt; ++LastRootIt) { - Instruction *I = LastRootIt->first; - if (LastRootIt->second.find_first() < (int)Iter) - continue; - if (I->mayWriteToMemory()) - AST.add(I); - // Note: This is specifically guarded by a check on isa<PHINode>, - // which while a valid (somewhat arbitrary) micro-optimization, is - // needed because otherwise isSafeToSpeculativelyExecute returns - // false on PHI nodes. - if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) && - !isSafeToSpeculativelyExecute(I)) - // Intervening instructions cause side effects. - FutureSideEffects = true; - } - - // Make sure that this instruction, which is in the use set of this - // root instruction, does not also belong to the base set or the set of - // some other root instruction. - if (RootIt->second.count() > 1) { - LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst - << " vs. " << *RootInst << " (prev. case overlap)\n"); - return false; - } - - // Make sure that we don't alias with any instruction in the alias set - // tracker. If we do, then we depend on a future iteration, and we - // can't reroll. - if (RootInst->mayReadFromMemory()) { - for (auto &K : AST) { - if (isModOrRefSet(K.aliasesUnknownInst(RootInst, BatchAA))) { - LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " - << *BaseInst << " vs. " << *RootInst - << " (depends on future store)\n"); - return false; - } - } - } - - // If we've past an instruction from a future iteration that may have - // side effects, and this instruction might also, then we can't reorder - // them, and this matching fails. As an exception, we allow the alias - // set tracker to handle regular (unordered) load/store dependencies. - if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) && - !isSafeToSpeculativelyExecute(BaseInst)) || - (!isUnorderedLoadStore(RootInst) && - !isSafeToSpeculativelyExecute(RootInst)))) { - LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst - << " vs. " << *RootInst - << " (side effects prevent reordering)\n"); - return false; - } - - // For instructions that are part of a reduction, if the operation is - // associative, then don't bother matching the operands (because we - // already know that the instructions are isomorphic, and the order - // within the iteration does not matter). For non-associative reductions, - // we do need to match the operands, because we need to reject - // out-of-order instructions within an iteration! 
- // For example (assume floating-point addition), we need to reject this: - // x += a[i]; x += b[i]; - // x += a[i+1]; x += b[i+1]; - // x += b[i+2]; x += a[i+2]; - bool InReduction = Reductions.isPairInSame(BaseInst, RootInst); - - if (!(InReduction && BaseInst->isAssociative())) { - bool Swapped = false, SomeOpMatched = false; - for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) { - Value *Op2 = RootInst->getOperand(j); - - // If this is part of a reduction (and the operation is not - // associatve), then we match all operands, but not those that are - // part of the reduction. - if (InReduction) - if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) - if (Reductions.isPairInSame(RootInst, Op2I)) - continue; - - DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); - if (BMI != BaseMap.end()) { - Op2 = BMI->second; - } else { - for (auto &DRS : RootSets) { - if (DRS.Roots[Iter-1] == (Instruction*) Op2) { - Op2 = DRS.BaseInst; - break; - } - } - } - - if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) { - // If we've not already decided to swap the matched operands, and - // we've not already matched our first operand (note that we could - // have skipped matching the first operand because it is part of a - // reduction above), and the instruction is commutative, then try - // the swapped match. - if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && - BaseInst->getOperand(!j) == Op2) { - Swapped = true; - } else { - LLVM_DEBUG(dbgs() - << "LRR: iteration root match failed at " << *BaseInst - << " vs. " << *RootInst << " (operand " << j << ")\n"); - return false; - } - } - - SomeOpMatched = true; - } - } - - if ((!PossibleRedLastSet.count(BaseInst) && - hasUsesOutsideLoop(BaseInst, L)) || - (!PossibleRedLastSet.count(RootInst) && - hasUsesOutsideLoop(RootInst, L))) { - LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst - << " vs. " << *RootInst << " (uses outside loop)\n"); - return false; - } - - Reductions.recordPair(BaseInst, RootInst, Iter); - BaseMap.insert(std::make_pair(RootInst, BaseInst)); - - LastRootIt = RootIt; - Visited.insert(BaseInst); - Visited.insert(RootInst); - BaseIt = nextInstr(0, Uses, Visited); - RootIt = nextInstr(Iter, Uses, Visited); - } - assert(BaseIt == Uses.end() && RootIt == Uses.end() && - "Mismatched set sizes!"); - } - - LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV - << "\n"); - - return true; -} - -void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { - BasicBlock *Header = L->getHeader(); - - // Compute the start and increment for each BaseInst before we start erasing - // instructions. - SmallVector<const SCEV *, 8> StartExprs; - SmallVector<const SCEV *, 8> IncrExprs; - for (auto &DRS : RootSets) { - const SCEVAddRecExpr *IVSCEV = - cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); - StartExprs.push_back(IVSCEV->getStart()); - IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV)); - } - - // Remove instructions associated with non-base iterations. - for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) { - unsigned I = Uses[&Inst].find_first(); - if (I > 0 && I < IL_All) { - LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n"); - Inst.eraseFromParent(); - } - } - - // Rewrite each BaseInst using SCEV. - for (size_t i = 0, e = RootSets.size(); i != e; ++i) - // Insert the new induction variable. - replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]); - - { // Limit the lifetime of SCEVExpander. 
- BranchInst *BI = cast<BranchInst>(Header->getTerminator()); - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - auto Zero = SE->getZero(BackedgeTakenCount->getType()); - auto One = SE->getOne(BackedgeTakenCount->getType()); - auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap); - Value *NewIV = - Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - // FIXME: This arithmetic can overflow. - auto TripCount = SE->getAddExpr(BackedgeTakenCount, One); - auto ScaledTripCount = SE->getMulExpr( - TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale)); - auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One); - Value *TakenCount = - Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(), - Header->getFirstNonPHIOrDbg()); - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond"); - BI->setCondition(Cond); - - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); - } - - SimplifyInstructionsInBlock(Header, TLI); - DeleteDeadPHIs(Header, TLI); -} - -void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS, - const SCEV *Start, - const SCEV *IncrExpr) { - BasicBlock *Header = L->getHeader(); - Instruction *Inst = DRS.BaseInst; - - const SCEV *NewIVSCEV = - SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); - - { // Limit the lifetime of SCEVExpander. - const DataLayout &DL = Header->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(), - Header->getFirstNonPHIOrDbg()); - - for (auto &KV : Uses) - if (KV.second.find_first() == 0) - KV.first->replaceUsesOfWith(Inst, NewIV); - } -} - -// Validate the selected reductions. All iterations must have an isomorphic -// part of the reduction chain and, for non-associative reductions, the chain -// entries must appear in order. -bool LoopReroll::ReductionTracker::validateSelected() { - // For a non-associative reduction, the chain entries must appear in order. - for (int i : Reds) { - int PrevIter = 0, BaseCount = 0, Count = 0; - for (Instruction *J : PossibleReds[i]) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[J]; - if (Iter != PrevIter && Iter != PrevIter + 1 && - !PossibleReds[i].getReducedValue()->isAssociative()) { - LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " - << J << "\n"); - return false; - } - - if (Iter != PrevIter) { - if (Count != BaseCount) { - LLVM_DEBUG(dbgs() - << "LRR: Iteration " << PrevIter << " reduction use count " - << Count << " is not equal to the base use count " - << BaseCount << "\n"); - return false; - } - - Count = 0; - } - - ++Count; - if (Iter == 0) - ++BaseCount; - - PrevIter = Iter; - } - } - - return true; -} - -// For all selected reductions, remove all parts except those in the first -// iteration (and the PHI). Replace outside uses of the reduced value with uses -// of the first-iteration reduced value (in other words, reroll the selected -// reductions). -void LoopReroll::ReductionTracker::replaceSelected() { - // Fixup reductions to refer to the last instruction associated with the - // first iteration (not the last). 
- for (int i : Reds) { - int j = 0; - for (int e = PossibleReds[i].size(); j != e; ++j) - if (PossibleRedIter[PossibleReds[i][j]] != 0) { - --j; - break; - } - - // Replace users with the new end-of-chain value. - SmallInstructionVector Users; - for (User *U : PossibleReds[i].getReducedValue()->users()) { - Users.push_back(cast<Instruction>(U)); - } - - for (Instruction *User : Users) - User->replaceUsesOfWith(PossibleReds[i].getReducedValue(), - PossibleReds[i][j]); - } -} - -// Reroll the provided loop with respect to the provided induction variable. -// Generally, we're looking for a loop like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of -// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can -// be intermixed with eachother. The restriction imposed by this algorithm is -// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), -// etc. be the same. -// -// First, we collect the use set of %iv, excluding the other increment roots. -// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) -// times, having collected the use set of f(%iv.(i+1)), during which we: -// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to -// the next unmatched instruction in f(%iv.(i+1)). -// - Ensure that both matched instructions don't have any external users -// (with the exception of last-in-chain reduction instructions). -// - Track the (aliasing) write set, and other side effects, of all -// instructions that belong to future iterations that come before the matched -// instructions. If the matched instructions read from that write set, then -// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in -// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, -// if any of these future instructions had side effects (could not be -// speculatively executed), and so do the matched instructions, when we -// cannot reorder those side-effect-producing instructions, and rerolling -// fails. -// -// Finally, we make sure that all loop instructions are either loop increment -// roots, belong to simple latch code, parts of validated reductions, part of -// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions -// have been validated), then we reroll the loop. -bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *BackedgeTakenCount, - ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, - IVToIncMap, LoopControlIVs); - - if (!DAGRoots.findRoots()) - return false; - LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV - << "\n"); - - if (!DAGRoots.validate(Reductions)) - return false; - if (!Reductions.validateSelected()) - return false; - // At this point, we've validated the rerolling, and we're committed to - // making changes! 
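// Illustrative sketch only (invented names, not from the LLVM sources): a
// source-level view of the transformation that LoopReroll::reroll performs,
// per the large comment above reroll(), assuming an unroll factor (Scale) of
// 3 and a trip count n that is a multiple of 3.
static void unrolledByHand(int *a, const int *b, int n) {
  // Before rerolling: three isomorphic copies of the body, f(i), f(i+1) and
  // f(i+2), where i+1 and i+2 play the role of the "root increments".
  for (int i = 0; i < n; i += 3) {
    a[i]     += b[i];
    a[i + 1] += b[i + 1];
    a[i + 2] += b[i + 2];
  }
}
static void rerolled(int *a, const int *b, int n) {
  // After rerolling: a single copy of the body; the induction variable now
  // steps by 1, and the exit condition is rewritten against a trip count
  // scaled up by Scale (see DAGRootTracker::replace above).
  for (int i = 0; i < n; ++i)
    a[i] += b[i];
}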
- - Reductions.replaceSelected(); - DAGRoots.replace(BackedgeTakenCount); - - ++NumRerolledLoops; - return true; -} - -bool LoopReroll::runOnLoop(Loop *L) { - BasicBlock *Header = L->getHeader(); - LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %" - << Header->getName() << " (" << L->getNumBlocks() - << " block(s))\n"); - - // For now, we'll handle only single BB loops. - if (L->getNumBlocks() > 1) - return false; - - if (!SE->hasLoopInvariantBackedgeTakenCount(L)) - return false; - - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); - LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount - << "\n"); - - // First, we need to find the induction variable with respect to which we can - // reroll (there may be several possible options). - SmallInstructionVector PossibleIVs; - IVToIncMap.clear(); - LoopControlIVs.clear(); - collectPossibleIVs(L, PossibleIVs); - - if (PossibleIVs.empty()) { - LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n"); - return false; - } - - ReductionTracker Reductions; - collectPossibleReductions(L, Reductions); - bool Changed = false; - - // For each possible IV, collect the associated possible set of 'root' nodes - // (i+1, i+2, etc.). - for (Instruction *PossibleIV : PossibleIVs) - if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) { - Changed = true; - break; - } - LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); - - // Trip count of L has changed so SE must be re-evaluated. - if (Changed) - SE->forgetLoop(L); - - return Changed; -} - -PreservedAnalyses LoopRerollPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - return LoopReroll(&AR.AA, &AR.LI, &AR.SE, &AR.TLI, &AR.DT, true).runOnLoop(&L) - ? getLoopPassPreservedAnalyses() - : PreservedAnalyses::all(); -} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp index eee855058706..acb79e94d087 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -64,11 +64,12 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, // Vectorization requires loop-rotation. Use default threshold for loops the // user explicitly marked for vectorization, even when header duplication is // disabled. - int Threshold = EnableHeaderDuplication || - hasVectorizeTransformation(&L) == TM_ForcedByUser - ? DefaultRotationThreshold - : 0; - const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + int Threshold = + (EnableHeaderDuplication && !L.getHeader()->getParent()->hasMinSize()) || + hasVectorizeTransformation(&L) == TM_ForcedByUser + ? 
DefaultRotationThreshold + : 0; + const DataLayout &DL = L.getHeader()->getDataLayout(); const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL); std::optional<MemorySSAUpdater> MSSAU; @@ -89,79 +90,3 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve<MemorySSAAnalysis>(); return PA; } - -namespace { - -class LoopRotateLegacyPass : public LoopPass { - unsigned MaxHeaderSize; - bool PrepareForLTO; - -public: - static char ID; // Pass ID, replacement for typeid - LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1, - bool PrepareForLTO = false) - : LoopPass(ID), PrepareForLTO(PrepareForLTO) { - initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); - } - - // LCSSA form makes instruction renaming easier. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - getLoopAnalysisUsage(AU); - - // Lazy BFI and BPI are marked as preserved here so LoopRotate - // can remain part of the same loop pass manager as LICM. - AU.addPreserved<LazyBlockFrequencyInfoPass>(); - AU.addPreserved<LazyBranchProbabilityInfoPass>(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) - return false; - Function &F = *L->getHeader()->getParent(); - - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); - std::optional<MemorySSAUpdater> MSSAU; - // Not requiring MemorySSA and getting it only if available will split - // the loop pass pipeline when LoopRotate is being run first. - auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); - if (MSSAA) - MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); - // Vectorization requires loop-rotation. Use default threshold for loops the - // user explicitly marked for vectorization, even when header duplication is - // disabled. - int Threshold = hasVectorizeTransformation(L) == TM_ForcedByUser - ? DefaultRotationThreshold - : MaxHeaderSize; - - return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU ? 
&*MSSAU : nullptr, SQ, - false, Threshold, false, - PrepareForLTO || PrepareForLTOOption); - } -}; -} // end namespace - -char LoopRotateLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) -INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, - false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize, bool PrepareForLTO) { - return new LoopRotateLegacyPass(MaxHeaderSize, PrepareForLTO); -} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 028a487ecdbc..ae9103d0608a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -16,7 +16,6 @@ #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7ebc5da8b25a..11f9f7822a15 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -193,10 +193,18 @@ static cl::opt<cl::boolOrDefault> AllowTerminatingConditionFoldingAfterLSR( "lsr-term-fold", cl::Hidden, cl::desc("Attempt to replace primary IV with other IV.")); -static cl::opt<bool> AllowDropSolutionIfLessProfitable( - "lsr-drop-solution", cl::Hidden, cl::init(false), +static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable( + "lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable")); +static cl::opt<bool> EnableVScaleImmediates( + "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), + cl::desc("Enable analysis of vscale-relative immediates in LSR")); + +static cl::opt<bool> DropScaledForVScale( + "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), + cl::desc("Avoid using scaled registers with vscale-relative addressing")); + STATISTIC(NumTermFold, "Number of terminating condition fold recognized and performed"); @@ -247,6 +255,126 @@ public: void dump() const; }; +// An offset from an address that is either scalable or fixed. Used for +// per-target optimizations of addressing modes. 
+class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> { + constexpr Immediate(ScalarTy MinVal, bool Scalable) + : FixedOrScalableQuantity(MinVal, Scalable) {} + + constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V) + : FixedOrScalableQuantity(V) {} + +public: + constexpr Immediate() = delete; + + static constexpr Immediate getFixed(ScalarTy MinVal) { + return {MinVal, false}; + } + static constexpr Immediate getScalable(ScalarTy MinVal) { + return {MinVal, true}; + } + static constexpr Immediate get(ScalarTy MinVal, bool Scalable) { + return {MinVal, Scalable}; + } + static constexpr Immediate getZero() { return {0, false}; } + static constexpr Immediate getFixedMin() { + return {std::numeric_limits<int64_t>::min(), false}; + } + static constexpr Immediate getFixedMax() { + return {std::numeric_limits<int64_t>::max(), false}; + } + static constexpr Immediate getScalableMin() { + return {std::numeric_limits<int64_t>::min(), true}; + } + static constexpr Immediate getScalableMax() { + return {std::numeric_limits<int64_t>::max(), true}; + } + + constexpr bool isLessThanZero() const { return Quantity < 0; } + + constexpr bool isGreaterThanZero() const { return Quantity > 0; } + + constexpr bool isCompatibleImmediate(const Immediate &Imm) const { + return isZero() || Imm.isZero() || Imm.Scalable == Scalable; + } + + constexpr bool isMin() const { + return Quantity == std::numeric_limits<ScalarTy>::min(); + } + + constexpr bool isMax() const { + return Quantity == std::numeric_limits<ScalarTy>::max(); + } + + // Arithmetic 'operators' that cast to unsigned types first. + constexpr Immediate addUnsigned(const Immediate &RHS) const { + assert(isCompatibleImmediate(RHS) && "Incompatible Immediates"); + ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue(); + return {Value, Scalable || RHS.isScalable()}; + } + + constexpr Immediate subUnsigned(const Immediate &RHS) const { + assert(isCompatibleImmediate(RHS) && "Incompatible Immediates"); + ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue(); + return {Value, Scalable || RHS.isScalable()}; + } + + // Scale the quantity by a constant without caring about runtime scalability. + constexpr Immediate mulUnsigned(const ScalarTy RHS) const { + ScalarTy Value = (uint64_t)Quantity * RHS; + return {Value, Scalable}; + } + + // Helpers for generating SCEVs with vscale terms where needed. + const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *S = SE.getConstant(Ty, Quantity); + if (Scalable) + S = SE.getMulExpr(S, SE.getVScale(S->getType())); + return S; + } + + const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity); + if (Scalable) + NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType())); + return NegS; + } + + const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity)); + if (Scalable) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); + return SU; + } +}; + +// This is needed for the Compare type of std::map when Immediate is used +// as a key. We don't need it to be fully correct against any value of vscale, +// just to make sure that vscale-related terms in the map are considered against +// each other rather than being mixed up and potentially missing opportunities. 
+struct KeyOrderTargetImmediate { + bool operator()(const Immediate &LHS, const Immediate &RHS) const { + if (LHS.isScalable() && !RHS.isScalable()) + return false; + if (!LHS.isScalable() && RHS.isScalable()) + return true; + return LHS.getKnownMinValue() < RHS.getKnownMinValue(); + } +}; + +// This would be nicer if we could be generic instead of directly using size_t, +// but there doesn't seem to be a type trait for is_orderable or +// is_lessthan_comparable or similar. +struct KeyOrderSizeTAndImmediate { + bool operator()(const std::pair<size_t, Immediate> &LHS, + const std::pair<size_t, Immediate> &RHS) const { + size_t LSize = LHS.first; + size_t RSize = RHS.first; + if (LSize != RSize) + return LSize < RSize; + return KeyOrderTargetImmediate()(LHS.second, RHS.second); + } +}; } // end anonymous namespace #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -357,7 +485,7 @@ struct Formula { GlobalValue *BaseGV = nullptr; /// Base offset for complex addressing. - int64_t BaseOffset = 0; + Immediate BaseOffset = Immediate::getZero(); /// Whether any complex addressing has a base register. bool HasBaseReg = false; @@ -388,7 +516,7 @@ struct Formula { /// An additional constant offset which added near the use. This requires a /// temporary register, but the offset itself can live in an add immediate /// field rather than a register. - int64_t UnfoldedOffset = 0; + Immediate UnfoldedOffset = Immediate::getZero(); Formula() = default; @@ -628,7 +756,7 @@ void Formula::print(raw_ostream &OS) const { if (!First) OS << " + "; else First = false; BaseGV->printAsOperand(OS, /*PrintType=*/false); } - if (BaseOffset != 0) { + if (BaseOffset.isNonZero()) { if (!First) OS << " + "; else First = false; OS << BaseOffset; } @@ -652,7 +780,7 @@ void Formula::print(raw_ostream &OS) const { OS << "<unknown>"; OS << ')'; } - if (UnfoldedOffset != 0) { + if (UnfoldedOffset.isNonZero()) { if (!First) OS << " + "; OS << "imm(" << UnfoldedOffset << ')'; } @@ -798,28 +926,34 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, /// If S involves the addition of a constant integer value, return that integer /// value, and mutate S to point to a new SCEV with that value excluded. 
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { +static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { if (C->getAPInt().getSignificantBits() <= 64) { S = SE.getConstant(C->getType(), 0); - return C->getValue()->getSExtValue(); + return Immediate::getFixed(C->getValue()->getSExtValue()); } } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(Add->operands()); - int64_t Result = ExtractImmediate(NewOps.front(), SE); - if (Result != 0) + Immediate Result = ExtractImmediate(NewOps.front(), SE); + if (Result.isNonZero()) S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(AR->operands()); - int64_t Result = ExtractImmediate(NewOps.front(), SE); - if (Result != 0) + Immediate Result = ExtractImmediate(NewOps.front(), SE); + if (Result.isNonZero()) S = SE.getAddRecExpr(NewOps, AR->getLoop(), // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) SCEV::FlagAnyWrap); return Result; - } - return 0; + } else if (EnableVScaleImmediates) + if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0))) + if (isa<SCEVVScale>(M->getOperand(1))) { + S = SE.getConstant(M->getType(), 0); + return Immediate::getScalable(C->getValue()->getSExtValue()); + } + return Immediate::getZero(); } /// If S involves the addition of a GlobalValue address, return that symbol, and @@ -1134,7 +1268,7 @@ struct LSRFixup { /// A constant offset to be added to the LSRUse expression. This allows /// multiple fixups to share the same LSRUse with different offsets, for /// example in an unrolled loop. - int64_t Offset = 0; + Immediate Offset = Immediate::getZero(); LSRFixup() = default; @@ -1197,8 +1331,8 @@ public: SmallVector<LSRFixup, 8> Fixups; /// Keep track of the min and max offsets of the fixups. - int64_t MinOffset = std::numeric_limits<int64_t>::max(); - int64_t MaxOffset = std::numeric_limits<int64_t>::min(); + Immediate MinOffset = Immediate::getFixedMax(); + Immediate MaxOffset = Immediate::getFixedMin(); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. @@ -1234,9 +1368,9 @@ public: void pushFixup(LSRFixup &f) { Fixups.push_back(f); - if (f.Offset > MaxOffset) + if (Immediate::isKnownGT(f.Offset, MaxOffset)) MaxOffset = f.Offset; - if (f.Offset < MinOffset) + if (Immediate::isKnownLT(f.Offset, MinOffset)) MinOffset = f.Offset; } @@ -1254,7 +1388,7 @@ public: static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, Instruction *Fixup = nullptr); @@ -1308,9 +1442,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // If the step size matches the base offset, we could use pre-indexed // addressing. - if (AMK == TTI::AMK_PreIndexed) { + if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) { if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE))) - if (Step->getAPInt() == F.BaseOffset) + if (Step->getAPInt() == F.BaseOffset.getFixedValue()) LoopCost = 0; } else if (AMK == TTI::AMK_PostIndexed) { const SCEV *LoopStep = AR->getStepRecurrence(*SE); @@ -1401,27 +1535,32 @@ void Cost::RateFormula(const Formula &F, // allows to fold 2 registers. 
C.NumBaseAdds += NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F))); - C.NumBaseAdds += (F.UnfoldedOffset != 0); + C.NumBaseAdds += (F.UnfoldedOffset.isNonZero()); // Accumulate non-free scaling amounts. C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue(); // Tally up the non-zero immediates. for (const LSRFixup &Fixup : LU.Fixups) { - int64_t O = Fixup.Offset; - int64_t Offset = (uint64_t)O + F.BaseOffset; - if (F.BaseGV) - C.ImmCost += 64; // Handle symbolic values conservatively. - // TODO: This should probably be the pointer size. - else if (Offset != 0) - C.ImmCost += APInt(64, Offset, true).getSignificantBits(); - - // Check with target if this offset with this instruction is - // specifically not supported. - if (LU.Kind == LSRUse::Address && Offset != 0 && - !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, - Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) - C.NumBaseAdds++; + if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) { + Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset); + if (F.BaseGV) + C.ImmCost += 64; // Handle symbolic values conservatively. + // TODO: This should probably be the pointer size. + else if (Offset.isNonZero()) + C.ImmCost += + APInt(64, Offset.getKnownMinValue(), true).getSignificantBits(); + + // Check with target if this offset with this instruction is + // specifically not supported. + if (LU.Kind == LSRUse::Address && Offset.isNonZero() && + !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) + C.NumBaseAdds++; + } else { + // Incompatible immediate type, increase cost to avoid using + C.ImmCost += 2048; + } } // If we don't count instruction cost exit here. @@ -1546,7 +1685,7 @@ void LSRFixup::print(raw_ostream &OS) const { PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false); } - if (Offset != 0) + if (Offset.isNonZero()) OS << ", Offset=" << Offset; } @@ -1673,14 +1812,19 @@ LLVM_DUMP_METHOD void LSRUse::dump() const { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, - Instruction *Fixup/*= nullptr*/) { + Instruction *Fixup /* = nullptr */) { switch (Kind) { - case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, - HasBaseReg, Scale, AccessTy.AddrSpace, Fixup); - + case LSRUse::Address: { + int64_t FixedOffset = + BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue(); + int64_t ScalableOffset = + BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0; + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset, + HasBaseReg, Scale, AccessTy.AddrSpace, + Fixup, ScalableOffset); + } case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. @@ -1688,7 +1832,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, return false; // ICmp only has two operands; don't allow more than two non-trivial parts. - if (Scale != 0 && HasBaseReg && BaseOffset != 0) + if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero()) return false; // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by @@ -1698,7 +1842,12 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. 
- if (BaseOffset != 0) { + if (BaseOffset.isNonZero()) { + // We don't have an interface to query whether the target supports + // icmpzero against scalable quantities yet. + if (BaseOffset.isScalable()) + return false; + // We have one of: // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset @@ -1706,8 +1855,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, if (Scale == 0) // The cast does the right thing with // std::numeric_limits<int64_t>::min(). - BaseOffset = -(uint64_t)BaseOffset; - return TTI.isLegalICmpImmediate(BaseOffset); + BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue()); + return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue()); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1715,30 +1864,35 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, case LSRUse::Basic: // Only handle single-register values. - return !BaseGV && Scale == 0 && BaseOffset == 0; + return !BaseGV && Scale == 0 && BaseOffset.isZero(); case LSRUse::Special: // Special case Basic to handle -1 scales. - return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; + return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero(); } llvm_unreachable("Invalid LSRUse Kind!"); } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { + if (BaseOffset.isNonZero() && + (BaseOffset.isScalable() != MinOffset.isScalable() || + BaseOffset.isScalable() != MaxOffset.isScalable())) + return false; // Check for overflow. - if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != - (MinOffset > 0)) + int64_t Base = BaseOffset.getKnownMinValue(); + int64_t Min = MinOffset.getKnownMinValue(); + int64_t Max = MaxOffset.getKnownMinValue(); + if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0)) return false; - MinOffset = (uint64_t)BaseOffset + MinOffset; - if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != - (MaxOffset > 0)) + MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable()); + if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0)) return false; - MaxOffset = (uint64_t)BaseOffset + MaxOffset; + MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable()); return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, Scale) && @@ -1747,7 +1901,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F, const Loop &L) { // For the purpose of isAMCompletelyFolded either having a canonical formula @@ -1763,10 +1917,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } /// Test whether we know how to expand the current formula. 
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, - int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { + Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1777,13 +1931,21 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, BaseGV, BaseOffset, true, 0)); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegalAddImmediate(const TargetTransformInfo &TTI, + Immediate Offset) { + if (Offset.isScalable()) + return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue()); + + return TTI.isLegalAddImmediate(Offset.getFixedValue()); +} + static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { // Target may want to look at the user instructions. @@ -1816,12 +1978,20 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. + int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0; + if (F.BaseOffset.isScalable()) { + ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue(); + ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue(); + } else { + FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue(); + FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue(); + } InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() && "Legal addressing mode has an illegal cost!"); @@ -1840,10 +2010,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; // Conservatively, create an address with an immediate and a // base and a scale. @@ -1856,13 +2027,22 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } + // FIXME: Try with + without a scale? Maybe based on TTI? 
+ // I think basereg + scaledreg + immediateoffset isn't a good 'conservative' + // default for many architectures, not just AArch64 SVE. More investigation + // needed later to determine if this should be used more widely than just + // on scalable types. + if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero && + AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale) + Scale = 0; + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - ScalarEvolution &SE, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, + ScalarEvolution &SE, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const SCEV *S, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1870,14 +2050,18 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // Conservatively, create an address with an immediate and a // base and a scale. - int64_t BaseOffset = ExtractImmediate(S, SE); + Immediate BaseOffset = ExtractImmediate(S, SE); GlobalValue *BaseGV = ExtractSymbol(S, SE); // If there's anything else involved, it's not foldable. if (!S->isZero()) return false; // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; + + if (BaseOffset.isScalable()) + return false; // Conservatively, create an address with an immediate and a // base and a scale. @@ -2026,11 +2210,11 @@ class LSRInstance { using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>; UseMapTy UseMap; - bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, - MemAccessTy AccessTy); + std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -2056,7 +2240,7 @@ class LSRInstance { void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl<int64_t> &Worklist, + const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -2215,17 +2399,20 @@ void LSRInstance::OptimizeShadowIV() { // Ignore negative constants, as the code below doesn't handle them // correctly. TODO: Remove this restriction. - if (!C->getValue().isStrictlyPositive()) continue; + if (!C->getValue().isStrictlyPositive()) + continue; /* Add new PHINode. */ - PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); + PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator()); + NewPH->setDebugLoc(PH->getDebugLoc()); /* create new increment. '++d' in above example. */ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); - BinaryOperator *NewIncr = - BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? - Instruction::FAdd : Instruction::FSub, - NewPH, CFP, "IV.S.next.", Incr); + BinaryOperator *NewIncr = BinaryOperator::Create( + Incr->getOpcode() == Instruction::Add ? 
Instruction::FAdd + : Instruction::FSub, + NewPH, CFP, "IV.S.next.", Incr->getIterator()); + NewIncr->setDebugLoc(Incr->getDebugLoc()); NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); @@ -2395,8 +2582,8 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Ok, everything looks ok to change the condition into an SLT or SGE and // delete the max calculation. - ICmpInst *NewCond = - new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); + ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred, + Cond->getOperand(0), NewRHS, "scmp"); // Delete the max calculation instructions. NewCond->setDebugLoc(Cond->getDebugLoc()); @@ -2563,11 +2750,11 @@ LSRInstance::OptimizeLoopTermCond() { /// Determine if the given use can accommodate a fixup at the given offset and /// other details. If so, update the use and return true. -bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, +bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy) { - int64_t NewMinOffset = LU.MinOffset; - int64_t NewMaxOffset = LU.MaxOffset; + Immediate NewMinOffset = LU.MinOffset; + Immediate NewMaxOffset = LU.MaxOffset; MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to @@ -2587,18 +2774,25 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, } // Conservatively assume HasBaseReg is true for now. - if (NewOffset < LU.MinOffset) { + if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; - } else if (NewOffset > LU.MaxOffset) { + } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } + // FIXME: We should be able to handle some level of scalable offset support + // for 'void', but in order to get basic support up and running this is + // being left out. + if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() && + (NewMinOffset.isScalable() || NewMaxOffset.isScalable())) + return false; + // Update the use. LU.MinOffset = NewMinOffset; LU.MaxOffset = NewMaxOffset; @@ -2609,17 +2803,17 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, /// Return an LSRUse index and an offset value for a fixup which needs the given /// expression, with the given kind and optional access type. Either reuse an /// existing use or create a new one, as needed. -std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - MemAccessTy AccessTy) { +std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; - int64_t Offset = ExtractImmediate(Expr, SE); + Immediate Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; - Offset = 0; + Offset = Immediate::getFixed(0); } std::pair<UseMapTy::iterator, bool> P = @@ -2680,7 +2874,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, F.BaseGV == OrigF.BaseGV && F.Scale == OrigF.Scale && F.UnfoldedOffset == OrigF.UnfoldedOffset) { - if (F.BaseOffset == 0) + if (F.BaseOffset.isZero()) return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we @@ -3162,14 +3356,27 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); - if (!IncConst || !isAddressUse(TTI, UserInst, Operand)) - return false; + Immediate IncOffset = Immediate::getZero(); + if (IncConst) { + if (IncConst && IncConst->getAPInt().getSignificantBits() > 64) + return false; + IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue()); + } else { + // Look for mul(vscale, constant), to detect a scalable offset. + auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr); + if (!IncVScale || IncVScale->getNumOperands() != 2 || + !isa<SCEVVScale>(IncVScale->getOperand(1))) + return false; + auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0)); + if (!Scale || Scale->getType()->getScalarSizeInBits() > 64) + return false; + IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue()); + } - if (IncConst->getAPInt().getSignificantBits() > 64) + if (!isAddressUse(TTI, UserInst, Operand)) return false; MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand); - int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, IncOffset, /*HasBaseReg=*/false)) return false; @@ -3217,6 +3424,10 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); const SCEV *LeftOverExpr = nullptr; + const SCEV *Accum = SE.getZero(IntTy); + SmallVector<std::pair<const SCEV *, Value *>> Bases; + Bases.emplace_back(Accum, IVSrc); + for (const IVInc &Inc : Chain) { Instruction *InsertPt = Inc.UserInst; if (isa<PHINode>(InsertPt)) @@ -3229,10 +3440,31 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, // IncExpr was the result of subtraction of two narrow values, so must // be signed. const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy); + Accum = SE.getAddExpr(Accum, IncExpr); LeftOverExpr = LeftOverExpr ? SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; } - if (LeftOverExpr && !LeftOverExpr->isZero()) { + + // Look through each base to see if any can produce a nice addressing mode. + bool FoundBase = false; + for (auto [MapScev, MapIVOper] : reverse(Bases)) { + const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev); + if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) { + if (!Remainder->isZero()) { + Rewriter.clearPostInc(); + Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt); + const SCEV *IVOperExpr = + SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV)); + IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); + } else { + IVOper = MapIVOper; + } + + FoundBase = true; + break; + } + } + if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) { // Expand the IV increment. 
Rewriter.clearPostInc(); Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); @@ -3243,6 +3475,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, // If an IV increment can't be folded, use it as the next IV value. if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); + Bases.emplace_back(Accum, IVOper); IVSrc = IVOper; LeftOverExpr = nullptr; } @@ -3377,9 +3610,9 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { } // Get or create an LSRUse. - std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); + std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; // Record the fixup. @@ -3569,10 +3802,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; } - std::pair<size_t, int64_t> P = getUse( - S, LSRUse::Basic, MemAccessTy()); + std::pair<size_t, Immediate> P = + getUse(S, LSRUse::Basic, MemAccessTy()); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; LSRFixup &LF = LU.getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); @@ -3728,13 +3961,17 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, continue; Formula F = Base; + if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable()) + continue; + // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + InnerSumSC->getValue()->getZExtValue()); if (IsScaledReg) F.ScaledReg = nullptr; else @@ -3747,10 +3984,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + SC->getValue()->getZExtValue())) F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + SC->getValue()->getZExtValue()); else F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the @@ -3791,7 +4029,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. if (Base.BaseRegs.size() + (Base.Scale == 1) + - (Base.UnfoldedOffset != 0) <= 1) + (Base.UnfoldedOffset.isNonZero()) <= + 1) return; // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before @@ -3840,11 +4079,11 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // If we have an unfolded offset, generate a formula combining it with the // registers collected. 
- if (NewBase.UnfoldedOffset) { + if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) { assert(CombinedIntegerType && "Missing a type for the unfolded offset"); - Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, - true)); - NewBase.UnfoldedOffset = 0; + Ops.push_back(SE.getConstant(CombinedIntegerType, + NewBase.UnfoldedOffset.getFixedValue(), true)); + NewBase.UnfoldedOffset = Immediate::getFixed(0); GenerateFormula(SE.getAddExpr(Ops)); } } @@ -3884,15 +4123,18 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, /// Helper function for LSRInstance::GenerateConstantOffsets. void LSRInstance::GenerateConstantOffsetsImpl( LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { + const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) { - auto GenerateOffset = [&](const SCEV *G, int64_t Offset) { + auto GenerateOffset = [&](const SCEV *G, Immediate Offset) { Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; + if (!Base.BaseOffset.isCompatibleImmediate(Offset)) + return; + F.BaseOffset = Base.BaseOffset.subUnsigned(Offset); if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G); + const SCEV *NewOffset = Offset.getSCEV(SE, G->getType()); + const SCEV *NewG = SE.getAddExpr(NewOffset, G); // If it cancelled out, drop the base register, otherwise update it. if (NewG->isZero()) { if (IsScaledReg) { @@ -3928,21 +4170,24 @@ void LSRInstance::GenerateConstantOffsetsImpl( int64_t Step = StepInt.isNegative() ? StepInt.getSExtValue() : StepInt.getZExtValue(); - for (int64_t Offset : Worklist) { - Offset -= Step; - GenerateOffset(G, Offset); + for (Immediate Offset : Worklist) { + if (Offset.isFixed()) { + Offset = Immediate::getFixed(Offset.getFixedValue() - Step); + GenerateOffset(G, Offset); + } } } } } - for (int64_t Offset : Worklist) + for (Immediate Offset : Worklist) GenerateOffset(G, Offset); - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) + Immediate Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm.isZero() || + !Base.BaseOffset.isCompatibleImmediate(Imm)) return; Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + F.BaseOffset = F.BaseOffset.addUnsigned(Imm); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; if (IsScaledReg) { @@ -3961,7 +4206,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: For now, just add the min and max offset, because it usually isn't // worthwhile looking at everything inbetween. - SmallVector<int64_t, 2> Worklist; + SmallVector<Immediate, 2> Worklist; Worklist.push_back(LU.MinOffset); if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); @@ -4001,27 +4246,31 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (!ConstantInt::isValueValidForType(IntTy, Factor)) continue; // Check that the multiplication doesn't overflow. - if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1) + if (Base.BaseOffset.isMin() && Factor == -1) + continue; + // Not supporting scalable immediates. 
+ if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable()) continue; - int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; + Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor); assert(Factor != 0 && "Zero factor not expected!"); - if (NewBaseOffset / Factor != Base.BaseOffset) + if (NewBaseOffset.getFixedValue() / Factor != + Base.BaseOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, NewBaseOffset)) + !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue())) continue; // Check that multiplying with the use offset doesn't overflow. - int64_t Offset = LU.MinOffset; - if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1) + Immediate Offset = LU.MinOffset; + if (Offset.isMin() && Factor == -1) continue; - Offset = (uint64_t)Offset * Factor; - if (Offset / Factor != LU.MinOffset) + Offset = Offset.mulUnsigned(Factor); + if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, Offset)) + !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue())) continue; Formula F = Base; @@ -4032,7 +4281,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; // Compensate for the use having MinOffset built into it. - F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; + F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset); const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -4051,16 +4300,16 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } // Check that multiplying with the unfolded offset doesn't overflow. - if (F.UnfoldedOffset != 0) { - if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() && - Factor == -1) + if (F.UnfoldedOffset.isNonZero()) { + if (F.UnfoldedOffset.isMin() && Factor == -1) continue; - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; - if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor); + if (F.UnfoldedOffset.getFixedValue() / Factor != + Base.UnfoldedOffset.getFixedValue()) continue; // If the offset will be truncated, check that it is in bounds. - if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset)) + if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType( + IntTy, F.UnfoldedOffset.getFixedValue())) continue; } @@ -4103,8 +4352,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } // For an ICmpZero, negating a solitary base register won't lead to // new solutions. - if (LU.Kind == LSRUse::ICmpZero && - !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) + if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg && + Base.BaseOffset.isZero() && !Base.BaseGV) continue; // For each addrec base reg, if its loop is current loop, apply the scale. for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { @@ -4230,10 +4479,10 @@ namespace { /// structures moving underneath it. 
struct WorkItem { size_t LUIdx; - int64_t Imm; + Immediate Imm; const SCEV *OrigReg; - WorkItem(size_t LI, int64_t I, const SCEV *R) + WorkItem(size_t LI, Immediate I, const SCEV *R) : LUIdx(LI), Imm(I), OrigReg(R) {} void print(raw_ostream &OS) const; @@ -4257,14 +4506,14 @@ LLVM_DUMP_METHOD void WorkItem::dump() const { /// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. - using ImmMapTy = std::map<int64_t, const SCEV *>; + using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>; DenseMap<const SCEV *, ImmMapTy> Map; DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; SmallVector<const SCEV *, 8> Sequence; for (const SCEV *Use : RegUses) { const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify. - int64_t Imm = ExtractImmediate(Reg, SE); + Immediate Imm = ExtractImmediate(Reg, SE); auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy())); if (Pair.second) Sequence.push_back(Reg); @@ -4276,7 +4525,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // a list of work to do and do the work in a separate step so that we're // not adding formulae and register counts while we're searching. SmallVector<WorkItem, 32> WorkItems; - SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; + SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate> + UniqueItems; for (const SCEV *Reg : Sequence) { const ImmMapTy &Imms = Map.find(Reg)->second; @@ -4295,7 +4545,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J != JE; ++J) { const SCEV *OrigReg = J->second; - int64_t JImm = J->first; + Immediate JImm = J->first; const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); if (!isa<SCEVConstant>(OrigReg) && @@ -4307,22 +4557,34 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. - int64_t First = Imms.begin()->first; - int64_t Last = std::prev(Imms.end())->first; + Immediate First = Imms.begin()->first; + Immediate Last = std::prev(Imms.end())->first; + if (!First.isCompatibleImmediate(Last)) { + LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg + << "\n"); + continue; + } + // Only scalable if both terms are scalable, or if one is scalable and + // the other is 0. + bool Scalable = First.isScalable() || Last.isScalable(); + int64_t FI = First.getKnownMinValue(); + int64_t LI = Last.getKnownMinValue(); // Compute (First + Last) / 2 without overflow using the fact that // First + Last = 2 * (First + Last) + (First ^ Last). - int64_t Avg = (First & Last) + ((First ^ Last) >> 1); - // If the result is negative and First is odd and Last even (or vice versa), + int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1); + // If the result is negative and FI is odd and LI even (or vice versa), // we rounded towards -inf. Add 1 in that case, to round towards 0. - Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63)); + Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63)); ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), std::prev(Imms.end()), - Imms.lower_bound(Avg)}; + Imms.lower_bound(Immediate::get(Avg, Scalable))}; for (const auto &M : OtherImms) { if (M == J || M == JE) continue; + if (!JImm.isCompatibleImmediate(M->first)) + continue; // Compute the difference between the two. 
- int64_t Imm = (uint64_t)JImm - M->first; + Immediate Imm = JImm.subUnsigned(M->first); for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) @@ -4340,11 +4602,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { for (const WorkItem &WI : WorkItems) { size_t LUIdx = WI.LUIdx; LSRUse &LU = Uses[LUIdx]; - int64_t Imm = WI.Imm; + Immediate Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); - const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -4357,10 +4619,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; + if (!F.BaseOffset.isCompatibleImmediate(Imm)) + continue; + Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale)); // Don't create 50 + reg(-50). - if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset)))) + const SCEV *S = Offset.getNegativeSCEV(SE, IntTy); + if (F.referencesReg(S)) continue; Formula NewF = F; NewF.BaseOffset = Offset; @@ -4372,11 +4636,18 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // If the new scale is a constant in a register, and adding the constant // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. - if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) - if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) { + // FIXME: Do we need to do something for scalable immediates here? + // A scalable SCEV won't be constant, but we might still have + // something in the offset? Bail out for now to be safe. + if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable()) + continue; + if (C->getValue()->isNegative() != + (NewF.BaseOffset.isLessThanZero()) && (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + .ule(std::abs(NewF.BaseOffset.getFixedValue()))) continue; + } // OK, looks good. 
NewF.canonicalize(*this->L); @@ -4388,16 +4659,21 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; + if (!NewF.BaseOffset.isCompatibleImmediate(Imm) || + !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) || + !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset)) + continue; + NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, NewF)) { if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) continue; - if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm); + if (!isLegalAddImmediate(TTI, NewUnfoldedOffset)) continue; NewF = F; - NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + NewF.UnfoldedOffset = NewUnfoldedOffset; } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); @@ -4405,13 +4681,18 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // constant value to the immediate would produce a value closer to // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) - if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) - if ((C->getAPInt() + NewF.BaseOffset) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) { + if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable()) + goto skip_formula; + if ((C->getAPInt() + NewF.BaseOffset.getFixedValue()) .abs() - .slt(std::abs(NewF.BaseOffset)) && - (C->getAPInt() + NewF.BaseOffset).countr_zero() >= - (unsigned)llvm::countr_zero<uint64_t>(NewF.BaseOffset)) + .slt(std::abs(NewF.BaseOffset.getFixedValue())) && + (C->getAPInt() + NewF.BaseOffset.getFixedValue()) + .countr_zero() >= + (unsigned)llvm::countr_zero<uint64_t>( + NewF.BaseOffset.getFixedValue())) goto skip_formula; + } // Ok, looks good. NewF.canonicalize(*this->L); @@ -4595,6 +4876,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { bool Any = false; for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { Formula &F = LU.Formulae[i]; + if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable()) + continue; // Look for a formula with a constant or GV in a register. If the use // also has a formula with that same value in an immediate field, // delete the one that uses a register. @@ -4604,7 +4887,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { Formula NewF = F; //FIXME: Formulas should store bitwidth to do wrapping properly. // See PR41034. 
- NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue(); + NewF.BaseOffset = + Immediate::getFixed(NewF.BaseOffset.getFixedValue() + + (uint64_t)C->getValue()->getSExtValue()); NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -4660,7 +4945,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; for (const Formula &F : LU.Formulae) { - if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) + if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -5247,10 +5532,20 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { assert(Solution.size() == Uses.size() && "Malformed solution!"); + const bool EnableDropUnprofitableSolution = [&] { + switch (AllowDropSolutionIfLessProfitable) { + case cl::BOU_TRUE: + return true; + case cl::BOU_FALSE: + return false; + case cl::BOU_UNSET: + return TTI.shouldDropLSRSolutionIfLessProfitable(); + } + llvm_unreachable("Unhandled cl::boolOrDefault enum"); + }(); + if (BaselineCost.isLess(SolutionCost)) { - LLVM_DEBUG(dbgs() << "The baseline solution requires "; - BaselineCost.print(dbgs()); dbgs() << "\n"); - if (!AllowDropSolutionIfLessProfitable) + if (!EnableDropUnprofitableSolution) LLVM_DEBUG( dbgs() << "Baseline is more profitable than chosen solution, " "add option 'lsr-drop-solution' to drop LSR solution.\n"); @@ -5485,31 +5780,36 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, Ops.push_back(SE.getUnknown(FullV)); } + // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail + // out at this point, or should we generate a SCEV adding together mixed + // offsets? + assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) && + "Expanding mismatched offsets\n"); // Expand the immediate portion. - int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; - if (Offset != 0) { + Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset); + if (Offset.isNonZero()) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); + ICmpScaledV = + ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); - ICmpScaledV = ConstantInt::get(IntTy, Offset); + ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue()); } } else { // Just add the immediate values. These again are expected to be matched // as part of the address. - Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); + Ops.push_back(Offset.getUnknownSCEV(SE, IntTy)); } } // Expand the unfolded offset portion. - int64_t UnfoldedOffset = F.UnfoldedOffset; - if (UnfoldedOffset != 0) { + Immediate UnfoldedOffset = F.UnfoldedOffset; + if (UnfoldedOffset.isNonZero()) { // Just add the immediate values. - Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, - UnfoldedOffset))); + Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy)); } // Emit instructions summing all the operands. 
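The hunks above are the heart of this LoopStrengthReduce change: every raw int64_t offset becomes an Immediate that can carry either a fixed byte offset or a vscale-scaled quantity, and every place that used to add or subtract offsets now checks compatibility first and skips formulae that would mix the two kinds. As a rough, standalone sketch of the wrapper being threaded through (the names mirror the diff, but this struct is only illustrative and is not the class that actually lives in LoopStrengthReduce.cpp):

#include <cassert>
#include <cstdint>

// Illustrative stand-in for LSR's fixed/scalable offset wrapper.
struct Immediate {
  int64_t MinVal = 0;   // Known-minimum value; multiplied by vscale if Scalable.
  bool Scalable = false;

  static Immediate getFixed(int64_t V) { return {V, false}; }
  static Immediate getScalable(int64_t V) { return {V, true}; }

  bool isZero() const { return MinVal == 0; }
  bool isNonZero() const { return MinVal != 0; }
  bool isScalable() const { return Scalable; }
  int64_t getKnownMinValue() const { return MinVal; }
  int64_t getFixedValue() const {
    assert(!Scalable && "not a fixed offset");
    return MinVal;
  }

  // Offsets combine only if they agree on scalability; zero is compatible
  // with everything.
  bool isCompatibleImmediate(const Immediate &Other) const {
    return isZero() || Other.isZero() || Scalable == Other.Scalable;
  }

  // Wrapping (unsigned) arithmetic, mirroring the old (uint64_t) casts.
  Immediate addUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "mixing fixed and scalable offsets");
    return {(int64_t)((uint64_t)MinVal + (uint64_t)RHS.MinVal),
            Scalable || RHS.Scalable};
  }
  Immediate subUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "mixing fixed and scalable offsets");
    return {(int64_t)((uint64_t)MinVal - (uint64_t)RHS.MinVal),
            Scalable || RHS.Scalable};
  }
};

With that picture, the pattern repeated through GenerateConstantOffsetsImpl, GenerateICmpZeroScales and GenerateCrossUseConstantOffsets reads naturally: call isCompatibleImmediate() before combining, use addUnsigned/subUnsigned where the old code cast through uint64_t, and bail out (or fall back to getFixedValue()) wherever a scalable part would reach code that still only understands fixed offsets.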
@@ -5532,10 +5832,9 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, "a scale at the same time!"); if (F.Scale == -1) { if (ICmpScaledV->getType() != OpTy) { - Instruction *Cast = - CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, - OpTy, false), - ICmpScaledV, OpTy, "tmp", CI); + Instruction *Cast = CastInst::Create( + CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false), + ICmpScaledV, OpTy, "tmp", CI->getIterator()); ICmpScaledV = Cast; } CI->setOperand(1, ICmpScaledV); @@ -5546,11 +5845,11 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), - -(uint64_t)Offset); + -(uint64_t)Offset.getFixedValue()); if (C->getType() != OpTy) { C = ConstantFoldCastOperand( CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy, - CI->getModule()->getDataLayout()); + CI->getDataLayout()); assert(C && "Cast of ConstantInt should have folded"); } @@ -5635,11 +5934,10 @@ void LSRInstance::RewriteForPHI( // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) - FullV = - CastInst::Create(CastInst::getCastOpcode(FullV, false, - OpTy, false), - FullV, LF.OperandValToReplace->getType(), - "tmp", BB->getTerminator()); + FullV = CastInst::Create( + CastInst::getCastOpcode(FullV, false, OpTy, false), FullV, + LF.OperandValToReplace->getType(), "tmp", + BB->getTerminator()->getIterator()); // If the incoming block for this value is not in the loop, it means the // current PHI is not in a loop exit, so we must create a LCSSA PHI for @@ -5657,8 +5955,8 @@ void LSRInstance::RewriteForPHI( // formulae will not be implemented completely and some instructions // will not be eliminated. if (needUpdateFixups) { - for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) - for (LSRFixup &Fixup : Uses[LUIdx].Fixups) + for (LSRUse &LU : Uses) + for (LSRFixup &Fixup : LU.Fixups) // If fixup is supposed to rewrite some operand in the phi // that was just updated, it may be already moved to // another phi node. Such fixup requires update. @@ -5711,8 +6009,8 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) { Instruction *Cast = - CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), - FullV, OpTy, "tmp", LF.UserInst); + CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), + FullV, OpTy, "tmp", LF.UserInst->getIterator()); FullV = Cast; } @@ -5856,7 +6154,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0 ? PreferredAddresingMode : TTI.getPreferredAddressingMode(L, &SE)), - Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", false), + Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false), BaselineCost(L, SE, TTI, AMK) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) @@ -5930,6 +6228,8 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "The baseline solution requires "; + BaselineCost.print(dbgs()); dbgs() << "\n"); // Now use the reuse data to generate a bunch of interesting ways // to formulate the values needed for the uses. 
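A second, mostly mechanical change runs through the Expand/Rewrite hunks above: new instructions are created at a BasicBlock iterator (CI->getIterator(), BB->getTerminator()->getIterator(), LF.UserInst->getIterator()) instead of at a raw Instruction*, and the DataLayout is fetched directly via CI->getDataLayout() or L->getHeader()->getDataLayout() rather than going through the module. The iterator form lets the insertion position carry debug-record placement as part of the RemoveDIs transition. A sketch of the idiom, using the same calls the diff already makes (this helper does not exist in LSR; it is only meant to isolate the pattern):

#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Cast V to DestTy (if needed) immediately before InsertPt. Passing an
// iterator rather than an Instruction* lets the insertion position describe
// where the new cast lands relative to any debug records at InsertPt.
static Value *castBefore(Value *V, Type *DestTy, Instruction *InsertPt) {
  if (V->getType() == DestTy)
    return V;
  return CastInst::Create(
      CastInst::getCastOpcode(V, /*SrcIsSigned=*/false, DestTy,
                              /*DstIsSigned=*/false),
      V, DestTy, "tmp", InsertPt->getIterator());
}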
@@ -6368,10 +6668,10 @@ struct DVIRecoveryRec { DVIRecoveryRec(DbgValueInst *DbgValue) : DbgRef(DbgValue), Expr(DbgValue->getExpression()), HadLocationArgList(false) {} - DVIRecoveryRec(DPValue *DPV) - : DbgRef(DPV), Expr(DPV->getExpression()), HadLocationArgList(false) {} + DVIRecoveryRec(DbgVariableRecord *DVR) + : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {} - PointerUnion<DbgValueInst *, DPValue *> DbgRef; + PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef; DIExpression *Expr; bool HadLocationArgList; SmallVector<WeakVH, 2> LocationOps; @@ -6467,7 +6767,7 @@ static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, if (isa<DbgValueInst *>(DVIRec.DbgRef)) UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef)); else - UpdateDbgValueInstImpl(cast<DPValue *>(DVIRec.DbgRef)); + UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef)); } /// Cached location ops may be erased during LSR, in which case a poison is @@ -6513,7 +6813,7 @@ static void restorePreTransformState(DVIRecoveryRec &DVIRec) { if (isa<DbgValueInst *>(DVIRec.DbgRef)) RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef)); else - RestorePreTransformStateImpl(cast<DPValue *>(DVIRec.DbgRef)); + RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef)); } static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, @@ -6523,7 +6823,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, if (isa<DbgValueInst *>(DVIRec.DbgRef) ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation() - : !cast<DPValue *>(DVIRec.DbgRef)->isKillLocation()) + : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation()) return false; // LSR may have caused several changes to the dbg.value in the failed salvage @@ -6621,7 +6921,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n"); else LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " - << *cast<DPValue *>(DVIRec.DbgRef) << "\n"); + << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n"); return true; } @@ -6712,9 +7012,9 @@ static void DbgGatherSalvagableDVI( SalvageableDVISCEVs.push_back(std::move(NewRec)); return true; }; - for (auto &DPV : I.getDbgValueRange()) { - if (DPV.isDbgValue() || DPV.isDbgAssign()) - ProcessDbgValue(&DPV); + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { + if (DVR.isDbgValue() || DVR.isDbgAssign()) + ProcessDbgValue(&DVR); } auto DVI = dyn_cast<DbgValueInst>(&I); if (!DVI) @@ -6762,7 +7062,7 @@ static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE, static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>> canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, - const LoopInfo &LI) { + const LoopInfo &LI, const TargetTransformInfo &TTI) { if (!L->isInnermost()) { LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n"); return std::nullopt; @@ -6808,18 +7108,35 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep)) return std::nullopt; + // Ensure the simple recurrence is a part of the current loop. + if (ToFold->getParent() != L->getHeader()) + return std::nullopt; + // If that IV isn't dead after we rewrite the exit condition in terms of // another IV, there's no point in doing the transform. 
if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond)) return std::nullopt; + // Inserting instructions in the preheader has a runtime cost, scale + // the allowed cost with the loops trip count as best we can. + const unsigned ExpansionBudget = [&]() { + unsigned Budget = 2 * SCEVCheapExpansionBudget; + if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L)) + return std::min(Budget, SmallTC); + if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L)) + return std::min(Budget, *SmallTC); + // Unknown trip count, assume long running by default. + return Budget; + }(); + const SCEV *BECount = SE.getBackedgeTakenCount(L); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L->getHeader()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); PHINode *ToHelpFold = nullptr; const SCEV *TermValueS = nullptr; bool MustDropPoison = false; + auto InsertPt = L->getLoopPreheader()->getTerminator(); for (PHINode &PN : L->getHeader()->phis()) { if (ToFold == &PN) continue; @@ -6861,6 +7178,14 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, continue; } + if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget, + &TTI, InsertPt)) { + LLVM_DEBUG( + dbgs() << "Is too expensive to expand terminating value for phi node" + << PN << "\n"); + continue; + } + // The candidate IV may have been otherwise dead and poison from the // very first iteration. If we can't disprove that, we can't use the IV. if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) { @@ -6941,12 +7266,13 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakTrackingVH, 16> DeadInsts; - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L->getHeader()->getDataLayout(); SCEVExpander Rewriter(SE, DL, "lsr", false); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI); + Rewriter.clear(); if (numFolded) { Changed = true; RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, @@ -6961,10 +7287,11 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // skip the updates in each loop iteration. 
if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) { SmallVector<WeakTrackingVH, 16> DeadInsts; - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L->getHeader()->getDataLayout(); SCEVExpander Rewriter(SE, DL, "lsr", true); int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT, UnusedIndVarInLoop, DeadInsts); + Rewriter.clear(); if (Rewrites) { Changed = true; RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, @@ -6986,7 +7313,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, }(); if (EnableFormTerm) { - if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI)) { + if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) { auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt; Changed = true; @@ -7010,9 +7337,8 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags(); // SCEVExpander for both use in preheader and latch - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = L->getHeader()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); - SCEVExpanderCleaner ExpCleaner(Expander); assert(Expander.isSafeToExpand(TermValueS) && "Terminating value was checked safe in canFoldTerminatingCondition"); @@ -7043,10 +7369,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, BI->setCondition(NewTermCond); + Expander.clear(); OldTermCond->eraseFromParent(); DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); - - ExpCleaner.markResultUsed(); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 7b4c54370e48..f8e2f1f28088 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns); if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable\n"); return LoopUnrollResult::Unmodified; } @@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } - if (InnerUCE.Convergent || OuterUCE.Convergent) { + // FIXME: The call to canUnroll() allows some controlled convergent + // operations, but we block them here for future changes. 
+ if (InnerUCE.Convergence != ConvergenceKind::None || + OuterUCE.Convergence != ConvergenceKind::None) { LLVM_DEBUG( dbgs() << " Not unrolling loop with convergent instructions.\n"); return LoopUnrollResult::Unmodified; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 7cfeb019af97..cbc35b6dd429 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -27,6 +28,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -173,6 +175,10 @@ static cl::opt<unsigned> cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations")); +static cl::opt<unsigned> PragmaUnrollFullMaxIterations( + "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden, + cl::desc("Maximum allowed iterations to unroll under pragma unroll full.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -446,7 +452,15 @@ static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost( // First accumulate the cost of this instruction. if (!Cost.IsFree) { - UnrolledCost += TTI.getInstructionCost(I, CostKind); + // Consider simplified operands in instruction cost. + SmallVector<Value *, 4> Operands; + transform(I->operands(), std::back_inserter(Operands), + [&](Value *Op) { + if (auto Res = SimplifiedValues.lookup(Op)) + return Res; + return Op; + }); + UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind); LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration << "): "); LLVM_DEBUG(I->dump()); @@ -670,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator( const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) { CodeMetrics Metrics; for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); + Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false, + L); NumInlineCandidates = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; - Convergent = Metrics.convergent; + Convergence = Metrics.Convergence; LoopSize = Metrics.NumInsts; + ConvergenceAllowsRuntime = + Metrics.Convergence != ConvergenceKind::Uncontrolled && + !getLoopConvergenceHeart(L); // Don't allow an estimate of size zero. 
This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's @@ -687,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator( LoopSize = BEInsns + 1; } +bool UnrollCostEstimator::canUnroll() const { + switch (Convergence) { + case ConvergenceKind::ExtendedLoop: + LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n"); + return false; + default: + break; + } + if (!LoopSize.isValid()) { + LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n"); + return false; + } + if (NotDuplicatable) { + LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n"); + return false; + } + return true; +} + uint64_t UnrollCostEstimator::getUnrolledLoopSize( const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite) const { @@ -776,8 +813,17 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, return PInfo.PragmaCount; } - if (PInfo.PragmaFullUnroll && TripCount != 0) + if (PInfo.PragmaFullUnroll && TripCount != 0) { + // Certain cases with UBSAN can cause trip count to be calculated as + // INT_MAX, Block full unrolling at a reasonable limit so that the compiler + // doesn't hang trying to unroll the loop. See PR77842 + if (TripCount > PragmaUnrollFullMaxIterations) { + LLVM_DEBUG(dbgs() << "Won't unroll; trip count is too large\n"); + return std::nullopt; + } + return TripCount; + } if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && MaxTripCount <= UP.MaxUpperBound) @@ -1119,7 +1165,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, std::optional<bool> ProvidedUpperBound, std::optional<bool> ProvidedAllowPeeling, std::optional<bool> ProvidedAllowProfileBasedPeeling, - std::optional<unsigned> ProvidedFullUnrollMaxCount) { + std::optional<unsigned> ProvidedFullUnrollMaxCount, + AAResults *AA = nullptr) { LLVM_DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" @@ -1182,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); if (!UCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n"); return LoopUnrollResult::Unmodified; } @@ -1230,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // is unsafe -- it adds a control-flow dependency to the convergent // operation. Therefore restrict remainder loop (try unrolling without). // - // TODO: This is quite conservative. In practice, convergent_op() - // is likely to be called unconditionally in the loop. In this - // case, the program would be ill-formed (on most architectures) - // unless n were the same on all threads in a thread group. - // Assuming n is the same on all threads, any kind of unrolling is - // safe. But currently llvm's notion of convergence isn't powerful - // enough to express this. - if (UCE.Convergent) - UP.AllowRemainder = false; + // TODO: This is somewhat conservative; we could allow the remainder if the + // trip count is uniform. + UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime; // Try to find the trip count upper bound if we cannot find the exact trip // count. 
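Two things are happening in the LoopUnrollPass hunks above: canUnroll() now centralizes the bail-outs (extended-loop convergence, invalid loop size, non-duplicatable blocks), and shouldPragmaUnroll() refuses to honor '#pragma unroll(full)' when the computed trip count is absurdly large, since UBSAN-instrumented code can make the trip count come out near INT_MAX and the unroller would effectively hang (PR77842). Reduced to a standalone decision function, the new full-unroll guard looks roughly like this (sketch only; the real code also handles pragma counts, unroll(enable), and the max-trip-count path):

#include <cstdint>
#include <optional>

// Mirrors the default of the new -pragma-unroll-full-max-iterations option.
constexpr uint64_t PragmaUnrollFullMaxIterations = 1'000'000;

// Return the count to fully unroll with under '#pragma unroll(full)', or
// nullopt when full unrolling should be refused.
std::optional<uint64_t> fullUnrollCountForPragma(uint64_t TripCount) {
  if (TripCount == 0)                            // Trip count unknown.
    return std::nullopt;
  if (TripCount > PragmaUnrollFullMaxIterations)  // Don't hang on huge counts.
    return std::nullopt;
  return TripCount;
}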
@@ -1258,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, if (!UP.Count) return LoopUnrollResult::Unmodified; + UP.Runtime &= UCE.ConvergenceAllowsRuntime; + if (PP.PeelCount) { assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName() @@ -1271,7 +1313,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, ValueToValueMapTy VMap; if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) { - simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI); + simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr); // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. if (PP.PeelProfiledIterations) @@ -1282,7 +1324,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && !(UP.Count >= MaxTripCount)) { + if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; @@ -1300,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // Unroll the loop. Loop *RemainderLoop = nullptr; + UnrollLoopOptions ULO; + ULO.Count = UP.Count; + ULO.Force = UP.Force; + ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount; + ULO.UnrollRemainder = UP.UnrollRemainder; + ULO.Runtime = UP.Runtime; + ULO.ForgetAllSCEV = ForgetAllSCEV; + ULO.Heart = getLoopConvergenceHeart(L); LoopUnrollResult UnrollResult = UnrollLoop( - L, - {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UP.UnrollRemainder, ForgetAllSCEV}, - LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); + L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; @@ -1551,6 +1598,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + AAResults &AA = AM.getResult<AAManager>(F); LoopAnalysisManager *LAM = nullptr; if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F)) @@ -1606,7 +1654,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*Count*/ std::nullopt, /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling, - UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, + &AA); Changed |= Result != LoopUnrollResult::Unmodified; // The parent must not be damaged by unrolling! 
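That closes the LoopUnrollPass changes. One small but worthwhile detail is the switch from a positional braced initializer for the unroll options to named field assignments (ULO.Count = UP.Count; and so on) before the new Heart member and the AAResults parameter are threaded through: with half a dozen adjacent flags of the same type, positional initialization is easy to get silently wrong, while named assignment stays correct when fields are added or reordered. A small illustration with a made-up options struct (not the real UnrollLoopOptions):

// Hypothetical options bag; the field names echo the diff, but the struct is
// purely illustrative.
struct UnrollOptionsSketch {
  unsigned Count = 0;
  bool Force = false;
  bool Runtime = false;
  bool AllowExpensiveTripCount = false;
  bool UnrollRemainder = false;
  bool ForgetAllSCEV = false;
};

UnrollOptionsSketch positionalStyle() {
  // Positional aggregate init: the adjacent bools are easy to transpose,
  // and adding a field shifts every later argument.
  return {8, false, true, false, false, true};
}

UnrollOptionsSketch namedStyle() {
  // Named assignment, as the patch now does for UnrollLoopOptions: each
  // value is tied to its field, so extending the struct stays safe.
  UnrollOptionsSketch ULO;
  ULO.Count = 8;
  ULO.Runtime = true;
  ULO.ForgetAllSCEV = true;
  return ULO;
}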
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index f39c24484840..663715948241 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -582,7 +582,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp index 6aba913005d0..b42d3b2bc09a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp @@ -20,7 +20,7 @@ #include "llvm/Transforms/Utils/LowerAtomic.h" using namespace llvm; -#define DEBUG_TYPE "loweratomic" +#define DEBUG_TYPE "lower-atomic" static bool LowerFenceInst(FenceInst *FI) { FI->eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp index b167120a906d..bd7895feb64a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -85,8 +85,11 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II, if (Target && Target != Other) { BasicBlock *Source = BI->getParent(); Other->removePredecessor(Source); + + Instruction *NewBI = BranchInst::Create(Target, Source); + NewBI->setDebugLoc(BI->getDebugLoc()); BI->eraseFromParent(); - BranchInst::Create(Target, Source); + if (DTU) DTU->applyUpdates({{DominatorTree::Delete, Source, Other}}); if (pred_empty(Other)) @@ -103,7 +106,7 @@ static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI, DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy); bool HasDeadBlocks = false; - const auto &DL = F.getParent()->getDataLayout(); + const auto &DL = F.getDataLayout(); SmallVector<WeakTrackingVH, 8> Worklist; ReversePostOrderTraversal<Function *> RPOT(&F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 6f87e4d91d2c..17c5a4ee1fd0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -102,7 +102,7 @@ static bool handleSwitchExpect(SwitchInst &SI) { misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true); SI.setCondition(ArgValue); - setBranchWeights(SI, Weights); + setBranchWeights(SI, Weights, /*IsExpected=*/true); return true; } @@ -262,11 +262,13 @@ static void handlePhiDef(CallInst *Expect) { if (IsOpndComingFromSuccessor(BI->getSuccessor(1))) BI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(LikelyBranchWeightVal, - UnlikelyBranchWeightVal)); + UnlikelyBranchWeightVal, + /*IsExpected=*/true)); else if (IsOpndComingFromSuccessor(BI->getSuccessor(0))) BI->setMetadata(LLVMContext::MD_prof, 
MDB.createBranchWeights(UnlikelyBranchWeightVal, - LikelyBranchWeightVal)); + LikelyBranchWeightVal, + /*IsExpected=*/true)); } } @@ -331,12 +333,12 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { SmallVector<uint32_t, 4> ExpectedWeights; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == (Predicate == CmpInst::ICMP_EQ)) { - Node = - MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal); + Node = MDB.createBranchWeights( + LikelyBranchWeightVal, UnlikelyBranchWeightVal, /*IsExpected=*/true); ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal}; } else { - Node = - MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal); + Node = MDB.createBranchWeights(UnlikelyBranchWeightVal, + LikelyBranchWeightVal, /*IsExpected=*/true); ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal}; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 72b9db1e73d7..6a681fd93397 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -19,6 +19,7 @@ #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -192,6 +193,109 @@ Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, return VecStart; } +namespace { +struct ShapeInfo { + unsigned NumRows; + unsigned NumColumns; + + bool IsColumnMajor; + + ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) + : NumRows(NumRows), NumColumns(NumColumns), + IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} + + ShapeInfo(Value *NumRows, Value *NumColumns) + : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(), + cast<ConstantInt>(NumColumns)->getZExtValue()) {} + + bool operator==(const ShapeInfo &other) { + return NumRows == other.NumRows && NumColumns == other.NumColumns; + } + bool operator!=(const ShapeInfo &other) { return !(*this == other); } + + /// Returns true if shape-information is defined, meaning both dimensions + /// are != 0. + operator bool() const { + assert(NumRows == 0 || NumColumns != 0); + return NumRows != 0; + } + + unsigned getStride() const { + if (IsColumnMajor) + return NumRows; + return NumColumns; + } + + unsigned getNumVectors() const { + if (IsColumnMajor) + return NumColumns; + return NumRows; + } + + /// Returns the transposed shape. + ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); } +}; +} // namespace + +static bool isUniformShape(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return true; + + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: // Scalar multiply. + case Instruction::FNeg: + case Instruction::Add: + case Instruction::Mul: + case Instruction::Sub: + return true; + default: + return false; + } +} + +/// Return the ShapeInfo for the result of \p I, it it can be determined. 
+static std::optional<ShapeInfo> +computeShapeInfoForInst(Instruction *I, + const ValueMap<Value *, ShapeInfo> &ShapeMap) { + Value *M; + Value *N; + Value *K; + if (match(I, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(), m_Value(), m_Value(M), m_Value(N), m_Value(K)))) + return ShapeInfo(M, K); + if (match(I, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(), m_Value(M), + m_Value(N)))) { + // Flip dimensions. + return ShapeInfo(N, M); + } + if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_store>( + m_Value(), m_Value(), m_Value(), m_Value(), m_Value(M), + m_Value(N)))) + return ShapeInfo(N, M); + if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_load>( + m_Value(), m_Value(), m_Value(), m_Value(M), m_Value(N)))) + return ShapeInfo(M, N); + Value *MatrixA; + if (match(I, m_Store(m_Value(MatrixA), m_Value()))) { + auto OpShape = ShapeMap.find(MatrixA); + if (OpShape != ShapeMap.end()) + return OpShape->second; + } + + if (isUniformShape(I)) { + // Find the first operand that has a known shape and use that. + for (auto &Op : I->operands()) { + auto OpShape = ShapeMap.find(Op.get()); + if (OpShape != ShapeMap.end()) + return OpShape->second; + } + } + return std::nullopt; +} + /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. /// /// Currently, the lowering for each matrix intrinsic is done as follows: @@ -383,48 +487,6 @@ class LowerMatrixIntrinsics { } }; - struct ShapeInfo { - unsigned NumRows; - unsigned NumColumns; - - bool IsColumnMajor; - - ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) - : NumRows(NumRows), NumColumns(NumColumns), - IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} - - ShapeInfo(Value *NumRows, Value *NumColumns) - : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(), - cast<ConstantInt>(NumColumns)->getZExtValue()) {} - - bool operator==(const ShapeInfo &other) { - return NumRows == other.NumRows && NumColumns == other.NumColumns; - } - bool operator!=(const ShapeInfo &other) { return !(*this == other); } - - /// Returns true if shape-information is defined, meaning both dimensions - /// are != 0. - operator bool() const { - assert(NumRows == 0 || NumColumns != 0); - return NumRows != 0; - } - - unsigned getStride() const { - if (IsColumnMajor) - return NumRows; - return NumColumns; - } - - unsigned getNumVectors() const { - if (IsColumnMajor) - return NumColumns; - return NumRows; - } - - /// Returns the transposed shape. - ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); } - }; - /// Maps instructions to their shape information. The shape information /// describes the shape to be used while lowering. This matches the shape of /// the result value of the instruction, with the only exceptions being store @@ -459,7 +521,7 @@ public: LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, OptimizationRemarkEmitter *ORE) - : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT), + : Func(F), DL(F.getDataLayout()), TTI(TTI), AA(AA), DT(DT), LI(LI), ORE(ORE) {} unsigned getNumOps(Type *VT) { @@ -554,25 +616,6 @@ public: return true; } - bool isUniformShape(Value *V) { - Instruction *I = dyn_cast<Instruction>(V); - if (!I) - return true; - - switch (I->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: // Scalar multiply. 
- case Instruction::FNeg: - case Instruction::Add: - case Instruction::Mul: - case Instruction::Sub: - return true; - default: - return false; - } - } - /// Returns true if shape information can be used for \p V. The supported /// instructions must match the instructions that can be lowered by this pass. bool supportsShapeInfo(Value *V) { @@ -610,43 +653,8 @@ public: // New entry, set the value and insert operands bool Propagate = false; - - Value *MatrixA; - Value *MatrixB; - Value *M; - Value *N; - Value *K; - if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>( - m_Value(MatrixA), m_Value(MatrixB), m_Value(M), - m_Value(N), m_Value(K)))) { - Propagate = setShapeInfo(Inst, {M, K}); - } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>( - m_Value(MatrixA), m_Value(M), m_Value(N)))) { - // Flip dimensions. - Propagate = setShapeInfo(Inst, {N, M}); - } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>( - m_Value(MatrixA), m_Value(), m_Value(), - m_Value(), m_Value(M), m_Value(N)))) { - Propagate = setShapeInfo(Inst, {N, M}); - } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>( - m_Value(), m_Value(), m_Value(), m_Value(M), - m_Value(N)))) { - Propagate = setShapeInfo(Inst, {M, N}); - } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) { - auto OpShape = ShapeMap.find(MatrixA); - if (OpShape != ShapeMap.end()) - setShapeInfo(Inst, OpShape->second); - continue; - } else if (isUniformShape(Inst)) { - // Find the first operand that has a known shape and use that. - for (auto &Op : Inst->operands()) { - auto OpShape = ShapeMap.find(Op.get()); - if (OpShape != ShapeMap.end()) { - Propagate |= setShapeInfo(Inst, OpShape->second); - break; - } - } - } + if (auto SI = computeShapeInfoForInst(Inst, ShapeMap)) + Propagate = setShapeInfo(Inst, *SI); if (Propagate) { NewWorkList.push_back(Inst); @@ -891,20 +899,28 @@ public: updateShapeAndReplaceAllUsesWith(I, NewInst); CleanupBinOp(I, A, B); } - // A^t + B ^t -> (A + B)^t + // A^t + B ^t -> (A + B)^t. Pick rows and columns from first transpose. If + // the shape of the second transpose is different, there's a shape conflict + // which gets resolved by picking the shape of the first operand. 
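The rewrite described in the comment above folds A^t + B^t into (A + B)^t, taking rows and columns from the first transpose when the second one's shape disagrees. A small standalone check of the underlying identity (plain nested loops over 2x2 matrices, nothing from the pass):

  #include <array>
  #include <cassert>

  using Mat2 = std::array<std::array<double, 2>, 2>;

  static Mat2 transpose(const Mat2 &M) {
    Mat2 T{};
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        T[i][j] = M[j][i];
    return T;
  }

  static Mat2 add(const Mat2 &A, const Mat2 &B) {
    Mat2 R{};
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        R[i][j] = A[i][j] + B[i][j];
    return R;
  }

  int main() {
    Mat2 A{{{1, 2}, {3, 4}}}, B{{{5, 6}, {7, 8}}};
    // A^t + B^t == (A + B)^t, which is why the pass can sink the transpose.
    assert(add(transpose(A), transpose(B)) == transpose(add(A, B)));
  }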
else if (match(&I, m_FAdd(m_Value(A), m_Value(B))) && match(A, m_Intrinsic<Intrinsic::matrix_transpose>( m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) && match(B, m_Intrinsic<Intrinsic::matrix_transpose>( - m_Value(BT), m_ConstantInt(R), m_ConstantInt(C)))) { + m_Value(BT), m_ConstantInt(), m_ConstantInt()))) { IRBuilder<> Builder(&I); - Value *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd")); - setShapeInfo(Add, {C, R}); + auto *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd")); + setShapeInfo(Add, {R, C}); MatrixBuilder MBuilder(Builder); Instruction *NewInst = MBuilder.CreateMatrixTranspose( - Add, C->getZExtValue(), R->getZExtValue(), "mfadd_t"); + Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t"); updateShapeAndReplaceAllUsesWith(I, NewInst); + assert(computeShapeInfoForInst(NewInst, ShapeMap) == + computeShapeInfoForInst(&I, ShapeMap) && + "Shape of new instruction doesn't match original shape."); CleanupBinOp(I, A, B); + assert(computeShapeInfoForInst(Add, ShapeMap).value_or(ShapeMap[Add]) == + ShapeMap[Add] && + "Shape of updated addition doesn't match cached shape."); } } @@ -975,12 +991,15 @@ public: bool Changed = false; SmallVector<CallInst *, 16> MaybeFusableInsts; SmallVector<Instruction *, 16> MatrixInsts; + SmallVector<IntrinsicInst *, 16> LifetimeEnds; // First, collect all instructions with shape information and candidates for // fusion (currently only matrix multiplies). ReversePostOrderTraversal<Function *> RPOT(&Func); for (auto *BB : RPOT) for (Instruction &I : *BB) { + if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>())) + LifetimeEnds.push_back(cast<IntrinsicInst>(&I)); if (ShapeMap.find(&I) == ShapeMap.end()) continue; if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>())) @@ -995,7 +1014,7 @@ public: // Third, try to fuse candidates. for (CallInst *CI : MaybeFusableInsts) - LowerMatrixMultiplyFused(CI, FusedInsts); + LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds); Changed = !FusedInsts.empty(); @@ -1332,8 +1351,8 @@ public: if (!IsIntVec && !FMF.allowReassoc()) return; - auto CanBeFlattened = [this](Value *Op) { - if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end()) + auto CanBeFlattened = [](Value *Op) { + if (match(Op, m_BinOp())) return true; return match( Op, m_OneUse(m_CombineOr( @@ -1346,6 +1365,9 @@ public: // the returned cost is < 0, the argument is cheaper to use in the // dot-product lowering. auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) { + if (ShapeMap.find(Op) == ShapeMap.end()) + return InstructionCost::getInvalid(); + if (!isa<Instruction>(Op)) return InstructionCost(0); @@ -1356,7 +1378,7 @@ public: InstructionCost EmbedCost(0); // Roughly estimate the cost for embedding the columns into a vector. for (unsigned I = 1; I < N; ++I) - EmbedCost -= + EmbedCost += TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1), std::nullopt, TTI::TCK_RecipThroughput); return EmbedCost; @@ -1378,7 +1400,7 @@ public: // vector. InstructionCost EmbedCost(0); for (unsigned I = 1; I < N; ++I) - EmbedCost += + EmbedCost -= TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1), std::nullopt, TTI::TCK_RecipThroughput); return EmbedCost; @@ -1391,7 +1413,29 @@ public: return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) - N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0); }; - auto LHSCost = GetCostForArg(LHS, LShape.NumColumns); + + // Iterate over LHS and operations feeding LHS and check if it is profitable + // to flatten the visited ops. 
For each op, we compute the difference + // between the flattened and matrix versions. + SmallPtrSet<Value *, 4> Seen; + SmallVector<Value *> WorkList; + SmallVector<Value *> ToFlatten; + WorkList.push_back(LHS); + InstructionCost LHSCost(0); + while (!WorkList.empty()) { + Value *Op = WorkList.pop_back_val(); + if (!Seen.insert(Op).second) + continue; + + InstructionCost OpCost = GetCostForArg(Op, LShape.NumColumns); + if (OpCost + LHSCost >= LHSCost) + continue; + + LHSCost += OpCost; + ToFlatten.push_back(Op); + if (auto *I = dyn_cast<Instruction>(Op)) + WorkList.append(I->op_begin(), I->op_end()); + } // We compare the costs of a vector.reduce.add to sequential add. int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd; @@ -1412,16 +1456,16 @@ public: FusedInsts.insert(MatMul); IRBuilder<> Builder(MatMul); auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened, - this](Value *Op) -> Value * { + this](Value *Op) { // Matmul must be the only user of loads because we don't use LowerLoad // for row vectors (LowerLoad results in scalar loads and shufflevectors // instead of single vector load). if (!CanBeFlattened(Op)) - return Op; + return; if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end()) { ShapeMap[Op] = ShapeMap[Op].t(); - return Op; + return; } FusedInsts.insert(cast<Instruction>(Op)); @@ -1432,16 +1476,19 @@ public: auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg); Op->replaceAllUsesWith(NewLoad); cast<Instruction>(Op)->eraseFromParent(); - return NewLoad; + return; } else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>( m_Value(Arg)))) { ToRemove.push_back(cast<Instruction>(Op)); - return Arg; + Op->replaceAllUsesWith(Arg); + return; } - - return Op; }; - LHS = FlattenArg(LHS); + + for (auto *V : ToFlatten) + FlattenArg(V); + + LHS = MatMul->getArgOperand(0); // Insert mul/fmul and llvm.vector.reduce.fadd Value *Mul = @@ -1594,7 +1641,7 @@ public: IRBuilder<> Builder(MatMul); Check0->getTerminator()->eraseFromParent(); Builder.SetInsertPoint(Check0); - Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout()); + Type *IntPtrTy = Builder.getIntPtrTy(Load->getDataLayout()); Value *StoreBegin = Builder.CreatePtrToInt( const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin"); Value *StoreEnd = Builder.CreateAdd( @@ -1813,8 +1860,10 @@ public: /// /// Call finalizeLowering on lowered instructions. Instructions that are /// completely eliminated by fusion are added to \p FusedInsts. - void LowerMatrixMultiplyFused(CallInst *MatMul, - SmallPtrSetImpl<Instruction *> &FusedInsts) { + void + LowerMatrixMultiplyFused(CallInst *MatMul, + SmallPtrSetImpl<Instruction *> &FusedInsts, + SmallVector<IntrinsicInst *, 16> &LifetimeEnds) { if (!FuseMatrix || !DT) return; @@ -1903,6 +1952,55 @@ public: for (Instruction *I : ToHoist) I->moveBefore(MatMul); + // Deal with lifetime.end calls that might be between Load0/Load1 and the + // store. To avoid introducing loads to dead objects (i.e. after the + // lifetime has been termined by @llvm.lifetime.end), either sink them + // after the store if in the same block, or remove the lifetime.end marker + // otherwise. This might pessimize further optimizations, by extending the + // lifetime of the object until the function returns, but should be + // conservatively correct. 
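The lifetime.end handling described in the comment above (and implemented just below) walks LifetimeEnds by index, uses llvm::make_scope_exit to advance the index on every exit path, and release()s the guard when it removes an entry by swapping it with the back. A self-contained sketch of that erase-while-iterating idiom, with a tiny local guard standing in for llvm/ADT/ScopeExit.h:

  #include <cassert>
  #include <utility>
  #include <vector>

  // Minimal stand-in for llvm::make_scope_exit: runs Fn on destruction unless released.
  template <typename F> struct ScopeExit {
    F Fn;
    bool Active = true;
    explicit ScopeExit(F Fn_) : Fn(std::move(Fn_)) {}
    ~ScopeExit() { if (Active) Fn(); }
    void release() { Active = false; }
  };

  int main() {
    std::vector<int> Vals = {1, 2, 3, 4, 5, 6};
    // Remove the even elements; order is not preserved (swap with back, pop).
    for (unsigned Idx = 0; Idx != Vals.size();) {
      ScopeExit Inc([&Idx] { ++Idx; });   // advance unless we remove Vals[Idx]
      if (Vals[Idx] % 2 != 0)
        continue;                         // keep odd values; Inc fires on scope exit
      std::swap(Vals[Idx], Vals.back());  // unordered erase, as in the pass
      Vals.pop_back();
      Inc.release();                      // stay at Idx: it now holds a different element
    }
    assert(Vals.size() == 3);
  }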
+ MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0); + MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1); + BasicBlock *StoreParent = Store->getParent(); + bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent && + LoadOp1->getParent() == StoreParent; + for (unsigned Idx = 0; Idx != LifetimeEnds.size();) { + IntrinsicInst *End = LifetimeEnds[Idx]; + auto Inc = make_scope_exit([&Idx]() { Idx++; }); + // If the lifetime.end is guaranteed to be before the loads or after the + // store, it won't interfere with fusion. + if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1)) + continue; + if (DT->dominates(Store, End)) + continue; + // If all fusable ops are in the same block and the lifetime.end is in a + // different block, it won't interfere with fusion. + if (FusableOpsInSameBlock && End->getParent() != StoreParent) + continue; + + // If the loads don't alias the lifetime.end, it won't interfere with + // fusion. + MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr); + if (!EndLoc.Ptr) + continue; + if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc)) + continue; + + // If both lifetime.end and the store are in the same block, extend the + // lifetime until after the store, so the new lifetime covers the loads + // we introduce later. + if (End->getParent() == StoreParent) { + End->moveAfter(Store); + continue; + } + + // Otherwise remove the conflicting lifetime.end marker. + ToRemove.push_back(End); + std::swap(LifetimeEnds[Idx], LifetimeEnds.back()); + LifetimeEnds.pop_back(); + Inc.release(); + } + emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts); return; } @@ -2364,7 +2462,7 @@ public: RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix, OptimizationRemarkEmitter &ORE, Function &Func) : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func), - DL(Func.getParent()->getDataLayout()) {} + DL(Func.getDataLayout()) {} /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are /// instructions in Inst2Matrix returning void or without any users in diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 78e474f925b5..aea17aa82a88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/GuardUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 805bbe40bd7c..cee34f0a6da1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -99,7 +100,7 @@ struct MemsetRange { MaybeAlign Alignment; /// TheStores - The actual stores that make up this range. 
- SmallVector<Instruction*, 16> TheStores; + SmallVector<Instruction *, 16> TheStores; bool isProfitableToUseMemset(const DataLayout &DL) const; }; @@ -108,10 +109,12 @@ struct MemsetRange { bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we found more than 4 stores to merge or 16 bytes, use memset. - if (TheStores.size() >= 4 || End-Start >= 16) return true; + if (TheStores.size() >= 4 || End - Start >= 16) + return true; // If there is nothing to merge, don't do anything. - if (TheStores.size() < 2) return false; + if (TheStores.size() < 2) + return false; // If any of the stores are a memset, then it is always good to extend the // memset. @@ -121,7 +124,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // Assume that the code generator is capable of merging pairs of stores // together if it wants to. - if (TheStores.size() == 2) return false; + if (TheStores.size() == 2) + return false; // If we have fewer than 8 stores, it can still be worthwhile to do this. // For example, merging 4 i8 stores into an i32 store is useful almost always. @@ -133,7 +137,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // the maximum GPR width is the same size as the largest legal integer // size. If so, check to see whether we will end up actually reducing the // number of stores used. - unsigned Bytes = unsigned(End-Start); + unsigned Bytes = unsigned(End - Start); unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8; if (MaxIntSize == 0) MaxIntSize = 1; @@ -145,7 +149,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 // etc. - return TheStores.size() > NumPointerStores+NumByteStores; + return TheStores.size() > NumPointerStores + NumByteStores; } namespace { @@ -197,7 +201,7 @@ public: /// existing ranges as appropriate. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment, Instruction *Inst) { - int64_t End = Start+Size; + int64_t End = Start + Size; range_iterator I = partition_point( Ranges, [=](const MemsetRange &O) { return O.End < Start; }); @@ -207,10 +211,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, // to insert a new range. Handle this now. if (I == Ranges.end() || End < I->Start) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); - R.Start = Start; - R.End = End; - R.StartPtr = Ptr; - R.Alignment = Alignment; + R.Start = Start; + R.End = End; + R.StartPtr = Ptr; + R.Alignment = Alignment; R.TheStores.push_back(Inst); return; } @@ -354,7 +358,7 @@ static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - const DataLayout &DL = StartInst->getModule()->getDataLayout(); + const DataLayout &DL = StartInst->getDataLayout(); // We can't track scalable types if (auto *SI = dyn_cast<StoreInst>(StartInst)) @@ -397,7 +401,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, if (auto *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. 
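MemsetRange::isProfitableToUseMemset, reflowed above, keeps its existing heuristic: always merge once a range reaches 4 stores or 16 bytes, leave pairs to the code generator, and otherwise compare against the number of stores the target could emit with its widest legal integer. A rough standalone model of that decision; the 8-byte "largest legal integer" is an assumption here, and the "always extend an existing memset" case is omitted:

  #include <cassert>
  #include <cstdint>

  // Simplified model of the profitability check. MaxIntBytes stands in for
  // DataLayout::getLargestLegalIntTypeSizeInBits() / 8.
  static bool profitableAsMemset(unsigned NumStores, int64_t Start, int64_t End,
                                 unsigned MaxIntBytes = 8) {
    if (NumStores >= 4 || End - Start >= 16)
      return true;                 // big enough: always use memset
    if (NumStores < 3)
      return false;                // pairs are left to the code generator

    uint64_t Bytes = uint64_t(End - Start);
    unsigned NumPointerStores = Bytes / MaxIntBytes;   // wide stores needed anyway
    unsigned NumByteStores = Bytes % MaxIntBytes;      // leftover byte stores
    // Only profitable if memset replaces strictly more stores than the
    // target would need without it.
    return NumStores > NumPointerStores + NumByteStores;
  }

  int main() {
    assert(profitableAsMemset(5, 0, 5));    // >= 4 stores
    assert(profitableAsMemset(3, 0, 20));   // >= 16 bytes
    assert(!profitableAsMemset(2, 0, 8));   // a pair: leave to the backend
    assert(profitableAsMemset(3, 0, 8));    // 3 stores covering one 64-bit word
    assert(!profitableAsMemset(3, 0, 3));   // 3 byte stores: memset saves nothing
  }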
- if (!NextStore->isSimple()) break; + if (!NextStore->isSimple()) + break; Value *StoredVal = NextStore->getValueOperand(); @@ -460,7 +465,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, // emit memset's for anything big enough to be worthwhile. Instruction *AMemSet = nullptr; for (const MemsetRange &Range : Ranges) { - if (Range.TheStores.size() == 1) continue; + if (Range.TheStores.size() == 1) + continue; // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(DL)) @@ -481,12 +487,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); - auto *NewDef = - cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI - ? MSSAU->createMemoryAccessBefore( - AMemSet, nullptr, MemInsertPoint) - : MSSAU->createMemoryAccessAfter( - AMemSet, nullptr, MemInsertPoint)); + auto *NewDef = cast<MemoryDef>( + MemInsertPoint->getMemoryInst() == &*BI + ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint) + : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint)); MSSAU->insertDef(NewDef, /*RenameUses=*/true); MemInsertPoint = NewDef; @@ -512,12 +516,13 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) { // Keep track of the arguments of all instruction we plan to lift // so we can make sure to lift them as well if appropriate. - DenseSet<Instruction*> Args; + DenseSet<Instruction *> Args; auto AddArg = [&](Value *Arg) { auto *I = dyn_cast<Instruction>(Arg); if (I && I->getParent() == SI->getParent()) { // Cannot hoist user of P above P - if (I == P) return false; + if (I == P) + return false; Args.insert(I); } return true; @@ -630,8 +635,7 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) { bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, const DataLayout &DL, BasicBlock::iterator &BBI) { - if (!LI->isSimple() || !LI->hasOneUse() || - LI->getParent() != SI->getParent()) + if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; auto *T = LI->getType(); @@ -677,22 +681,21 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (isModSet(AA->getModRefInfo(SI, LoadLoc))) UseMemMove = true; - uint64_t Size = DL.getTypeStoreSize(T); - IRBuilder<> Builder(P); + Value *Size = + Builder.CreateTypeSize(Builder.getInt64Ty(), DL.getTypeStoreSize(T)); Instruction *M; if (UseMemMove) - M = Builder.CreateMemMove( - SI->getPointerOperand(), SI->getAlign(), - LI->getPointerOperand(), LI->getAlign(), Size); + M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(), + LI->getPointerOperand(), LI->getAlign(), + Size); else - M = Builder.CreateMemCpy( - SI->getPointerOperand(), SI->getAlign(), - LI->getPointerOperand(), LI->getAlign(), Size); + M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(), + LI->getPointerOperand(), LI->getAlign(), Size); M->copyMetadata(*SI, LLVMContext::MD_DIAssignID); - LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " - << *M << "\n"); + LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M + << "\n"); auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)); @@ -755,7 +758,8 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, } bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (!SI->isSimple()) return false; + if (!SI->isSimple()) + 
return false; // Avoid merging nontemporal stores since the resulting // memcpy/memset would not be able to preserve the nontemporal hint. @@ -766,7 +770,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (SI->getMetadata(LLVMContext::MD_nontemporal)) return false; - const DataLayout &DL = SI->getModule()->getDataLayout(); + const DataLayout &DL = SI->getDataLayout(); Value *StoredVal = SI->getValueOperand(); @@ -794,8 +798,8 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // 0xA0A0A0A0 and 0.0. auto *V = SI->getOperand(0); if (Value *ByteVal = isBytewiseValue(V, DL)) { - if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), - ByteVal)) { + if (Instruction *I = + tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } @@ -816,8 +820,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // The newly inserted memset is immediately overwritten by the original // store, so we do not need to rename uses. auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI)); - auto *NewAccess = MSSAU->createMemoryAccessBefore( - M, nullptr, StoreDef); + auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef); MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false); eraseInstruction(SI); @@ -836,8 +839,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { // See if there is another memset or store neighboring this memset which // allows us to widen out the memset to do a single larger store. if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) - if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), - MSI->getValue())) { + if (Instruction *I = + tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } @@ -850,7 +853,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore, Value *cpyDest, Value *cpySrc, TypeSize cpySize, - Align cpyDestAlign, BatchAAResults &BAA, + Align cpyDestAlign, + BatchAAResults &BAA, std::function<CallInst *()> GetC) { // The general transformation to keep in mind is // @@ -879,7 +883,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (!srcArraySize) return false; - const DataLayout &DL = cpyLoad->getModule()->getDataLayout(); + const DataLayout &DL = cpyLoad->getDataLayout(); TypeSize SrcAllocaSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()); // We can't optimize scalable types. if (SrcAllocaSize.isScalable()) @@ -898,15 +902,15 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) return false; - if (C->getParent() != cpyStore->getParent()) { LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n"); return false; } - MemoryLocation DestLoc = isa<StoreInst>(cpyStore) ? - MemoryLocation::get(cpyStore) : - MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore)); + MemoryLocation DestLoc = + isa<StoreInst>(cpyStore) + ? MemoryLocation::get(cpyStore) + : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore)); // Check that nothing touches the dest of the copy between // the call and the store/memcpy. 
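performCallSlotOptzn, whose legality checks continue in the hunks below, implements the classic "call slot" rewrite: when a call fills a local temporary and that temporary is then copied into the real destination, the call can write to the destination directly. A C-level before/after sketch under simplified assumptions (fill() is a hypothetical callee; the real pass also proves size, alignment, capture, and no intervening writes):

  #include <cassert>
  #include <cstring>

  // Hypothetical callee that fills a 16-byte buffer.
  static void fill(char *Out) { std::memset(Out, 'x', 16); }

  // Before: the pattern MemCpyOpt looks for -- a call into a temporary, then a copy.
  static void beforeOpt(char *Dest) {
    char Tmp[16];
    fill(Tmp);                     // the call C, writing cpySrc
    std::memcpy(Dest, Tmp, 16);    // cpyStore into cpyDest
  }

  // After: the call writes into Dest directly; Tmp and the memcpy disappear.
  static void afterOpt(char *Dest) {
    fill(Dest);
  }

  int main() {
    char A[16], B[16];
    beforeOpt(A);
    afterOpt(B);
    assert(std::memcmp(A, B, 16) == 0);
  }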
@@ -980,10 +984,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, append_range(srcUseList, U->users()); continue; } - if (const auto *G = dyn_cast<GetElementPtrInst>(U)) { - if (!G->hasAllZeroIndices()) - return false; - + if (const auto *G = dyn_cast<GetElementPtrInst>(U); + G && G->hasAllZeroIndices()) { append_range(srcUseList, U->users()); continue; } @@ -991,8 +993,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (IT->isLifetimeStartOrEnd()) continue; - if (U != C && U != cpyLoad) + if (U != C && U != cpyLoad) { + LLVM_DEBUG(dbgs() << "Call slot: Source accessed by " << *U << "\n"); return false; + } } // Check whether src is captured by the called function, in which case there @@ -1121,28 +1125,79 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, BatchAAResults &BAA) { - // We can only transforms memcpy's where the dest of one is the source of the - // other. - if (M->getSource() != MDep->getDest() || MDep->isVolatile()) - return false; - // If dep instruction is reading from our current input, then it is a noop - // transfer and substituting the input won't change this instruction. Just - // ignore the input and let someone else zap MDep. This handles cases like: + // transfer and substituting the input won't change this instruction. Just + // ignore the input and let someone else zap MDep. This handles cases like: // memcpy(a <- a) // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; - // Second, the length of the memcpy's must be the same, or the preceding one + // We can only optimize non-volatile memcpy's. + if (MDep->isVolatile()) + return false; + + int64_t MForwardOffset = 0; + const DataLayout &DL = M->getModule()->getDataLayout(); + // We can only transforms memcpy's where the dest of one is the source of the + // other, or they have an offset in a range. + if (M->getSource() != MDep->getDest()) { + std::optional<int64_t> Offset = + M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL); + if (!Offset || *Offset < 0) + return false; + MForwardOffset = *Offset; + } + + // The length of the memcpy's must be the same, or the preceding one // must be larger than the following one. - if (MDep->getLength() != M->getLength()) { + if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) { auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); auto *MLen = dyn_cast<ConstantInt>(M->getLength()); - if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) + if (!MDepLen || !MLen || + MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) return false; } + IRBuilder<> Builder(M); + auto *CopySource = MDep->getSource(); + Instruction *NewCopySource = nullptr; + auto CleanupOnRet = llvm::make_scope_exit([&NewCopySource] { + if (NewCopySource && NewCopySource->use_empty()) + // Safety: It's safe here because we will only allocate more instructions + // after finishing all BatchAA queries, but we have to be careful if we + // want to do something like this in another place. Then we'd probably + // have to delay instruction removal until all transforms on an + // instruction finished. + NewCopySource->eraseFromParent(); + }); + MaybeAlign CopySourceAlign = MDep->getSourceAlign(); + // We just need to calculate the actual size of the copy. 
+ auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize( + MemoryLocation::getForSource(M).Size); + + // When the forwarding offset is greater than 0, we transform + // memcpy(d1 <- s1) + // memcpy(d2 <- d1+o) + // to + // memcpy(d2 <- s1+o) + if (MForwardOffset > 0) { + // The copy destination of `M` maybe can serve as the source of copying. + std::optional<int64_t> MDestOffset = + M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL); + if (MDestOffset == MForwardOffset) + CopySource = M->getDest(); + else { + CopySource = Builder.CreateInBoundsPtrAdd( + CopySource, Builder.getInt64(MForwardOffset)); + NewCopySource = dyn_cast<Instruction>(CopySource); + } + // We need to update `MCopyLoc` if an offset exists. + MCopyLoc = MCopyLoc.getWithNewPtr(CopySource); + if (CopySourceAlign) + CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset); + } + // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: // memcpy(a <- b) @@ -1152,12 +1207,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // // TODO: If the code between M and MDep is transparent to the destination "c", // then we could still perform the xform by moving M up to the first memcpy. - // TODO: It would be sufficient to check the MDep source up to the memcpy - // size of M, rather than MDep. - if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep), - MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) + if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep), + MSSA->getMemoryAccess(M))) return false; + // No need to create `memcpy(a <- a)`. + if (BAA.isMustAlias(M->getDest(), CopySource)) { + // Remove the instruction we're replacing. + eraseInstruction(M); + ++NumMemCpyInstr; + return true; + } + // If the dest of the second might alias the source of the first, then the // source and dest might overlap. In addition, if the source of the first // points to constant memory, they won't overlap by definition. Otherwise, we @@ -1175,27 +1236,27 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // If all checks passed, then we can transform M. LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n" - << *MDep << '\n' << *M << '\n'); + << *MDep << '\n' + << *M << '\n'); // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. - IRBuilder<> Builder(M); Instruction *NewM; if (UseMemMove) - NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), - MDep->getRawSource(), MDep->getSourceAlign(), - M->getLength(), M->isVolatile()); + NewM = + Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource, + CopySourceAlign, M->getLength(), M->isVolatile()); else if (isa<MemCpyInlineInst>(M)) { // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is // never allowed since that would allow the latter to be lowered as a call // to an external function. 
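The new MForwardOffset logic above extends memcpy-to-memcpy forwarding to the case where the second copy reads from an interior offset of the first copy's destination: memcpy(d1 <- s1); memcpy(d2 <- d1+o) becomes memcpy(d2 <- s1+o), provided the first copy covers o plus the second length and the source is not written in between. A standalone byte-level check of that equivalence (plain non-overlapping arrays, fixed offset 16):

  #include <cassert>
  #include <cstring>

  int main() {
    char S1[64], D1[64], D2[16], D2Fwd[16];
    for (int I = 0; I < 64; ++I)
      S1[I] = char(I);

    // Original sequence: the second memcpy reads from D1 at offset 16.
    std::memcpy(D1, S1, 64);          // memcpy(d1 <- s1)
    std::memcpy(D2, D1 + 16, 16);     // memcpy(d2 <- d1 + 16)

    // Forwarded form: read S1 directly. Requires 16 (offset) + 16 (length)
    // <= 64 (first copy's length) and no write to S1 between the two copies.
    std::memcpy(D2Fwd, S1 + 16, 16);  // memcpy(d2 <- s1 + 16)

    assert(std::memcmp(D2, D2Fwd, 16) == 0);
  }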
- NewM = Builder.CreateMemCpyInline( - M->getRawDest(), M->getDestAlign(), MDep->getRawSource(), - MDep->getSourceAlign(), M->getLength(), M->isVolatile()); + NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), + CopySource, CopySourceAlign, + M->getLength(), M->isVolatile()); } else - NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), - MDep->getRawSource(), MDep->getSourceAlign(), - M->getLength(), M->isVolatile()); + NewM = + Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource, + CopySourceAlign, M->getLength(), M->isVolatile()); NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID); assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M))); @@ -1235,6 +1296,15 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, if (!BAA.isMustAlias(MemSet->getDest(), MemCpy->getDest())) return false; + // Don't perform the transform if src_size may be zero. In that case, the + // transform is essentially a complex no-op and may lead to an infinite + // loop if BasicAA is smart enough to understand that dst and dst + src_size + // are still MustAlias after the transform. + Value *SrcSize = MemCpy->getLength(); + if (!isKnownNonZero(SrcSize, + SimplifyQuery(MemCpy->getDataLayout(), DT, AC, MemCpy))) + return false; + // Check that src and dst of the memcpy aren't the same. While memcpy // operands cannot partially overlap, exact equality is allowed. if (isModSet(BAA.getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy)))) @@ -1251,7 +1321,6 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, // Use the same i8* dest as the memcpy, killing the memset dest if different. Value *Dest = MemCpy->getRawDest(); Value *DestSize = MemSet->getLength(); - Value *SrcSize = MemCpy->getLength(); if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy)) return false; @@ -1307,8 +1376,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, // memcpy's defining access is the memset about to be removed. auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)); - auto *NewAccess = MSSAU->createMemoryAccessBefore( - NewMemSet, nullptr, LastDef); + auto *NewAccess = + MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef); MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true); eraseInstruction(MemSet); @@ -1338,7 +1407,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, // The size also doesn't matter, as an out-of-bounds access would be UB. if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) { if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) { - const DataLayout &DL = Alloca->getModule()->getDataLayout(); + const DataLayout &DL = Alloca->getDataLayout(); if (std::optional<TypeSize> AllocaSize = Alloca->getAllocationSize(DL)) if (*AllocaSize == LTSize->getValue()) @@ -1384,7 +1453,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return false; // A known memcpy size is also required. - auto *CCopySize = dyn_cast<ConstantInt>(CopySize); + auto *CCopySize = dyn_cast<ConstantInt>(CopySize); if (!CCopySize) return false; if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { @@ -1445,7 +1514,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, } // Check that copy is full with static size. 
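processMemSetMemCpyDependence, partially shown above, rewrites memset(dst, c, dst_size) followed by memcpy(dst, src, src_size) into the memcpy plus a memset of only the uncopied tail; the new isKnownNonZero check bails out when src_size may be zero, since the "tail" would then cover the whole buffer again and the pass could loop. A standalone sketch of the rewrite itself, with fixed sizes so the non-zero condition trivially holds (the real code also handles dst_size smaller than src_size):

  #include <cassert>
  #include <cstring>

  int main() {
    const int DstSize = 32, SrcSize = 12;   // SrcSize is known non-zero
    char Src[SrcSize];
    std::memset(Src, 'a', SrcSize);

    // Original: memset the whole destination, then overwrite a prefix.
    char D1[DstSize];
    std::memset(D1, 0, DstSize);
    std::memcpy(D1, Src, SrcSize);

    // Transformed: copy first, then memset only the bytes the copy didn't touch.
    char D2[DstSize];
    std::memcpy(D2, Src, SrcSize);
    std::memset(D2 + SrcSize, 0, DstSize - SrcSize);

    assert(std::memcmp(D1, D2, DstSize) == 0);
  }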
- const DataLayout &DL = DestAlloca->getModule()->getDataLayout(); + const DataLayout &DL = DestAlloca->getDataLayout(); std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL); if (!SrcSize || Size != *SrcSize) { LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n"); @@ -1640,7 +1709,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, static bool isZeroSize(Value *Size) { if (auto *I = dyn_cast<Instruction>(Size)) - if (auto *Res = simplifyInstruction(I, I->getModule()->getDataLayout())) + if (auto *Res = simplifyInstruction(I, I->getDataLayout())) Size = Res; // Treat undef/poison size like zero. if (auto *C = dyn_cast<Constant>(Size)) @@ -1655,7 +1724,8 @@ static bool isZeroSize(Value *Size) { /// altogether. bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { // We can only optimize non-volatile memcpy's. - if (M->isVolatile()) return false; + if (M->isVolatile()) + return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { @@ -1664,8 +1734,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { return true; } - // If the size is zero, remove the memcpy. This also prevents infinite loops - // in processMemSetMemCpyDependence, which is a no-op for zero-length memcpys. + // If the size is zero, remove the memcpy. if (isZeroSize(M->getLength())) { ++BBI; eraseInstruction(M); @@ -1681,7 +1750,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { if (auto *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer(), - M->getModule()->getDataLayout())) { + M->getDataLayout())) { IRBuilder<> Builder(M); Instruction *NewM = Builder.CreateMemSet( M->getRawDest(), ByteVal, M->getLength(), M->getDestAlign(), false); @@ -1796,11 +1865,10 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { << "\n"); // If not, then we know we can transform this. - Type *ArgTys[3] = { M->getRawDest()->getType(), - M->getRawSource()->getType(), - M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), - Intrinsic::memcpy, ArgTys)); + Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(), + M->getLength()->getType()}; + M->setCalledFunction( + Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys)); // For MemorySSA nothing really changes (except that memcpy may imply stricter // aliasing guarantees). @@ -1811,7 +1879,7 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { /// This is called on every byval argument in call sites. bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { - const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout(); + const DataLayout &DL = CB.getDataLayout(); // Find out what feeds this byval argument. Value *ByValArg = CB.getArgOperand(ArgNo); Type *ByValTy = CB.getParamByValType(ArgNo); @@ -1843,7 +1911,8 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { // Get the alignment of the byval. If the call doesn't specify the alignment, // then it is some target specific value that we can't know. MaybeAlign ByValAlign = CB.getParamAlign(ArgNo); - if (!ByValAlign) return false; + if (!ByValAlign) + return false; // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
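For context on the processMemCpy lines above (only the DataLayout accessor changes, but the surrounding fold is visible): a memcpy whose source is a constant global with a bytewise-uniform initializer is turned into a memset via isBytewiseValue. A tiny illustration outside LLVM, using a hypothetical constant buffer rather than an IR global:

  #include <cassert>
  #include <cstring>

  // Constant "global" whose initializer is the single repeated byte 0x2a --
  // the situation isBytewiseValue() detects on the initializer.
  static const unsigned char Pattern[16] = {0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a,
                                            0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a,
                                            0x2a, 0x2a, 0x2a, 0x2a};

  int main() {
    unsigned char A[16], B[16];
    std::memcpy(A, Pattern, sizeof(Pattern));   // original memcpy from the constant
    std::memset(B, 0x2a, sizeof(B));            // what MemCpyOpt rewrites it to
    assert(std::memcmp(A, B, sizeof(A)) == 0);
  }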
@@ -1897,7 +1966,7 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) && CB.paramHasAttr(ArgNo, Attribute::NoCapture))) return false; - const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout(); + const DataLayout &DL = CB.getDataLayout(); Value *ImmutArg = CB.getArgOperand(ArgNo); // 2. Check that arg is alloca @@ -1987,7 +2056,7 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) { continue; for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { - // Avoid invalidating the iterator. + // Avoid invalidating the iterator. Instruction *I = &*BI++; bool RepeatInstruction = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 1e0906717549..4291f3aee0cd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -74,7 +74,7 @@ namespace { struct BCEAtom { BCEAtom() = default; BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset) - : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {} + : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(std::move(Offset)) {} BCEAtom(const BCEAtom &) = delete; BCEAtom &operator=(const BCEAtom &) = delete; @@ -151,7 +151,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n"); return {}; } - const auto &DL = LoadI->getModule()->getDataLayout(); + const auto &DL = LoadI->getDataLayout(); if (!isDereferenceablePointer(Addr, LoadI->getType(), DL)) { LLVM_DEBUG(dbgs() << "not dereferenceable\n"); // We need to make sure that we can do comparison in any order, so we @@ -325,7 +325,7 @@ std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI, auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId); if (!Rhs.BaseId) return std::nullopt; - const auto &DL = CmpI->getModule()->getDataLayout(); + const auto &DL = CmpI->getDataLayout(); return BCECmp(std::move(Lhs), std::move(Rhs), DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()), CmpI); } @@ -658,7 +658,7 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, unsigned IntBits = TLI.getIntSize(); // Create memcmp() == 0. 
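For context on the MergeICmps hunks above (BCEAtom now takes its APInt offset by value and moves it): the pass merges chains of equality comparisons over adjacent memory into a single memcmp, which is why each atom records a base pointer plus a constant offset. A source-level picture of the transformation, assuming a padding-free struct so the byte comparison is equivalent:

  #include <cassert>
  #include <cstring>

  // Two adjacent 4-byte fields with no padding: the comparisons load from the
  // same base at offsets 0 and 4, which the pass can cover with one 8-byte memcmp.
  struct Pair {
    int A;
    int B;
  };
  static_assert(sizeof(Pair) == 8, "no padding assumed");

  static bool eqChained(const Pair &X, const Pair &Y) {
    return X.A == Y.A && X.B == Y.B;               // the chain of BCE comparisons
  }

  static bool eqMemcmp(const Pair &X, const Pair &Y) {
    return std::memcmp(&X, &Y, sizeof(Pair)) == 0; // what the pass emits instead
  }

  int main() {
    Pair P{1, 2}, Q{1, 2}, R{1, 3};
    assert(eqChained(P, Q) == eqMemcmp(P, Q));
    assert(eqChained(P, R) == eqMemcmp(P, R));
  }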
- const auto &DL = Phi.getModule()->getDataLayout(); + const auto &DL = Phi.getDataLayout(); Value *const MemCmpCall = emitMemCmp( Lhs, Rhs, ConstantInt::get(Builder.getIntNTy(SizeTBits), TotalSizeBits / 8), diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index d65054a6ff9d..299239fb7020 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -199,7 +199,7 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, CastInst::isBitOrNoopPointerCastable( Store0->getValueOperand()->getType(), Store1->getValueOperand()->getType(), - Store0->getModule()->getDataLayout())) + Store0->getDataLayout())) return Store1; } return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 7fe1a222021e..c00c71fcb0b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -205,7 +205,7 @@ bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_, SE = SE_; TLI = TLI_; TTI = TTI_; - DL = &F.getParent()->getDataLayout(); + DL = &F.getDataLayout(); bool Changed = false, ChangedInThisIteration; do { @@ -511,14 +511,15 @@ Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr, Instruction *NewI = nullptr; switch (I->getOpcode()) { case Instruction::Add: - NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I->getIterator()); break; case Instruction::Mul: - NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); + NewI = BinaryOperator::CreateMul(LHS, RHS, "", I->getIterator()); break; default: llvm_unreachable("Unexpected instruction."); } + NewI->setDebugLoc(I->getDebugLoc()); NewI->takeName(I); return NewI; } @@ -564,14 +565,24 @@ NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr, // optimization makes the algorithm O(n). while (!Candidates.empty()) { // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's - // removed - // during rewriting. - if (Value *Candidate = Candidates.back()) { + // removed during rewriting. + if (Value *Candidate = Candidates.pop_back_val()) { Instruction *CandidateInstruction = cast<Instruction>(Candidate); - if (DT->dominates(CandidateInstruction, Dominatee)) - return CandidateInstruction; + if (!DT->dominates(CandidateInstruction, Dominatee)) + continue; + + // Make sure that the instruction is safe to reuse without introducing + // poison. + SmallVector<Instruction *> DropPoisonGeneratingInsts; + if (!SE->canReuseInstruction(CandidateExpr, CandidateInstruction, + DropPoisonGeneratingInsts)) + continue; + + for (Instruction *I : DropPoisonGeneratingInsts) + I->dropPoisonGeneratingAnnotations(); + + return CandidateInstruction; } - Candidates.pop_back(); } return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp index 19ac9526b5f8..fc0b31c43396 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -529,7 +529,11 @@ class NewGVN { // IR. 
SmallPtrSet<const Instruction *, 8> PHINodeUses; - DenseMap<const Value *, bool> OpSafeForPHIOfOps; + // The cached results, in general, are only valid for the specific block where + // they were computed. The unsigned part of the key is a unique block + // identifier + DenseMap<std::pair<const Value *, unsigned>, bool> OpSafeForPHIOfOps; + unsigned CacheIdx; // Map a temporary instruction we created to a parent block. DenseMap<const Value *, BasicBlock *> TempToBlock; @@ -892,7 +896,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. - int64_t StartingVNCounter = 0; + DebugCounter::CounterState StartingVNCounter; }; } // end anonymous namespace @@ -1199,7 +1203,7 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { } else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) { Value *V = simplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), ArrayRef(std::next(E->op_begin()), E->op_end()), - GEPI->isInBounds(), Q); + GEPI->getNoWrapFlags(), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -2525,18 +2529,14 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { BasicBlock *TargetBlock = Case.getCaseSuccessor(); updateReachableEdge(B, TargetBlock); } else { - for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { - BasicBlock *TargetBlock = SI->getSuccessor(i); + for (BasicBlock *TargetBlock : successors(SI->getParent())) updateReachableEdge(B, TargetBlock); - } } } else { // Otherwise this is either unconditional, or a type we have no // idea about. Just mark successors as reachable. - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *TargetBlock = TI->getSuccessor(i); + for (BasicBlock *TargetBlock : successors(TI->getParent())) updateReachableEdge(B, TargetBlock); - } // This also may be a memory defining terminator, in which case, set it // equivalent only to itself. @@ -2600,19 +2600,19 @@ bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock, if (!isa<Instruction>(I)) continue; - auto OISIt = OpSafeForPHIOfOps.find(I); + auto OISIt = OpSafeForPHIOfOps.find({I, CacheIdx}); if (OISIt != OpSafeForPHIOfOps.end()) return OISIt->second; // Keep walking until we either dominate the phi block, or hit a phi, or run // out of things to check. if (DT->properlyDominates(getBlockForValue(I), PHIBlock)) { - OpSafeForPHIOfOps.insert({I, true}); + OpSafeForPHIOfOps.insert({{I, CacheIdx}, true}); continue; } // PHI in the same block. if (isa<PHINode>(I) && getBlockForValue(I) == PHIBlock) { - OpSafeForPHIOfOps.insert({I, false}); + OpSafeForPHIOfOps.insert({{I, CacheIdx}, false}); return false; } @@ -2631,10 +2631,10 @@ bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock, if (!isa<Instruction>(Op)) continue; // Stop now if we find an unsafe operand. 
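The NewGVN change above makes the OpSafeForPHIOfOps cache block-sensitive: entries are keyed on (value, block index), with CacheIdx taken from the RPO number of the block being processed, because an answer computed for one PHI block is not generally valid for another. A minimal standalone model of such a two-part cache key (std::map standing in for DenseMap; computeSafety is a made-up stand-in for the real dominance check):

  #include <cassert>
  #include <map>
  #include <string>
  #include <utility>

  // Cache keyed on (value, block id); the same value may be safe in one block
  // and unsafe in another, so a per-value cache would return stale answers.
  using Key = std::pair<const void *, unsigned>;
  static std::map<Key, bool> SafeCache;

  static bool computeSafety(const std::string &V, unsigned BlockIdx) {
    return (V.size() + BlockIdx) % 2 == 0;   // placeholder for the real analysis
  }

  static bool isSafeCached(const std::string &V, unsigned BlockIdx) {
    Key K{&V, BlockIdx};
    auto It = SafeCache.find(K);
    if (It != SafeCache.end())
      return It->second;
    bool Result = computeSafety(V, BlockIdx);
    SafeCache.insert({K, Result});
    return Result;
  }

  int main() {
    std::string Op = "op";
    // Same value, different blocks: two independent cache entries.
    bool InBlock0 = isSafeCached(Op, 0);
    bool InBlock1 = isSafeCached(Op, 1);
    assert(InBlock0 != InBlock1);
    assert(SafeCache.size() == 2);
  }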
- auto OISIt = OpSafeForPHIOfOps.find(OrigI); + auto OISIt = OpSafeForPHIOfOps.find({OrigI, CacheIdx}); if (OISIt != OpSafeForPHIOfOps.end()) { if (!OISIt->second) { - OpSafeForPHIOfOps.insert({I, false}); + OpSafeForPHIOfOps.insert({{I, CacheIdx}, false}); return false; } continue; @@ -2644,7 +2644,7 @@ bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock, Worklist.push_back(cast<Instruction>(Op)); } } - OpSafeForPHIOfOps.insert({V, true}); + OpSafeForPHIOfOps.insert({{V, CacheIdx}, true}); return true; } @@ -3278,7 +3278,7 @@ void NewGVN::verifyIterationSettled(Function &F) { #ifndef NDEBUG LLVM_DEBUG(dbgs() << "Beginning iteration verification\n"); if (DebugCounter::isCounterSet(VNCounter)) - DebugCounter::setCounterValue(VNCounter, StartingVNCounter); + DebugCounter::setCounterState(VNCounter, StartingVNCounter); // Note that we have to store the actual classes, as we may change existing // classes during iteration. This is because our memory iteration propagation @@ -3297,6 +3297,7 @@ void NewGVN::verifyIterationSettled(Function &F) { TouchedInstructions.set(); TouchedInstructions.reset(0); OpSafeForPHIOfOps.clear(); + CacheIdx = 0; iterateTouchedInstructions(); DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>> EqualClasses; @@ -3400,6 +3401,8 @@ void NewGVN::iterateTouchedInstructions() { << " because it is unreachable\n"); continue; } + // Use the appropriate cache for "OpIsSafeForPHIOfOps". + CacheIdx = RPOOrdering.lookup(DT->getNode(CurrBlock)) - 1; updateProcessedCount(CurrBlock); } // Reset after processing (because we may mark ourselves as touched when @@ -3423,7 +3426,7 @@ void NewGVN::iterateTouchedInstructions() { // This is the main transformation entry point. bool NewGVN::runGVN() { if (DebugCounter::isCounterSet(VNCounter)) - StartingVNCounter = DebugCounter::getCounterValue(VNCounter); + StartingVNCounter = DebugCounter::getCounterState(VNCounter); bool Changed = false; NumFuncArgs = F.arg_size(); MSSAWalker = MSSA->getWalker(); @@ -3479,6 +3482,8 @@ bool NewGVN::runGVN() { LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock()) << " marked reachable\n"); ReachableBlocks.insert(&F.getEntryBlock()); + // Use index corresponding to entry block. + CacheIdx = 0; iterateTouchedInstructions(); verifyMemoryCongruency(); @@ -3721,7 +3726,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { new StoreInst( PoisonValue::get(Int8Ty), Constant::getNullValue(PointerType::getUnqual(BB->getContext())), - BB->getTerminator()); + BB->getTerminator()->getIterator()); } void NewGVN::markInstructionForDeletion(Instruction *I) { @@ -4019,7 +4024,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // dominated defs as dead. if (Def) { // For anything in this case, what and how we value number - // guarantees that any side-effets that would have occurred (ie + // guarantees that any side-effects that would have occurred (ie // throwing, etc) can be proven to either still occur (because it's // dominated by something that has the same side-effects), or never // occur. 
Otherwise, we would not have been able to prove it value @@ -4237,7 +4242,7 @@ PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) { auto &AA = AM.getResult<AAManager>(F); auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); bool Changed = - NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout()) + NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getDataLayout()) .runGVN(); if (!Changed) return PreservedAnalyses::all(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 0266eb1a9f50..77d67a2ce0f3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -60,6 +60,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Statepoint.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -190,7 +191,7 @@ static bool enableBackedgeSafepoints(Function &F); static bool enableCallSafepoints(Function &F); static void -InsertSafepointPoll(Instruction *InsertBefore, +InsertSafepointPoll(BasicBlock::iterator InsertBefore, std::vector<CallBase *> &ParsePointsNeeded /*rval*/, const TargetLibraryInfo &TLI); @@ -288,6 +289,8 @@ bool PlaceSafepointsPass::runImpl(Function &F, const TargetLibraryInfo &TLI) { // with for the moment. legacy::FunctionPassManager FPM(F.getParent()); bool CanAssumeCallSafepoints = enableCallSafepoints(F); + + FPM.add(new TargetLibraryInfoWrapperPass(TLI)); auto *PBS = new PlaceBackedgeSafepointsLegacyPass(CanAssumeCallSafepoints); FPM.add(PBS); FPM.run(F); @@ -308,8 +311,7 @@ bool PlaceSafepointsPass::runImpl(Function &F, const TargetLibraryInfo &TLI) { // We can sometimes end up with duplicate poll locations. This happens if // a single loop is visited more than once. The fact this happens seems // wrong, but it does happen for the split-backedge.ll test case. - PollLocations.erase(std::unique(PollLocations.begin(), PollLocations.end()), - PollLocations.end()); + PollLocations.erase(llvm::unique(PollLocations), PollLocations.end()); // Insert a poll at each point the analysis pass identified // The poll location must be the terminator of a loop latch block. @@ -368,7 +370,7 @@ bool PlaceSafepointsPass::runImpl(Function &F, const TargetLibraryInfo &TLI) { // safepoint polls themselves. for (Instruction *PollLocation : PollsNeeded) { std::vector<CallBase *> RuntimeCalls; - InsertSafepointPoll(PollLocation, RuntimeCalls, TLI); + InsertSafepointPoll(PollLocation->getIterator(), RuntimeCalls, TLI); llvm::append_range(ParsePointNeeded, RuntimeCalls); } @@ -517,7 +519,7 @@ static bool doesNotRequireEntrySafepointBefore(CallBase *Call) { switch (II->getIntrinsicID()) { case Intrinsic::experimental_gc_statepoint: case Intrinsic::experimental_patchpoint_void: - case Intrinsic::experimental_patchpoint_i64: + case Intrinsic::experimental_patchpoint: // The can wrap an actual call which may grow the stack by an unbounded // amount or run forever. 
return false; @@ -591,7 +593,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F, const char GCSafepointPollName[] = "gc.safepoint_poll"; static bool isGCSafepointPoll(Function &F) { - return F.getName().equals(GCSafepointPollName); + return F.getName() == GCSafepointPollName; } /// Returns true if this function should be rewritten to include safepoint @@ -619,7 +621,7 @@ static bool enableCallSafepoints(Function &F) { return !NoCall; } // not handle the parsability of state at the runtime call, that's the // callers job. static void -InsertSafepointPoll(Instruction *InsertBefore, +InsertSafepointPoll(BasicBlock::iterator InsertBefore, std::vector<CallBase *> &ParsePointsNeeded /*rval*/, const TargetLibraryInfo &TLI) { BasicBlock *OrigBB = InsertBefore->getParent(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index 818c7b40d489..e742d2ed12af 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -246,7 +246,8 @@ void ReassociatePass::canonicalizeOperands(Instruction *I) { } static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, - Instruction *InsertBefore, Value *FlagsOp) { + BasicBlock::iterator InsertBefore, + Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore); else { @@ -258,7 +259,8 @@ static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, } static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, - Instruction *InsertBefore, Value *FlagsOp) { + BasicBlock::iterator InsertBefore, + Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore); else { @@ -270,7 +272,8 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, } static Instruction *CreateNeg(Value *S1, const Twine &Name, - Instruction *InsertBefore, Value *FlagsOp) { + BasicBlock::iterator InsertBefore, + Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateNeg(S1, Name, InsertBefore); @@ -290,7 +293,8 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Constant *NegOne = Ty->isIntOrIntVectorTy() ? ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0); - BinaryOperator *Res = CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg, Neg); + BinaryOperator *Res = + CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg->getIterator(), Neg); Neg->setOperand(OpNo, Constant::getNullValue(Ty)); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); @@ -298,98 +302,7 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { return Res; } -/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael -/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for -/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. -/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every -/// even x in Bitwidth-bit arithmetic. -static unsigned CarmichaelShift(unsigned Bitwidth) { - if (Bitwidth < 3) - return Bitwidth - 1; - return Bitwidth - 2; -} - -/// Add the extra weight 'RHS' to the existing weight 'LHS', -/// reducing the combined weight using any special properties of the operation. -/// The existing weight LHS represents the computation X op X op ... op X where -/// X occurs LHS times. 
The combined weight represents X op X op ... op X with -/// X occurring LHS + RHS times. If op is "Xor" for example then the combined -/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; -/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. -static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { - // If we were working with infinite precision arithmetic then the combined - // weight would be LHS + RHS. But we are using finite precision arithmetic, - // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct - // for nilpotent operations and addition, but not for idempotent operations - // and multiplication), so it is important to correctly reduce the combined - // weight back into range if wrapping would be wrong. - - // If RHS is zero then the weight didn't change. - if (RHS.isMinValue()) - return; - // If LHS is zero then the combined weight is RHS. - if (LHS.isMinValue()) { - LHS = RHS; - return; - } - // From this point on we know that neither LHS nor RHS is zero. - - if (Instruction::isIdempotent(Opcode)) { - // Idempotent means X op X === X, so any non-zero weight is equivalent to a - // weight of 1. Keeping weights at zero or one also means that wrapping is - // not a problem. - assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - return; // Return a weight of 1. - } - if (Instruction::isNilpotent(Opcode)) { - // Nilpotent means X op X === 0, so reduce weights modulo 2. - assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - LHS = 0; // 1 + 1 === 0 modulo 2. - return; - } - if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) { - // TODO: Reduce the weight by exploiting nsw/nuw? - LHS += RHS; - return; - } - - assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) && - "Unknown associative operation!"); - unsigned Bitwidth = LHS.getBitWidth(); - // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth - // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth - // bit number x, since either x is odd in which case x^CM = 1, or x is even in - // which case both x^W and x^(W - CM) are zero. By subtracting off multiples - // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) - // which by a happy accident means that they can always be represented using - // Bitwidth bits. - // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than - // the Carmichael number). - if (Bitwidth > 3) { - /// CM - The value of Carmichael's lambda function. - APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); - // Any weight W >= Threshold can be replaced with W - CM. - APInt Threshold = CM + Bitwidth; - assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); - // For Bitwidth 4 or more the following sum does not overflow. - LHS += RHS; - while (LHS.uge(Threshold)) - LHS -= CM; - } else { - // To avoid problems with overflow do everything the same as above but using - // a larger type. 
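The comment block being deleted above rests on a fact about the Carmichael function: lambda(2^Bitwidth) = 2^(Bitwidth-2) for Bitwidth >= 3, so x^(2^(Bitwidth-2)) === 1 (mod 2^Bitwidth) for every odd x, which is what allowed weights to be reduced by multiples of CM. A small standalone check of that identity for 8-bit arithmetic (illustration only, not part of the patch; the replacement code below simply keeps plain 64-bit weights and asserts that additions do not overflow):

#include <cstdint>
#include <cstdio>

int main() {
  // lambda(2^8) = 2^6 = 64, so x^64 should be 1 (mod 256) for every odd x.
  for (unsigned X = 1; X < 256; X += 2) {
    uint8_t P = 1;
    for (int I = 0; I < 64; ++I)
      P = static_cast<uint8_t>(P * X); // wraps mod 256, like an i8 multiply
    if (P != 1) {
      std::printf("counterexample: x = %u\n", X);
      return 1;
    }
  }
  std::printf("x^64 == 1 (mod 256) holds for every odd 8-bit x\n");
  return 0;
}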
- unsigned CM = 1U << CarmichaelShift(Bitwidth); - unsigned Threshold = CM + Bitwidth; - assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && - "Weights not reduced!"); - unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); - while (Total >= Threshold) - Total -= CM; - LHS = Total; - } -} - -using RepeatedValue = std::pair<Value*, APInt>; +using RepeatedValue = std::pair<Value *, uint64_t>; /// Given an associative binary expression, return the leaf /// nodes in Ops along with their weights (how many times the leaf occurs). The @@ -467,11 +380,10 @@ using RepeatedValue = std::pair<Value*, APInt>; static bool LinearizeExprTree(Instruction *I, SmallVectorImpl<RepeatedValue> &Ops, ReassociatePass::OrderedSet &ToRedo, - bool &HasNUW) { + reassociate::OverflowTracking &Flags) { assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) && "Expected a UnaryOperator or BinaryOperator!"); LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); - unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); assert(I->isAssociative() && I->isCommutative() && "Expected an associative and commutative operation!"); @@ -486,8 +398,8 @@ static bool LinearizeExprTree(Instruction *I, // with their weights, representing a certain number of paths to the operator. // If an operator occurs in the worklist multiple times then we found multiple // ways to get to it. - SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight) - Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); + SmallVector<std::pair<Instruction *, uint64_t>, 8> Worklist; // (Op, Weight) + Worklist.push_back(std::make_pair(I, 1)); bool Changed = false; // Leaves of the expression are values that either aren't the right kind of @@ -505,23 +417,25 @@ static bool LinearizeExprTree(Instruction *I, // Leaves - Keeps track of the set of putative leaves as well as the number of // paths to each leaf seen so far. - using LeafMap = DenseMap<Value *, APInt>; + using LeafMap = DenseMap<Value *, uint64_t>; LeafMap Leaves; // Leaf -> Total weight so far. SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. + const DataLayout DL = I->getDataLayout(); #ifndef NDEBUG SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme. #endif while (!Worklist.empty()) { - std::pair<Instruction*, APInt> P = Worklist.pop_back_val(); - I = P.first; // We examine the operands of this binary operator. + // We examine the operands of this binary operator. + auto [I, Weight] = Worklist.pop_back_val(); - if (isa<OverflowingBinaryOperator>(I)) - HasNUW &= I->hasNoUnsignedWrap(); + if (isa<OverflowingBinaryOperator>(I)) { + Flags.HasNUW &= I->hasNoUnsignedWrap(); + Flags.HasNSW &= I->hasNoSignedWrap(); + } for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands. Value *Op = I->getOperand(OpIdx); - APInt Weight = P.second; // Number of paths to this operand. LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); assert(!Op->use_empty() && "No uses, so how did we get to it?!"); @@ -555,26 +469,8 @@ static bool LinearizeExprTree(Instruction *I, "In leaf map but not visited!"); // Update the number of paths to the leaf. - IncorporateWeight(It->second, Weight, Opcode); - -#if 0 // TODO: Re-enable once PR13021 is fixed. - // The leaf already has one use from inside the expression. As we want - // exactly one such use, drop this new use of the leaf. 
- assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); - I->setOperand(OpIdx, UndefValue::get(I->getType())); - Changed = true; - - // If the leaf is a binary operation of the right kind and we now see - // that its multiple original uses were in fact all by nodes belonging - // to the expression, then no longer consider it to be a leaf and add - // its operands to the expression. - if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { - LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); - Worklist.push_back(std::make_pair(BO, It->second)); - Leaves.erase(It); - continue; - } -#endif + It->second += Weight; + assert(It->second >= Weight && "Weight overflows"); // If we still have uses that are not accounted for by the expression // then it is not safe to modify the value. @@ -637,13 +533,22 @@ static bool LinearizeExprTree(Instruction *I, // Node initially thought to be a leaf wasn't. continue; assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); - APInt Weight = It->second; - if (Weight.isMinValue()) - // Leaf already output or weight reduction eliminated it. - continue; + uint64_t Weight = It->second; // Ensure the leaf is only output once. It->second = 0; Ops.push_back(std::make_pair(V, Weight)); + if (Opcode == Instruction::Add && Flags.AllKnownNonNegative && Flags.HasNSW) + Flags.AllKnownNonNegative &= isKnownNonNegative(V, SimplifyQuery(DL)); + else if (Opcode == Instruction::Mul) { + // To preserve NUW we need all inputs non-zero. + // To preserve NSW we need all inputs strictly positive. + if (Flags.AllKnownNonZero && + (Flags.HasNUW || (Flags.HasNSW && Flags.AllKnownNonNegative))) { + Flags.AllKnownNonZero &= isKnownNonZero(V, SimplifyQuery(DL)); + if (Flags.HasNSW && Flags.AllKnownNonNegative) + Flags.AllKnownNonNegative &= isKnownNonNegative(V, SimplifyQuery(DL)); + } + } } // For nilpotent operations or addition there may be no operands, for example @@ -652,7 +557,7 @@ static bool LinearizeExprTree(Instruction *I, if (Ops.empty()) { Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); assert(Identity && "Associative operation without identity!"); - Ops.emplace_back(Identity, APInt(Bitwidth, 1)); + Ops.emplace_back(Identity, 1); } return Changed; @@ -662,7 +567,7 @@ static bool LinearizeExprTree(Instruction *I, /// linearized and optimized, emit them in-order. void ReassociatePass::RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, - bool HasNUW) { + OverflowTracking Flags) { assert(Ops.size() > 1 && "Single values should be used directly!"); // Since our optimizations should never increase the number of operations, the @@ -691,8 +596,8 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, /// of leaf nodes as inner nodes cannot occur by remembering all of the future /// leaves and refusing to reuse any of them as inner nodes. SmallPtrSet<Value*, 8> NotRewritable; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - NotRewritable.insert(Ops[i].Op); + for (const ValueEntry &Op : Ops) + NotRewritable.insert(Op.Op); // ExpressionChangedStart - Non-null if the rewritten expression differs from // the original in some non-trivial way, requiring the clearing of optional @@ -792,9 +697,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // stupid, create a new node if there are none left. 
BinaryOperator *NewOp; if (NodesToRewrite.empty()) { - Constant *Undef = UndefValue::get(I->getType()); - NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), - Undef, Undef, "", I); + Constant *Poison = PoisonValue::get(I->getType()); + NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Poison, + Poison, "", I->getIterator()); if (isa<FPMathOperator>(NewOp)) NewOp->setFastMathFlags(I->getFastMathFlags()); } else { @@ -827,11 +732,14 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, ExpressionChangedStart->setFastMathFlags(Flags); } else { ExpressionChangedStart->clearSubclassOptionalData(); - // Note that it doesn't hold for mul if one of the operands is zero. - // TODO: We can preserve NUW flag if we prove that all mul operands - // are non-zero. - if (HasNUW && ExpressionChangedStart->getOpcode() == Instruction::Add) - ExpressionChangedStart->setHasNoUnsignedWrap(); + if (ExpressionChangedStart->getOpcode() == Instruction::Add || + (ExpressionChangedStart->getOpcode() == Instruction::Mul && + Flags.AllKnownNonZero)) { + if (Flags.HasNUW) + ExpressionChangedStart->setHasNoUnsignedWrap(); + if (Flags.HasNSW && (Flags.AllKnownNonNegative || Flags.HasNUW)) + ExpressionChangedStart->setHasNoSignedWrap(); + } } } @@ -854,8 +762,8 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, } // Throw away any left over nodes from the original expression. - for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) - RedoInsts.insert(NodesToRewrite[i]); + for (BinaryOperator *BO : NodesToRewrite) + RedoInsts.insert(BO); } /// Insert instructions before the instruction pointed to by BI, @@ -868,7 +776,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, static Value *NegateValue(Value *V, Instruction *BI, ReassociatePass::OrderedSet &ToRedo) { if (auto *C = dyn_cast<Constant>(V)) { - const DataLayout &DL = BI->getModule()->getDataLayout(); + const DataLayout &DL = BI->getDataLayout(); Constant *Res = C->getType()->isFPOrFPVectorTy() ? ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL) : ConstantExpr::getNeg(C); @@ -945,7 +853,13 @@ static Value *NegateValue(Value *V, Instruction *BI, ->getIterator(); } + // Check that if TheNeg is moved out of its parent block, we drop its + // debug location to avoid extra coverage. + // See test dropping_debugloc_the_neg.ll for a detailed example. + if (TheNeg->getParent() != InsertPt->getParent()) + TheNeg->dropLocation(); TheNeg->moveBefore(*InsertPt->getParent(), InsertPt); + if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); @@ -958,7 +872,8 @@ static Value *NegateValue(Value *V, Instruction *BI, // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); + Instruction *NewNeg = + CreateNeg(V, V->getName() + ".neg", BI->getIterator(), BI); ToRedo.insert(NewNeg); return NewNeg; } @@ -1044,8 +959,8 @@ static bool shouldConvertOrWithNoCommonBitsToAdd(Instruction *Or) { /// transform this into (X+Y) to allow arithmetics reassociation. static BinaryOperator *convertOrWithNoCommonBitsToAdd(Instruction *Or) { // Convert an or into an add. 
- BinaryOperator *New = - CreateAdd(Or->getOperand(0), Or->getOperand(1), "", Or, Or); + BinaryOperator *New = CreateAdd(Or->getOperand(0), Or->getOperand(1), "", + Or->getIterator(), Or); New->setHasNoSignedWrap(); New->setHasNoUnsignedWrap(); New->takeName(Or); @@ -1097,7 +1012,8 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub, // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); - BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); + BinaryOperator *New = + CreateAdd(Sub->getOperand(0), NegVal, "", Sub->getIterator(), Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); @@ -1115,10 +1031,11 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub, static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { Constant *MulCst = ConstantInt::get(Shl->getType(), 1); auto *SA = cast<ConstantInt>(Shl->getOperand(1)); - MulCst = ConstantExpr::getShl(MulCst, SA); + MulCst = ConstantFoldBinaryInstruction(Instruction::Shl, MulCst, SA); + assert(MulCst && "Constant folding of immediate constants failed"); - BinaryOperator *Mul = - BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); + BinaryOperator *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, + "", Shl->getIterator()); Shl->setOperand(0, PoisonValue::get(Shl->getType())); // Drop use of op. Mul->takeName(Shl); @@ -1168,13 +1085,13 @@ static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops, /// Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. -static Value *EmitAddTreeOfValues(Instruction *I, +static Value *EmitAddTreeOfValues(BasicBlock::iterator It, SmallVectorImpl<WeakTrackingVH> &Ops) { if (Ops.size() == 1) return Ops.back(); Value *V1 = Ops.pop_back_val(); - Value *V2 = EmitAddTreeOfValues(I, Ops); - return CreateAdd(V2, V1, "reass.add", I, I); + Value *V2 = EmitAddTreeOfValues(It, Ops); + return CreateAdd(V2, V1, "reass.add", It, &*It); } /// If V is an expression tree that is a multiplication sequence, @@ -1186,14 +1103,13 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; SmallVector<RepeatedValue, 8> Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, Flags); SmallVector<ValueEntry, 8> Factors; Factors.reserve(Tree.size()); for (unsigned i = 0, e = Tree.size(); i != e; ++i) { RepeatedValue E = Tree[i]; - Factors.append(E.second.getZExtValue(), - ValueEntry(getRank(E.first), E.first)); + Factors.append(E.second, ValueEntry(getRank(E.first), E.first)); } bool FoundFactor = false; @@ -1229,7 +1145,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { if (!FoundFactor) { // Make sure to restore the operands to the expression tree. 
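The nuw/nsw flags set on the new add just above are sound because of the identity behind convertOrWithNoCommonBitsToAdd: when the operands share no set bits there are no carries, so the or and the add produce the same value and the add cannot wrap. An exhaustive 8-bit check (standalone illustration, not code from the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      if ((A & B) == 0) {
        // Disjoint bits: the sum equals the bitwise or...
        assert((A | B) == A + B);
        // ...and stays below 256, so an 8-bit add would not wrap.
        assert(A + B <= 255u);
      }
  return 0;
}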
- RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); return nullptr; } @@ -1241,12 +1157,12 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { RedoInsts.insert(BO); V = Factors[0].Op; } else { - RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); V = BO; } if (NeedsNegate) - V = CreateNeg(V, "neg", &*InsertPt, BO); + V = CreateNeg(V, "neg", InsertPt, BO); return V; } @@ -1321,7 +1237,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will /// be returned. -static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, +static Value *createAndInstr(BasicBlock::iterator InsertBefore, Value *Opnd, const APInt &ConstOpnd) { if (ConstOpnd.isZero()) return nullptr; @@ -1342,7 +1258,7 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively; otherwise, false is returned, // and both "Res" and "ConstOpnd" remain unchanged. -bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, +bool ReassociatePass::CombineXorOpnd(BasicBlock::iterator It, XorOpnd *Opnd1, APInt &ConstOpnd, Value *&Res) { // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 // = ((x | c1) ^ c1) ^ (c1 ^ c2) @@ -1359,7 +1275,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, return false; Value *X = Opnd1->getSymbolicPart(); - Res = createAndInstr(I, X, ~C1); + Res = createAndInstr(It, X, ~C1); // ConstOpnd was C2, now C1 ^ C2. ConstOpnd ^= C1; @@ -1376,7 +1292,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // via "Res" and "ConstOpnd", respectively (If the entire expression is // evaluated to a constant, the Res is set to NULL); otherwise, false is // returned, and both "Res" and "ConstOpnd" remain unchanged. -bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, +bool ReassociatePass::CombineXorOpnd(BasicBlock::iterator It, XorOpnd *Opnd1, XorOpnd *Opnd2, APInt &ConstOpnd, Value *&Res) { Value *X = Opnd1->getSymbolicPart(); @@ -1411,7 +1327,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, return false; } - Res = createAndInstr(I, X, C3); + Res = createAndInstr(It, X, C3); ConstOpnd ^= C1; } else if (Opnd1->isOrExpr()) { // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2 @@ -1427,7 +1343,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, return false; } - Res = createAndInstr(I, X, C3); + Res = createAndInstr(It, X, C3); ConstOpnd ^= C3; } else { // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2)) @@ -1435,7 +1351,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, const APInt &C1 = Opnd1->getConstPart(); const APInt &C2 = Opnd2->getConstPart(); APInt C3 = C1 ^ C2; - Res = createAndInstr(I, X, C3); + Res = createAndInstr(It, X, C3); } // Put the original operands in the Redo list; hope they will be deleted @@ -1483,8 +1399,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, // the "OpndPtrs" as well. For the similar reason, do not fuse this loop // with the previous loop --- the iterator of the "Opnds" may be invalidated // when new elements are added to the vector. 
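CombineXorOpnd above justifies its rewrites with the Xor-Rules quoted in its comments; the least obvious is Rule 3, (x | c1) ^ (x | c2) = (x & c3) ^ c3 with c3 = c1 ^ c2. An exhaustive 8-bit check of that rule (standalone illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned C1 = 0; C1 < 256; ++C1)
    for (unsigned C2 = 0; C2 < 256; ++C2)
      for (unsigned X = 0; X < 256; ++X) {
        uint8_t C3 = uint8_t(C1 ^ C2);
        uint8_t LHS = uint8_t((X | C1) ^ (X | C2));
        uint8_t RHS = uint8_t((X & C3) ^ C3);
        assert(LHS == RHS && "Xor-Rule 3 failed");
      }
  return 0;
}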
- for (unsigned i = 0, e = Opnds.size(); i != e; ++i) - OpndPtrs.push_back(&Opnds[i]); + for (XorOpnd &Op : Opnds) + OpndPtrs.push_back(&Op); // Step 2: Sort the Xor-Operands in a way such that the operands containing // the same symbolic value cluster together. For instance, the input operand @@ -1512,7 +1428,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" - if (!ConstOpnd.isZero() && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + if (!ConstOpnd.isZero() && + CombineXorOpnd(I->getIterator(), CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); @@ -1529,7 +1446,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, // step 3.2: When previous and current operands share the same symbolic // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" - if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) { + if (CombineXorOpnd(I->getIterator(), CurrOpnd, PrevOpnd, ConstOpnd, CV)) { // Remove previous operand PrevOpnd->Invalidate(); if (CV) { @@ -1600,7 +1517,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, Type *Ty = TheOp->getType(); Constant *C = Ty->isIntOrIntVectorTy() ? ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound); - Instruction *Mul = CreateMul(TheOp, C, "factor", I, I); + Instruction *Mul = CreateMul(TheOp, C, "factor", I->getIterator(), I); // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: @@ -1764,7 +1681,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, DummyInst->deleteValue(); unsigned NumAddedValues = NewMulOps.size(); - Value *V = EmitAddTreeOfValues(I, NewMulOps); + Value *V = EmitAddTreeOfValues(I->getIterator(), NewMulOps); // Now that we have inserted the add tree, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: @@ -1775,7 +1692,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, RedoInsts.insert(VI); // Create the multiply. - Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I); + Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I->getIterator(), I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. @@ -1914,10 +1831,10 @@ ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder, } // Unique factors with equal powers -- we've folded them into the first one's // base. - Factors.erase(std::unique(Factors.begin(), Factors.end(), - [](const Factor &LHS, const Factor &RHS) { - return LHS.Power == RHS.Power; - }), + Factors.erase(llvm::unique(Factors, + [](const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + }), Factors.end()); // Iteratively collect the base of each factor with an add power into the @@ -1974,7 +1891,7 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - const DataLayout &DL = I->getModule()->getDataLayout(); + const DataLayout &DL = I->getDataLayout(); Constant *Cst = nullptr; unsigned Opcode = I->getOpcode(); while (!Ops.empty()) { @@ -2071,8 +1988,8 @@ void ReassociatePass::EraseInst(Instruction *I) { I->eraseFromParent(); // Optimize its operands. SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. 
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) { + for (Value *V : Ops) + if (Instruction *Op = dyn_cast<Instruction>(V)) { // If this is a node in an expression tree, climb to the expression root // and add that since that's where optimization actually happens. unsigned Opcode = Op->getOpcode(); @@ -2270,7 +2187,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) { shouldConvertOrWithNoCommonBitsToAdd(I) && !isLoadCombineCandidate(I) && (cast<PossiblyDisjointInst>(I)->isDisjoint() || haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), - SimplifyQuery(I->getModule()->getDataLayout(), + SimplifyQuery(I->getDataLayout(), /*DT=*/nullptr, /*AC=*/nullptr, I)))) { Instruction *NI = convertOrWithNoCommonBitsToAdd(I); RedoInsts.insert(I); @@ -2366,12 +2283,12 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector<RepeatedValue, 8> Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, Flags); SmallVector<ValueEntry, 8> Ops; Ops.reserve(Tree.size()); for (const RepeatedValue &E : Tree) - Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); + Ops.append(E.second, ValueEntry(getRank(E.first), E.first)); LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); @@ -2560,7 +2477,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { dbgs() << '\n'); // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. - RewriteExprTree(I, Ops, HasNUW); + RewriteExprTree(I, Ops, Flags); } void diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 6c2b3e9bd4a7..ebc5075aa36f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -64,7 +64,7 @@ static bool runPass(Function &F) { CastInst *AllocaInsertionPoint = new BitCastInst( Constant::getNullValue(Type::getInt32Ty(F.getContext())), - Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I); + Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I); // Find the escaped instructions. But don't create stack slots for // allocas in entry block. 
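A change that recurs throughout this commit, visible in the Reassociate and Reg2Mem hunks above, is that insertion points are passed as a BasicBlock::iterator rather than a raw Instruction* insert-before pointer; callers spell this as I->getIterator(), or std::next(I->getIterator()) to insert after I, which also lets the position distinguish insertion before or after any debug records attached to the instruction. A minimal sketch of the calling pattern; the helper name storeAfter is made up for illustration and is not part of the patch:

#include "llvm/IR/Instructions.h"
#include <iterator>
using namespace llvm;

// Store V to Slot immediately after I, using the iterator-based insertion
// position that this commit standardizes on.
static StoreInst *storeAfter(Value *V, Value *Slot, Instruction *I) {
  return new StoreInst(V, Slot, std::next(I->getIterator()));
}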
@@ -76,7 +76,7 @@ static bool runPass(Function &F) { // Demote escaped instructions NumRegsDemoted += WorkList.size(); for (Instruction *I : WorkList) - DemoteRegToStack(*I, false, AllocaInsertionPoint); + DemoteRegToStack(*I, false, AllocaInsertionPoint->getIterator()); WorkList.clear(); @@ -88,7 +88,7 @@ static bool runPass(Function &F) { // Demote phi nodes NumPhisDemoted += WorkList.size(); for (Instruction *I : WorkList) - DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint); + DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint->getIterator()); return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 45ce3bf3ceae..2b99e28acb4e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1143,7 +1143,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache, assert(Base && "Can't be null"); // The cast is needed since base traversal may strip away bitcasts if (Base->getType() != Input->getType() && InsertPt) - Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt); + Base = new BitCastInst(Base, Input->getType(), "cast", + InsertPt->getIterator()); return Base; }; @@ -1251,7 +1252,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache, // get the data layout to compare the sizes of base/derived pointer values [[maybe_unused]] auto &DL = - cast<llvm::Instruction>(Def)->getModule()->getDataLayout(); + cast<llvm::Instruction>(Def)->getDataLayout(); // Cache all of our results so we can cheaply reuse them // NOTE: This is actually two caches: one of the base defining value // relation and one of the base pointer relation! FIXME @@ -1322,7 +1323,7 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, IsKnownBaseMapTy &KnownBases) { StatepointLiveSetTy PotentiallyDerivedPointers = result.LiveSet; // We assume that all pointers passed to deopt are base pointers; as an - // optimization, we can use this to avoid seperately materializing the base + // optimization, we can use this to avoid separately materializing the base // pointer graph. This is only relevant since we're very conservative about // generating new conflict nodes during base pointer insertion. If we were // smarter there, this would be irrelevant. @@ -1612,7 +1613,7 @@ public: // Note: we've inserted instructions, so the call to llvm.deoptimize may // not necessarily be followed by the matching return. auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator()); - new UnreachableInst(RI->getContext(), RI); + new UnreachableInst(RI->getContext(), RI->getIterator()); RI->eraseFromParent(); } @@ -1684,10 +1685,10 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // Pass through the requested lowering if any. The default is live-through. StringRef DeoptLowering = getDeoptLowering(Call); - if (DeoptLowering.equals("live-in")) + if (DeoptLowering == "live-in") Flags |= uint32_t(StatepointFlags::DeoptLiveIn); else { - assert(DeoptLowering.equals("live-through") && "Unsupported value!"); + assert(DeoptLowering == "live-through" && "Unsupported value!"); } FunctionCallee CallTarget(Call->getFunctionType(), Call->getCalledOperand()); @@ -1733,7 +1734,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // memcpy(dest_derived, source_derived, ...) 
=> // memcpy(dest_base, dest_offset, source_base, source_offset, ...) auto &Context = Call->getContext(); - auto &DL = Call->getModule()->getDataLayout(); + auto &DL = Call->getDataLayout(); auto GetBaseAndOffset = [&](Value *Derived) { Value *Base = nullptr; // Optimizations in unreachable code might substitute the real pointer @@ -1976,7 +1977,7 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, // Emit store into the related alloca. assert(Relocate->getNextNode() && "Should always have one since it's not a terminator"); - new StoreInst(Relocate, Alloca, Relocate->getNextNode()); + new StoreInst(Relocate, Alloca, std::next(Relocate->getIterator())); #ifndef NDEBUG VisitedLiveValues.insert(OriginalValue); @@ -1999,7 +2000,7 @@ static void insertRematerializationStores( Value *Alloca = AllocaMap[OriginalValue]; new StoreInst(RematerializedValue, Alloca, - RematerializedValue->getNextNode()); + std::next(RematerializedValue->getIterator())); #ifndef NDEBUG VisitedLiveValues.insert(OriginalValue); @@ -2029,11 +2030,11 @@ static void relocationViaAlloca( // Emit alloca for "LiveValue" and record it in "allocaMap" and // "PromotableAllocas" - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); auto emitAllocaFor = [&](Value *LiveValue) { - AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), - DL.getAllocaAddrSpace(), "", - F.getEntryBlock().getFirstNonPHI()); + AllocaInst *Alloca = + new AllocaInst(LiveValue->getType(), DL.getAllocaAddrSpace(), "", + F.getEntryBlock().getFirstNonPHIIt()); AllocaMap[LiveValue] = Alloca; PromotableAllocas.push_back(Alloca); }; @@ -2100,7 +2101,7 @@ static void relocationViaAlloca( ToClobber.push_back(Alloca); } - auto InsertClobbersAt = [&](Instruction *IP) { + auto InsertClobbersAt = [&](BasicBlock::iterator IP) { for (auto *AI : ToClobber) { auto AT = AI->getAllocatedType(); Constant *CPN; @@ -2115,10 +2116,11 @@ static void relocationViaAlloca( // Insert the clobbering stores. These may get intermixed with the // gc.results and gc.relocates, but that's fine. 
if (auto II = dyn_cast<InvokeInst>(Statepoint)) { - InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); - InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt()); + InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); } else { - InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode()); + InsertClobbersAt( + std::next(cast<Instruction>(Statepoint)->getIterator())); } } } @@ -2146,7 +2148,7 @@ static void relocationViaAlloca( } llvm::sort(Uses); - auto Last = std::unique(Uses.begin(), Uses.end()); + auto Last = llvm::unique(Uses); Uses.erase(Last, Uses.end()); for (Instruction *Use : Uses) { @@ -2154,15 +2156,15 @@ static void relocationViaAlloca( PHINode *Phi = cast<PHINode>(Use); for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) { if (Def == Phi->getIncomingValue(i)) { - LoadInst *Load = - new LoadInst(Alloca->getAllocatedType(), Alloca, "", - Phi->getIncomingBlock(i)->getTerminator()); + LoadInst *Load = new LoadInst( + Alloca->getAllocatedType(), Alloca, "", + Phi->getIncomingBlock(i)->getTerminator()->getIterator()); Phi->setIncomingValue(i, Load); } } } else { - LoadInst *Load = - new LoadInst(Alloca->getAllocatedType(), Alloca, "", Use); + LoadInst *Load = new LoadInst(Alloca->getAllocatedType(), Alloca, "", + Use->getIterator()); Use->replaceUsesOfWith(Def, Load); } } @@ -2229,16 +2231,16 @@ static void insertUseHolderAfter(CallBase *Call, const ArrayRef<Value *> Values, if (isa<CallInst>(Call)) { // For call safepoints insert dummy calls right after safepoint Holders.push_back( - CallInst::Create(Func, Values, "", &*++Call->getIterator())); + CallInst::Create(Func, Values, "", std::next(Call->getIterator()))); return; } // For invoke safepooints insert dummy calls both in normal and // exceptional destination blocks auto *II = cast<InvokeInst>(Call); Holders.push_back(CallInst::Create( - Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt())); + Func, Values, "", II->getNormalDest()->getFirstInsertionPt())); Holders.push_back(CallInst::Create( - Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt())); + Func, Values, "", II->getUnwindDest()->getFirstInsertionPt())); } static void findLiveReferences( @@ -2269,7 +2271,7 @@ static Value* findRematerializableChainToBasePointer( } if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) { - if (!CI->isNoopCast(CI->getModule()->getDataLayout())) + if (!CI->isNoopCast(CI->getDataLayout())) return CI; ChainToBase.push_back(CI); @@ -2291,7 +2293,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain, for (Instruction *Instr : Chain) { if (CastInst *CI = dyn_cast<CastInst>(Instr)) { - assert(CI->isNoopCast(CI->getModule()->getDataLayout()) && + assert(CI->isNoopCast(CI->getDataLayout()) && "non noop cast is found during rematerialization"); Type *SrcTy = CI->getOperand(0)->getType(); @@ -2599,7 +2601,7 @@ static bool inlineGetBaseAndOffset(Function &F, DefiningValueMapTy &DVCache, IsKnownBaseMapTy &KnownBases) { auto &Context = F.getContext(); - auto &DL = F.getParent()->getDataLayout(); + auto &DL = F.getDataLayout(); bool Changed = false; for (auto *Callsite : Intrinsics) @@ -3044,8 +3046,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // which doesn't know how to produce a proper deopt state. So if we see a // non-leaf memcpy/memmove without deopt state just treat it as a leaf // copy and don't produce a statepoint. 
- if (!AllowStatepointWithNoDeoptInfo && - !Call->getOperandBundle(LLVMContext::OB_deopt)) { + if (!AllowStatepointWithNoDeoptInfo && !Call->hasDeoptState()) { assert((isa<AtomicMemCpyInst>(Call) || isa<AtomicMemMoveInst>(Call)) && "Don't expect any other calls here!"); return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp index 8a491e74b91c..ce45c58e624e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -119,7 +119,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, } PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index 17a94f9381bf..c738a2a6f39a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -116,10 +116,6 @@ STATISTIC( STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to experiment with completely strict handling of inbounds -/// GEPs. -static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), - cl::Hidden); /// Disable running mem2reg during SROA in order to test or debug SROA. static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), cl::Hidden); @@ -293,7 +289,7 @@ calculateFragment(DILocalVariable *Variable, if (!CurrentFragment) { if (auto Size = Variable->getSizeInBits()) { // Treat the current fragment as covering the whole variable. - CurrentFragment = DIExpression::FragmentInfo(*Size, 0); + CurrentFragment = DIExpression::FragmentInfo(*Size, 0); if (Target == CurrentFragment) return UseNoFrag; } @@ -319,28 +315,21 @@ static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) { return DebugVariable(DVI->getVariable(), std::nullopt, DVI->getDebugLoc().getInlinedAt()); } -static DebugVariable getAggregateVariable(DPValue *DPV) { - return DebugVariable(DPV->getVariable(), std::nullopt, - DPV->getDebugLoc().getInlinedAt()); +static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) { + return DebugVariable(DVR->getVariable(), std::nullopt, + DVR->getDebugLoc().getInlinedAt()); } -static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB, - Instruction *LinkedInstr, Value *NewValue, - DILocalVariable *Variable, - DIExpression *Expression, Value *Address, - DIExpression *AddressExpression, - const DILocation *DI) { - (void)DIB; - return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable, - Expression, Address, AddressExpression, - DI); +/// Helpers for handling new and old debug info modes in migrateDebugInfo. +/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based +/// on the \p Unused parameter type. 
+DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) { + (void)Unused; + return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P)); } -static DbgAssignIntrinsic *createLinkedAssign( - DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr, - Value *NewValue, DILocalVariable *Variable, DIExpression *Expression, - Value *Address, DIExpression *AddressExpression, const DILocation *DI) { - return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression, - Address, AddressExpression, DI); +DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) { + (void)Unused; + return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P)); } /// Find linked dbg.assign and generate a new one with the correct @@ -363,9 +352,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { auto MarkerRange = at::getAssignmentMarkers(OldInst); - auto DPVAssignMarkerRange = at::getDPVAssignmentMarkers(OldInst); + auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. - if (MarkerRange.empty() && DPVAssignMarkerRange.empty()) + if (MarkerRange.empty() && DVRAssignMarkerRange.empty()) return; LLVM_DEBUG(dbgs() << " migrateDebugInfo\n"); @@ -386,9 +375,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, for (auto *DAI : at::getAssignmentMarkers(OldAlloca)) BaseFragments[getAggregateVariable(DAI)] = DAI->getExpression()->getFragmentInfo(); - for (auto *DPV : at::getDPVAssignmentMarkers(OldAlloca)) - BaseFragments[getAggregateVariable(DPV)] = - DPV->getExpression()->getFragmentInfo(); + for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca)) + BaseFragments[getAggregateVariable(DVR)] = + DVR->getExpression()->getFragmentInfo(); // The new inst needs a DIAssignID unique metadata tag (if OldInst has // one). It shouldn't already have one: assert this assumption. @@ -398,7 +387,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false); assert(OldAlloca->isStaticAlloca()); - auto MigrateDbgAssign = [&](auto DbgAssign) { + auto MigrateDbgAssign = [&](auto *DbgAssign) { LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign << "\n"); auto *Expr = DbgAssign->getExpression(); @@ -452,10 +441,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, } ::Value *NewValue = Value ? Value : DbgAssign->getValue(); - auto *NewAssign = createLinkedAssign( - DbgAssign, DIB, Inst, NewValue, DbgAssign->getVariable(), Expr, Dest, - DIExpression::get(Expr->getContext(), std::nullopt), - DbgAssign->getDebugLoc()); + auto *NewAssign = UnwrapDbgInstPtr( + DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, + Dest, + DIExpression::get(Expr->getContext(), std::nullopt), + DbgAssign->getDebugLoc()), + DbgAssign); // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. 
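UnwrapDbgInstPtr above selects one side of the DbgInstPtr {Instruction* | DbgRecord*} union with cast<T *>. A standalone illustration of the same PointerUnion casting idiom, using made-up stand-in types rather than anything from the patch (P is assumed non-null):

#include "llvm/ADT/PointerUnion.h"

namespace {
struct IntrinsicLike { int Id; }; // stand-ins for Instruction / DbgRecord
struct RecordLike { int Id; };
} // namespace

using EitherPtr = llvm::PointerUnion<IntrinsicLike *, RecordLike *>;

static int idOf(EitherPtr P) {
  // isa/cast/dyn_cast accept a PointerUnion directly when given the pointer
  // type of the desired side.
  if (auto *R = llvm::dyn_cast<RecordLike *>(P))
    return R->Id;
  return llvm::cast<IntrinsicLike *>(P)->Id;
}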
@@ -493,7 +484,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, }; for_each(MarkerRange, MigrateDbgAssign); - for_each(DPVAssignMarkerRange, MigrateDbgAssign); + for_each(DVRAssignMarkerRange, MigrateDbgAssign); } namespace { @@ -510,9 +501,9 @@ class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter { public: void SetNamePrefix(const Twine &P) { Prefix = P.str(); } - void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, + void InsertHelper(Instruction *I, const Twine &Name, BasicBlock::iterator InsertPt) const override { - IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB, + IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), InsertPt); } }; @@ -635,7 +626,7 @@ public: int OldSize = Slices.size(); Slices.append(NewSlices.begin(), NewSlices.end()); auto SliceI = Slices.begin() + OldSize; - llvm::sort(SliceI, Slices.end()); + std::stable_sort(SliceI, Slices.end()); std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } @@ -1100,45 +1091,6 @@ private: if (GEPI.use_empty()) return markAsDead(GEPI); - if (SROAStrictInbounds && GEPI.isInBounds()) { - // FIXME: This is a manually un-factored variant of the basic code inside - // of GEPs with checking of the inbounds invariant specified in the - // langref in a very strict sense. If we ever want to enable - // SROAStrictInbounds, this code should be factored cleanly into - // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds - // by writing out the code here where we have the underlying allocation - // size readily available. - APInt GEPOffset = Offset; - const DataLayout &DL = GEPI.getModule()->getDataLayout(); - for (gep_type_iterator GTI = gep_type_begin(GEPI), - GTE = gep_type_end(GEPI); - GTI != GTE; ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); - if (!OpC) - break; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = GTI.getStructTypeOrNull()) { - unsigned ElementIdx = OpC->getZExtValue(); - const StructLayout *SL = DL.getStructLayout(STy); - GEPOffset += - APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); - } else { - // For array or vector indices, scale the index by the size of the - // type. - APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); - GEPOffset += Index * APInt(Offset.getBitWidth(), - GTI.getSequentialElementStride(DL)); - } - - // If this index has computed an intermediate pointer which is not - // inbounds, then the result of the GEP is a poison value and we can - // delete it and all uses. - if (GEPOffset.ugt(AllocSize)) - return markAsDead(GEPI); - } - } - return Base::visitGetElementPtrInst(GEPI); } @@ -1213,8 +1165,9 @@ private: if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, + Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -1327,7 +1280,7 @@ private: SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; Visited.insert(Root); Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); - const DataLayout &DL = Root->getModule()->getDataLayout(); + const DataLayout &DL = Root->getDataLayout(); // If there are no loads or stores, the access is dead. We mark that as // a size zero access. 
Size = 0; @@ -1574,7 +1527,7 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h static bool isSafePHIToSpeculate(PHINode &PN) { - const DataLayout &DL = PN.getModule()->getDataLayout(); + const DataLayout &DL = PN.getDataLayout(); // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. @@ -1669,7 +1622,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { } // Inject loads into all of the pred blocks. - DenseMap<BasicBlock*, Value*> InjectedLoads; + DenseMap<BasicBlock *, Value *> InjectedLoads; for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { BasicBlock *Pred = PN.getIncomingBlock(Idx); Value *InVal = PN.getIncomingValue(Idx); @@ -1678,7 +1631,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { // basic block, as long as the value is the same. So if we already injected // a load in the predecessor, then we should reuse the same load for all // duplicated entries. - if (Value* V = InjectedLoads.lookup(Pred)) { + if (Value *V = InjectedLoads.lookup(Pred)) { NewPN->addIncoming(V, Pred); continue; } @@ -1732,7 +1685,7 @@ isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) { assert(LI.isSimple() && "Only for simple loads"); SelectHandSpeculativity Spec; - const DataLayout &DL = SI.getModule()->getDataLayout(); + const DataLayout &DL = SI.getDataLayout(); for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()}) if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL, &LI)) @@ -1852,7 +1805,7 @@ static void rewriteMemOpOfSelect(SelectInst &SI, T &I, Tail->setName(Head->getName() + ".cont"); PHINode *PN; if (isa<LoadInst>(I)) - PN = PHINode::Create(I.getType(), 2, "", &I); + PN = PHINode::Create(I.getType(), 2, "", I.getIterator()); for (BasicBlock *SuccBB : successors(Head)) { bool IsThen = SuccBB == HeadBI->getSuccessor(0); int SuccIdx = IsThen ? 0 : 1; @@ -2077,8 +2030,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements()) return false; - uint64_t EndOffset = - std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); + uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > cast<FixedVectorType>(Ty)->getNumElements()) @@ -2226,8 +2178,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, cast<FixedVectorType>(LHSTy)->getNumElements(); }; llvm::sort(CandidateTys, RankVectorTypesComp); - CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(), - RankVectorTypesEq), + CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq), CandidateTys.end()); } else { // The only way to have the same element type in every vector type is to @@ -2780,8 +2731,8 @@ public: Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); IRB.SetInsertPoint(OldUserI); IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.getInserter().SetNamePrefix( - Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." 
+ + Twine(BeginOffset) + "."); CanSROA &= visit(cast<Instruction>(OldUse->getUser())); if (VecTy || IntTy) @@ -2834,7 +2785,7 @@ private: #else Twine() #endif - ); + ); } /// Compute suitable alignment to access this slice of the *new* @@ -2940,7 +2891,8 @@ private: // Do this after copyMetadataForLoad() to preserve the TBAA shift. if (AATags) - NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + NewLI->setAAMetadata(AATags.adjustForAccess( + NewBeginOffset - BeginOffset, NewLI->getType(), DL)); // Try to preserve nonnull metadata V = NewLI; @@ -2961,8 +2913,11 @@ private: LoadInst *NewLI = IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(), LI.isVolatile(), LI.getName()); + if (AATags) - NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + NewLI->setAAMetadata(AATags.adjustForAccess( + NewBeginOffset - BeginOffset, NewLI->getType(), DL)); + if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access, @@ -2982,7 +2937,12 @@ private: assert(DL.typeSizeEqualsStoreSize(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); + BasicBlock::iterator LIIt = std::next(LI.getIterator()); + // Ensure the insertion point comes before any debug-info immediately + // after the load, so that variable values referring to the load are + // dominated by it. + LIIt.setHeadBit(true); + IRB.SetInsertPoint(LI.getParent(), LIIt); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -3032,7 +2992,8 @@ private: Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + V->getType(), DL)); Pass.DeadInsts.push_back(&SI); // NOTE: Careful to use OrigV rather than V. @@ -3059,7 +3020,8 @@ private: Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + V->getType(), DL)); migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI, Store, Store->getPointerOperand(), @@ -3119,7 +3081,8 @@ private: NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + V->getType(), DL)); if (SI.isVolatile()) NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); if (NewSI->isAtomic()) @@ -3188,7 +3151,7 @@ private: // emit dbg.assign intrinsics for mem intrinsics storing through non- // constant geps, or storing a variable number of bytes. 
assert(at::getAssignmentMarkers(&II).empty() && - at::getDPVAssignmentMarkers(&II).empty() && + at::getDVRAssignmentMarkers(&II).empty() && "AT: Unexpected link to non-const GEP"); deleteIfTriviallyDead(OldPtr); return false; @@ -3203,8 +3166,7 @@ private: const bool CanContinue = [&]() { if (VecTy || IntTy) return true; - if (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset) + if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) return false; // Length must be in range for FixedVectorType. auto *C = cast<ConstantInt>(II.getLength()); @@ -3221,12 +3183,14 @@ private: // a single value type, just emit a memset. if (!CanContinue) { Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); + unsigned Sz = NewEndOffset - NewBeginOffset; + Constant *Size = ConstantInt::get(SizeTy, Sz); MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, MaybeAlign(getSliceAlign()), II.isVolatile())); if (AATags) - New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + New->setAAMetadata( + AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz)); migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II, New, New->getRawDest(), nullptr, DL); @@ -3302,7 +3266,8 @@ private: New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + V->getType(), DL)); migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II, New, New->getPointerOperand(), V, DL); @@ -3341,7 +3306,7 @@ private: DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr); }; for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress); - for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress); + for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress); II.setDest(AdjustedPtr); II.setDestAlignment(SliceAlign); } else { @@ -3507,7 +3472,8 @@ private: Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + Load->getType(), DL)); Src = Load; } @@ -3529,7 +3495,8 @@ private: Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) - Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); + Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, + Src->getType(), DL)); APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0); if (IsDest) { @@ -3857,7 +3824,8 @@ private: DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); if (AATags && GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) - Load->setAAMetadata(AATags.shift(Offset.getZExtValue())); + Load->setAAMetadata( + AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL)); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); @@ -3908,8 +3876,10 @@ private: APInt Offset( DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset); - if (AATags) - Store->setAAMetadata(AATags.shift(Offset.getZExtValue())); + if (AATags) 
{ + Store->setAAMetadata(AATags.adjustForAccess( + Offset.getZExtValue(), ExtractValue->getType(), DL)); + } // migrateDebugInfo requires the base Alloca. Walk to it from this gep. // If we cannot (because there's an intervening non-const or unbounded @@ -3925,7 +3895,7 @@ private: DL); } else { assert(at::getAssignmentMarkers(Store).empty() && - at::getDPVAssignmentMarkers(Store).empty() && + at::getDVRAssignmentMarkers(Store).empty() && "AT: unexpected debug.assign linked to store through " "unbounded GEP"); } @@ -3963,30 +3933,62 @@ private: return false; } - // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2) - bool foldGEPSelect(GetElementPtrInst &GEPI) { - if (!GEPI.hasAllConstantIndices()) + // Unfold gep (select cond, ptr1, ptr2), idx + // => select cond, gep(ptr1, idx), gep(ptr2, idx) + // and gep ptr, (select cond, idx1, idx2) + // => select cond, gep(ptr, idx1), gep(ptr, idx2) + bool unfoldGEPSelect(GetElementPtrInst &GEPI) { + // Check whether the GEP has exactly one select operand and all indices + // will become constant after the transform. + SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand()); + for (Value *Op : GEPI.indices()) { + if (auto *SI = dyn_cast<SelectInst>(Op)) { + if (Sel) + return false; + + Sel = SI; + if (!isa<ConstantInt>(Sel->getTrueValue()) || + !isa<ConstantInt>(Sel->getFalseValue())) + return false; + continue; + } + + if (!isa<ConstantInt>(Op)) + return false; + } + + if (!Sel) return false; - SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand()); + LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n"; + dbgs() << " original: " << *Sel << "\n"; + dbgs() << " " << GEPI << "\n";); + + auto GetNewOps = [&](Value *SelOp) { + SmallVector<Value *> NewOps; + for (Value *Op : GEPI.operands()) + if (Op == Sel) + NewOps.push_back(SelOp); + else + NewOps.push_back(Op); + return NewOps; + }; - LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" - << "\n original: " << *Sel - << "\n " << GEPI); + Value *True = Sel->getTrueValue(); + Value *False = Sel->getFalseValue(); + SmallVector<Value *> TrueOps = GetNewOps(True); + SmallVector<Value *> FalseOps = GetNewOps(False); IRB.SetInsertPoint(&GEPI); - SmallVector<Value *, 4> Index(GEPI.indices()); - bool IsInBounds = GEPI.isInBounds(); + GEPNoWrapFlags NW = GEPI.getNoWrapFlags(); Type *Ty = GEPI.getSourceElementType(); - Value *True = Sel->getTrueValue(); - Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", - IsInBounds); - - Value *False = Sel->getFalseValue(); + Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(), + True->getName() + ".sroa.gep", NW); - Value *NFalse = IRB.CreateGEP(Ty, False, Index, - False->getName() + ".sroa.gep", IsInBounds); + Value *NFalse = + IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(), + False->getName() + ".sroa.gep", NW); Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, Sel->getName() + ".sroa.sel"); @@ -3997,75 +3999,114 @@ private: Visited.insert(NSelI); enqueueUsers(*NSelI); - LLVM_DEBUG(dbgs() << "\n to: " << *NTrue - << "\n " << *NFalse - << "\n " << *NSel << '\n'); + LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n"; + dbgs() << " " << *NFalse << "\n"; + dbgs() << " " << *NSel << "\n";); return true; } - // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2) - bool foldGEPPhi(GetElementPtrInst &GEPI) { - if (!GEPI.hasAllConstantIndices()) - return false; + // Unfold gep (phi ptr1, ptr2), idx + // => phi ((gep ptr1, idx), (gep ptr2, idx)) 
+ // and gep ptr, (phi idx1, idx2) + // => phi ((gep ptr, idx1), (gep ptr, idx2)) + bool unfoldGEPPhi(GetElementPtrInst &GEPI) { + // To prevent infinitely expanding recursive phis, bail if the GEP pointer + // operand (looking through the phi if it is the phi we want to unfold) is + // an instruction besides a static alloca. + PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand()); + auto IsInvalidPointerOperand = [](Value *V) { + if (!isa<Instruction>(V)) + return false; + if (auto *AI = dyn_cast<AllocaInst>(V)) + return !AI->isStaticAlloca(); + return true; + }; + if (Phi) { + if (any_of(Phi->operands(), IsInvalidPointerOperand)) + return false; + } else { + if (IsInvalidPointerOperand(GEPI.getPointerOperand())) + return false; + } + // Check whether the GEP has exactly one phi operand (including the pointer + // operand) and all indices will become constant after the transform. + for (Value *Op : GEPI.indices()) { + if (auto *SI = dyn_cast<PHINode>(Op)) { + if (Phi) + return false; + + Phi = SI; + if (!all_of(Phi->incoming_values(), + [](Value *V) { return isa<ConstantInt>(V); })) + return false; + continue; + } + + if (!isa<ConstantInt>(Op)) + return false; + } - PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand()); - if (GEPI.getParent() != PHI->getParent() || - llvm::any_of(PHI->incoming_values(), [](Value *In) - { Instruction *I = dyn_cast<Instruction>(In); - return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) || - succ_empty(I->getParent()) || - !I->getParent()->isLegalToHoistInto(); - })) + if (!Phi) return false; - LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):" - << "\n original: " << *PHI - << "\n " << GEPI - << "\n to: "); - - SmallVector<Value *, 4> Index(GEPI.indices()); - bool IsInBounds = GEPI.isInBounds(); - IRB.SetInsertPoint(GEPI.getParent(), GEPI.getParent()->getFirstNonPHIIt()); - PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(), - PHI->getName() + ".sroa.phi"); - for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) { - BasicBlock *B = PHI->getIncomingBlock(I); - Value *NewVal = nullptr; - int Idx = NewPN->getBasicBlockIndex(B); - if (Idx >= 0) { - NewVal = NewPN->getIncomingValue(Idx); - } else { - Instruction *In = cast<Instruction>(PHI->getIncomingValue(I)); + LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n"; + dbgs() << " original: " << *Phi << "\n"; + dbgs() << " " << GEPI << "\n";); + + auto GetNewOps = [&](Value *PhiOp) { + SmallVector<Value *> NewOps; + for (Value *Op : GEPI.operands()) + if (Op == Phi) + NewOps.push_back(PhiOp); + else + NewOps.push_back(Op); + return NewOps; + }; - IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); - Type *Ty = GEPI.getSourceElementType(); - NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep", - IsInBounds); + IRB.SetInsertPoint(Phi); + PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(), + Phi->getName() + ".sroa.phi"); + + Type *SourceTy = GEPI.getSourceElementType(); + // We only handle arguments, constants, and static allocas here, so we can + // insert GEPs at the end of the entry block. 
+ IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator()); + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { + Value *Op = Phi->getIncomingValue(I); + BasicBlock *BB = Phi->getIncomingBlock(I); + Value *NewGEP; + if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) { + NewGEP = NewPhi->getIncomingValue(NI); + } else { + SmallVector<Value *> NewOps = GetNewOps(Op); + NewGEP = + IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(), + Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags()); } - NewPN->addIncoming(NewVal, B); + NewPhi->addIncoming(NewGEP, BB); } Visited.erase(&GEPI); - GEPI.replaceAllUsesWith(NewPN); + GEPI.replaceAllUsesWith(NewPhi); GEPI.eraseFromParent(); - Visited.insert(NewPN); - enqueueUsers(*NewPN); + Visited.insert(NewPhi); + enqueueUsers(*NewPhi); - LLVM_DEBUG(for (Value *In : NewPN->incoming_values()) - dbgs() << "\n " << *In; - dbgs() << "\n " << *NewPN << '\n'); + LLVM_DEBUG(dbgs() << " to: "; + for (Value *In + : NewPhi->incoming_values()) dbgs() + << "\n " << *In; + dbgs() << "\n " << *NewPhi << '\n'); return true; } bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (isa<SelectInst>(GEPI.getPointerOperand()) && - foldGEPSelect(GEPI)) + if (unfoldGEPSelect(GEPI)) return true; - if (isa<PHINode>(GEPI.getPointerOperand()) && - foldGEPPhi(GEPI)) + if (unfoldGEPPhi(GEPI)) return true; enqueueUsers(GEPI); @@ -4137,17 +4178,17 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return nullptr; if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { - Type *ElementTy; - uint64_t TyNumElements; - if (auto *AT = dyn_cast<ArrayType>(Ty)) { - ElementTy = AT->getElementType(); - TyNumElements = AT->getNumElements(); - } else { - // FIXME: This isn't right for vectors with non-byte-sized or - // non-power-of-two sized elements. - auto *VT = cast<FixedVectorType>(Ty); - ElementTy = VT->getElementType(); - TyNumElements = VT->getNumElements(); + Type *ElementTy; + uint64_t TyNumElements; + if (auto *AT = dyn_cast<ArrayType>(Ty)) { + ElementTy = AT->getElementType(); + TyNumElements = AT->getNumElements(); + } else { + // FIXME: This isn't right for vectors with non-byte-sized or + // non-power-of-two sized elements. + auto *VT = cast<FixedVectorType>(Ty); + ElementTy = VT->getElementType(); + TyNumElements = VT->getNumElements(); } uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); uint64_t NumSkippedElements = Offset / ElementSize; @@ -4458,7 +4499,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // them to the alloca slices. 
SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; std::vector<LoadInst *> SplitLoads; - const DataLayout &DL = AI.getModule()->getDataLayout(); + const DataLayout &DL = AI.getDataLayout(); for (LoadInst *LI : Loads) { SplitLoads.clear(); @@ -4532,6 +4573,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { Value *StoreBasePtr = SI->getPointerOperand(); IRB.SetInsertPoint(SI); + AAMDNodes AATags = SI->getAAMetadata(); LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); @@ -4551,6 +4593,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group, LLVMContext::MD_DIAssignID}); + + if (AATags) + PStore->setAAMetadata( + AATags.adjustForAccess(PartOffset, PLoad->getType(), DL)); LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); } @@ -4747,7 +4793,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // or an i8 array of an appropriate size. Type *SliceTy = nullptr; VectorType *SliceVecTy = nullptr; - const DataLayout &DL = AI.getModule()->getDataLayout(); + const DataLayout &DL = AI.getDataLayout(); std::pair<Type *, IntegerType *> CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); // Do all uses operate on the same type? @@ -4817,15 +4863,15 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, NewAI = new AllocaInst( SliceTy, AI.getAddressSpace(), nullptr, IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment, - AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); + AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), + AI.getIterator()); // Copy the old AI debug location over to the new one. NewAI->setDebugLoc(AI.getDebugLoc()); ++NumNewAllocas; } - LLVM_DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << P.beginOffset() << "," << P.endOffset() - << ") to: " << *NewAI << "\n"); + LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset() + << "," << P.endOffset() << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -4921,45 +4967,236 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return NewAI; } -static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, - AllocaInst *NewAddr, DIExpression *NewFragmentExpr, - Instruction *BeforeInst) { - DIB.insertDeclare(NewAddr, Orig->getVariable(), NewFragmentExpr, +// There isn't a shared interface to get the "address" parts out of a +// dbg.declare and dbg.assign, so provide some wrappers now for +// both debug intrinsics and records. 
+const Value *getAddress(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI)) + return DAI->getAddress(); + return cast<DbgDeclareInst>(DVI)->getAddress(); +} + +const Value *getAddress(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + return DVR->getAddress(); +} + +bool isKillAddress(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI)) + return DAI->isKillAddress(); + return cast<DbgDeclareInst>(DVI)->isKillLocation(); +} + +bool isKillAddress(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + if (DVR->getType() == DbgVariableRecord::LocationType::Assign) + return DVR->isKillAddress(); + return DVR->isKillLocation(); +} + +const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) { + if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI)) + return DAI->getAddressExpression(); + return cast<DbgDeclareInst>(DVI)->getExpression(); +} + +const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) { + assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || + DVR->getType() == DbgVariableRecord::LocationType::Assign); + if (DVR->getType() == DbgVariableRecord::LocationType::Assign) + return DVR->getAddressExpression(); + return DVR->getExpression(); +} + +/// Create or replace an existing fragment in a DIExpression with \p Frag. +/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext +/// operation, add \p BitExtractOffset to the offset part. +/// +/// Returns the new expression, or nullptr if this fails (see details below). +/// +/// This function is similar to DIExpression::createFragmentExpression except +/// for 3 important distinctions: +/// 1. The new fragment isn't relative to an existing fragment. +/// 2. It assumes the computed location is a memory location. This means we +/// don't need to perform checks that creating the fragment preserves the +/// expression semantics. +/// 3. Existing extract_bits are modified independently of fragment changes +/// using \p BitExtractOffset. A change to the fragment offset or size +/// may affect a bit extract. But a bit extract offset can change +/// independently of the fragment dimensions. +/// +/// Returns the new expression, or nullptr if one couldn't be created. +/// Ideally this is only used to signal that a bit-extract has become +/// zero-sized (and thus the new debug record has no size and can be +/// dropped), however, it fails for other reasons too - see the FIXME below. +/// +/// FIXME: To keep the change that introduces this function NFC it bails +/// in some situations unnecessarily, e.g. when fragment and bit extract +/// sizes differ. 
+static DIExpression *createOrReplaceFragment(const DIExpression *Expr, + DIExpression::FragmentInfo Frag, + int64_t BitExtractOffset) { + SmallVector<uint64_t, 8> Ops; + bool HasFragment = false; + bool HasBitExtract = false; + + for (auto &Op : Expr->expr_ops()) { + if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) { + HasFragment = true; + continue; + } + if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext || + Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) { + HasBitExtract = true; + int64_t ExtractOffsetInBits = Op.getArg(0); + int64_t ExtractSizeInBits = Op.getArg(1); + + // DIExpression::createFragmentExpression doesn't know how to handle + // a fragment that is smaller than the extract. Copy the behaviour + // (bail) to avoid non-NFC changes. + // FIXME: Don't do this. + if (Frag.SizeInBits < uint64_t(ExtractSizeInBits)) + return nullptr; + + assert(BitExtractOffset <= 0); + int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset; + + // DIExpression::createFragmentExpression doesn't know what to do + // if the new extract starts "outside" the existing one. Copy the + // behaviour (bail) to avoid non-NFC changes. + // FIXME: Don't do this. + if (AdjustedOffset < 0) + return nullptr; + + Ops.push_back(Op.getOp()); + Ops.push_back(std::max<int64_t>(0, AdjustedOffset)); + Ops.push_back(ExtractSizeInBits); + continue; + } + Op.appendToVector(Ops); + } + + // Unsupported by createFragmentExpression, so don't support it here yet to + // preserve NFC-ness. + if (HasFragment && HasBitExtract) + return nullptr; + + if (!HasBitExtract) { + Ops.push_back(dwarf::DW_OP_LLVM_fragment); + Ops.push_back(Frag.OffsetInBits); + Ops.push_back(Frag.SizeInBits); + } + return DIExpression::get(Expr->getContext(), Ops); +} + +/// Insert a new dbg.declare. +/// \p Orig Original to copy debug loc and variable from. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional<DIExpression::FragmentInfo> NewFragment, + int64_t BitExtractAdjustment) { + if (NewFragment) + NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment, + BitExtractAdjustment); + if (!NewAddrExpr) + return; + + DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr, Orig->getDebugLoc(), BeforeInst); } -static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, - AllocaInst *NewAddr, DIExpression *NewFragmentExpr, - Instruction *BeforeInst) { + +/// Insert a new dbg.assign. +/// \p Orig Original to copy debug loc, variable, value and value expression +/// from. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional<DIExpression::FragmentInfo> NewFragment, + int64_t BitExtractAdjustment) { + // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr. (void)BeforeInst; + + // A dbg.assign puts fragment info in the value expression only. 
The address + // expression has already been built: NewAddrExpr. + DIExpression *NewFragmentExpr = Orig->getExpression(); + if (NewFragment) + NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, + BitExtractAdjustment); + if (!NewFragmentExpr) + return; + + // Apply a DIAssignID to the store if it doesn't already have it. if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { NewAddr->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(NewAddr->getContext())); } - auto *NewAssign = DIB.insertDbgAssign( - NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, - Orig->getAddressExpression(), Orig->getDebugLoc()); + + Instruction *NewAssign = + DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(), + NewFragmentExpr, NewAddr, NewAddrExpr, + Orig->getDebugLoc()) + .get<Instruction *>(); LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n"); (void)NewAssign; } -static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr, - DIExpression *NewFragmentExpr, - Instruction *BeforeInst) { + +/// Insert a new DbgRecord. +/// \p Orig Original to copy record type, debug loc and variable from, and +/// additionally value and value expression for dbg_assign records. +/// \p NewAddr Location's new base address. +/// \p NewAddrExpr New expression to apply to address. +/// \p BeforeInst Insert position. +/// \p NewFragment New fragment (absolute, non-relative). +/// \p BitExtractAdjustment Offset to apply to any extract_bits op. +static void +insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr, + DIExpression *NewAddrExpr, Instruction *BeforeInst, + std::optional<DIExpression::FragmentInfo> NewFragment, + int64_t BitExtractAdjustment) { (void)DIB; + + // A dbg_assign puts fragment info in the value expression only. The address + // expression has already been built: NewAddrExpr. A dbg_declare puts the + // new fragment info into NewAddrExpr (as it only has one expression). + DIExpression *NewFragmentExpr = + Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr; + if (NewFragment) + NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, + BitExtractAdjustment); + if (!NewFragmentExpr) + return; + if (Orig->isDbgDeclare()) { - DPValue *DPV = DPValue::createDPVDeclare( + DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare( NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc()); - BeforeInst->getParent()->insertDPValueBefore(DPV, - BeforeInst->getIterator()); + BeforeInst->getParent()->insertDbgRecordBefore(DVR, + BeforeInst->getIterator()); return; } + + // Apply a DIAssignID to the store if it doesn't already have it. 
if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { NewAddr->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(NewAddr->getContext())); } - auto *NewAssign = DPValue::createLinkedDPVAssign( + + DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign( NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, - Orig->getAddressExpression(), Orig->getDebugLoc()); - LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n"); + NewAddrExpr, Orig->getDebugLoc()); + LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n"); (void)NewAssign; } @@ -5010,8 +5247,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { IsSorted = false; } } - } - else { + } else { // We only allow whole-alloca splittable loads and stores // for a large alloca to avoid creating too large BitVector. for (Slice &S : AS) { @@ -5030,7 +5266,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { } if (!IsSorted) - llvm::sort(AS); + llvm::stable_sort(AS); /// Describes the allocas introduced by rewritePartition in order to migrate /// the debug info. @@ -5039,7 +5275,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { uint64_t Offset; uint64_t Size; Fragment(AllocaInst *AI, uint64_t O, uint64_t S) - : Alloca(AI), Offset(O), Size(S) {} + : Alloca(AI), Offset(O), Size(S) {} }; SmallVector<Fragment, 4> Fragments; @@ -5053,7 +5289,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue(); // Don't include any padding. uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); - Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); + Fragments.push_back( + Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); } } ++NumPartitions; @@ -5065,54 +5302,78 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. auto MigrateOne = [&](auto *DbgVariable) { - auto *Expr = DbgVariable->getExpression(); - DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); - uint64_t AllocaSize = - DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue(); - for (auto Fragment : Fragments) { - // Create a fragment expression describing the new partition or reuse AI's - // expression if there is only one partition. - auto *FragmentExpr = Expr; - if (Fragment.Size < AllocaSize || Expr->isFragment()) { - // If this alloca is already a scalar replacement of a larger aggregate, - // Fragment.Offset describes the offset inside the scalar. - auto ExprFragment = Expr->getFragmentInfo(); - uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0; - uint64_t Start = Offset + Fragment.Offset; - uint64_t Size = Fragment.Size; - if (ExprFragment) { - uint64_t AbsEnd = - ExprFragment->OffsetInBits + ExprFragment->SizeInBits; - if (Start >= AbsEnd) { - // No need to describe a SROAed padding. - continue; - } - Size = std::min(Size, AbsEnd - Start); - } - // The new, smaller fragment is stenciled out from the old fragment. - if (auto OrigFragment = FragmentExpr->getFragmentInfo()) { - assert(Start >= OrigFragment->OffsetInBits && - "new fragment is outside of original fragment"); - Start -= OrigFragment->OffsetInBits; - } + // Can't overlap with undef memory. + if (isKillAddress(DbgVariable)) + return; - // The alloca may be larger than the variable. 
- auto VarSize = DbgVariable->getVariable()->getSizeInBits(); - if (VarSize) { - if (Size > *VarSize) - Size = *VarSize; - if (Size == 0 || Start + Size > *VarSize) - continue; - } + const Value *DbgPtr = getAddress(DbgVariable); + DIExpression::FragmentInfo VarFrag = + DbgVariable->getFragmentOrEntireVariable(); + // Get the address expression constant offset if one exists and the ops + // that come after it. + int64_t CurrentExprOffsetInBytes = 0; + SmallVector<uint64_t> PostOffsetOps; + if (!getAddressExpression(DbgVariable) + ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps)) + return; // Couldn't interpret this DIExpression - drop the var. + + // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext. + int64_t ExtractOffsetInBits = 0; + for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) { + if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext || + Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) { + ExtractOffsetInBits = Op.getArg(0); + break; + } + } - // Avoid creating a fragment expression that covers the entire variable. - if (!VarSize || *VarSize != Size) { - if (auto E = - DIExpression::createFragmentExpression(Expr, Start, Size)) - FragmentExpr = *E; - else - continue; - } + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); + for (auto Fragment : Fragments) { + int64_t OffsetFromLocationInBits; + std::optional<DIExpression::FragmentInfo> NewDbgFragment; + // Find the variable fragment that the new alloca slice covers. + // Drop debug info for this variable fragment if we can't compute an + // intersect between it and the alloca slice. + if (!DIExpression::calculateFragmentIntersect( + DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr, + CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag, + NewDbgFragment, OffsetFromLocationInBits)) + continue; // Do not migrate this fragment to this slice. + + // Zero sized fragment indicates there's no intersect between the variable + // fragment and the alloca slice. Skip this slice for this variable + // fragment. + if (NewDbgFragment && !NewDbgFragment->SizeInBits) + continue; // Do not migrate this fragment to this slice. + + // No fragment indicates DbgVariable's variable or fragment exactly + // overlaps the slice; copy its fragment (or nullopt if there isn't one). + if (!NewDbgFragment) + NewDbgFragment = DbgVariable->getFragment(); + + // Reduce the new expression offset by the bit-extract offset since + // we'll be keeping that. + int64_t OffestFromNewAllocaInBits = + OffsetFromLocationInBits - ExtractOffsetInBits; + // We need to adjust an existing bit extract if the offset expression + // can't eat the slack (i.e., if the new offset would be negative). + int64_t BitExtractOffset = + std::min<int64_t>(0, OffestFromNewAllocaInBits); + // The magnitude of a negative value indicates the number of bits into + // the existing variable fragment that the memory region begins. The new + // variable fragment already excludes those bits - the new DbgPtr offset + // only needs to be applied if it's positive. + OffestFromNewAllocaInBits = + std::max(int64_t(0), OffestFromNewAllocaInBits); + + // Rebuild the expression: + // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment} + // Add NewDbgFragment later, because dbg.assigns don't want it in the + // address expression but the value expression instead. 
+ DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps); + if (OffestFromNewAllocaInBits > 0) { + int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8; + NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes); } // Remove any existing intrinsics on the new alloca describing @@ -5127,18 +5388,19 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { OldDII->eraseFromParent(); }; for_each(findDbgDeclares(Fragment.Alloca), RemoveOne); - for_each(findDPVDeclares(Fragment.Alloca), RemoveOne); + for_each(findDVRDeclares(Fragment.Alloca), RemoveOne); - insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI); + insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI, + NewDbgFragment, BitExtractOffset); } }; // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. for_each(findDbgDeclares(&AI), MigrateOne); - for_each(findDPVDeclares(&AI), MigrateOne); + for_each(findDVRDeclares(&AI), MigrateOne); for_each(at::getAssignmentMarkers(&AI), MigrateOne); - for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne); + for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne); return Changed; } @@ -5177,7 +5439,7 @@ SROA::runOnAlloca(AllocaInst &AI) { Changed = true; return {Changed, CFGChanged}; } - const DataLayout &DL = AI.getModule()->getDataLayout(); + const DataLayout &DL = AI.getDataLayout(); // Skip alloca forms that this analysis can't handle. auto *AT = AI.getAllocatedType(); @@ -5262,7 +5524,7 @@ bool SROA::deleteDeadInstructions( DeletedAllocas.insert(AI); for (DbgDeclareInst *OldDII : findDbgDeclares(AI)) OldDII->eraseFromParent(); - for (DPValue *OldDII : findDPVDeclares(AI)) + for (DbgVariableRecord *OldDII : findDVRDeclares(AI)) OldDII->eraseFromParent(); } @@ -5309,7 +5571,7 @@ bool SROA::promoteAllocas(Function &F) { std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) { LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); I != E; ++I) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp index 4ce6ce93be33..cb1456b14632 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -29,7 +29,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeInstSimplifyLegacyPassPass(Registry); initializeLegacyLICMPassPass(Registry); initializeLoopDataPrefetchLegacyPassPass(Registry); - initializeLoopRotateLegacyPassPass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopUnrollPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); @@ -49,4 +48,5 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); initializePlaceBackedgeSafepointsLegacyPassPass(Registry); + initializePostInlineEntryExitInstrumenterPass(Registry); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index c01d03f64472..8eadf8900020 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -627,6 +627,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, Value *Ptr = CI->getArgOperand(0); Value *Mask = CI->getArgOperand(1); Value *PassThru = CI->getArgOperand(2); + Align Alignment = CI->getParamAlign(0).valueOrOne(); auto *VecType = cast<FixedVectorType>(CI->getType()); @@ -644,6 +645,10 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, // The result vector Value *VResult = PassThru; + // Adjust alignment for the scalar instruction. + const Align AdjustedAlignment = + commonAlignment(Alignment, EltTy->getPrimitiveSizeInBits() / 8); + // Shorten the way if the mask is a vector of constants. // Create a build_vector pattern, with loads/poisons as necessary and then // shuffle blend with the pass through value. @@ -659,7 +664,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, } else { Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex); - InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1), + InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, AdjustedAlignment, "Load" + Twine(Idx)); ShuffleMask[Idx] = Idx; ++MemIndex; @@ -713,7 +718,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, CondBlock->setName("cond.load"); Builder.SetInsertPoint(CondBlock->getTerminator()); - LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Align(1)); + LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, AdjustedAlignment); Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx); // Move the pointer if there are more blocks to come. @@ -755,6 +760,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, Value *Src = CI->getArgOperand(0); Value *Ptr = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); + Align Alignment = CI->getParamAlign(1).valueOrOne(); auto *VecType = cast<FixedVectorType>(Src->getType()); @@ -767,6 +773,10 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, Type *EltTy = VecType->getElementType(); + // Adjust alignment for the scalar instruction. + const Align AdjustedAlignment = + commonAlignment(Alignment, EltTy->getPrimitiveSizeInBits() / 8); + unsigned VectorWidth = VecType->getNumElements(); // Shorten the way if the mask is a vector of constants. @@ -778,7 +788,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, Value *OneElt = Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx)); Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex); - Builder.CreateAlignedStore(OneElt, NewPtr, Align(1)); + Builder.CreateAlignedStore(OneElt, NewPtr, AdjustedAlignment); ++MemIndex; } CI->eraseFromParent(); @@ -824,7 +834,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, Builder.SetInsertPoint(CondBlock->getTerminator()); Value *OneElt = Builder.CreateExtractElement(Src, Idx); - Builder.CreateAlignedStore(OneElt, Ptr, Align(1)); + Builder.CreateAlignedStore(OneElt, Ptr, AdjustedAlignment); // Move the pointer if there are more blocks to come. 
Value *NewPtr; @@ -852,6 +862,69 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, ModifiedDT = true; } +static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, + DomTreeUpdater *DTU, + bool &ModifiedDT) { + // If we extend histogram to return a result someday (like the updated vector) + // then we'll need to support it here. + assert(CI->getType()->isVoidTy() && "Histogram with non-void return."); + Value *Ptrs = CI->getArgOperand(0); + Value *Inc = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + + auto *AddrType = cast<FixedVectorType>(Ptrs->getType()); + Type *EltTy = Inc->getType(); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + Builder.SetInsertPoint(InsertPt); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // FIXME: Do we need to add an alignment parameter to the intrinsic? + unsigned VectorWidth = AddrType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + if (isConstantIntVector(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); + Value *Add = Builder.CreateAdd(Load, Inc); + Builder.CreateStore(Add, Ptr); + } + CI->eraseFromParent(); + return; + } + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + Value *Predicate = + Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + + Instruction *ThenTerm = + SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DTU); + + BasicBlock *CondBlock = ThenTerm->getParent(); + CondBlock->setName("cond.histogram.update"); + + Builder.SetInsertPoint(CondBlock->getTerminator()); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); + Value *Add = Builder.CreateAdd(Load, Inc); + Builder.CreateStore(Add, Ptr); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0); + NewIfBlock->setName("else"); + Builder.SetInsertPoint(NewIfBlock, NewIfBlock->begin()); + } + + CI->eraseFromParent(); + ModifiedDT = true; +} + static bool runImpl(Function &F, const TargetTransformInfo &TTI, DominatorTree *DT) { std::optional<DomTreeUpdater> DTU; @@ -860,7 +933,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, bool EverMadeChange = false; bool MadeChange = true; - auto &DL = F.getParent()->getDataLayout(); + auto &DL = F.getDataLayout(); while (MadeChange) { MadeChange = false; for (BasicBlock &BB : llvm::make_early_inc_range(F)) { @@ -928,6 +1001,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, switch (II->getIntrinsicID()) { default: break; + case Intrinsic::experimental_vector_histogram_add: + if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(), + CI->getArgOperand(1)->getType())) + return false; + scalarizeMaskedVectorHistogram(DL, CI, DTU, ModifiedDT); + return true; case Intrinsic::masked_load: // Scalarize unsupported vector masked load if (TTI.isLegalMaskedLoad( @@ -969,12 +1048,16 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, return true; } case Intrinsic::masked_expandload: - if (TTI.isLegalMaskedExpandLoad(CI->getType())) + if (TTI.isLegalMaskedExpandLoad( + CI->getType(), + 
CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne())) return false; scalarizeMaskedExpandLoad(DL, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_compressstore: - if (TTI.isLegalMaskedCompressStore(CI->getArgOperand(0)->getType())) + if (TTI.isLegalMaskedCompressStore( + CI->getArgOperand(0)->getType(), + CI->getAttributes().getParamAttrs(1).getAlignment().valueOrOne())) return false; scalarizeMaskedCompressStore(DL, CI, DTU, ModifiedDT); return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 3eca9ac7c267..2bed3480da1c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -523,8 +523,8 @@ void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV) { SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; Op->getAllMetadataOtherThanDebugLoc(MDs); - for (unsigned I = 0, E = CV.size(); I != E; ++I) { - if (Instruction *New = dyn_cast<Instruction>(CV[I])) { + for (Value *V : CV) { + if (Instruction *New = dyn_cast<Instruction>(V)) { for (const auto &MD : MDs) if (canTransferMetadata(MD.first)) New->setMetadata(MD.first, MD.second); @@ -1107,7 +1107,7 @@ bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) { return false; std::optional<VectorLayout> Layout = getVectorLayout( - LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout()); + LI.getType(), LI.getAlign(), LI.getDataLayout()); if (!Layout) return false; @@ -1133,7 +1133,7 @@ bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) { Value *FullValue = SI.getValueOperand(); std::optional<VectorLayout> Layout = getVectorLayout( - FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout()); + FullValue->getType(), SI.getAlign(), SI.getDataLayout()); if (!Layout) return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 17c466f38c9c..73e3ff296cf1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -57,7 +57,7 @@ // // base = gep a, 0, x, y // load base -// laod base + 1 * sizeof(float) +// load base + 1 * sizeof(float) // load base + 32 * sizeof(float) // load base + 33 * sizeof(float) // @@ -174,6 +174,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -235,18 +236,16 @@ public: /// \p UserChainTail Outputs the tail of UserChain so that we can /// garbage-collect unused instructions in UserChain. static Value *Extract(Value *Idx, GetElementPtrInst *GEP, - User *&UserChainTail, const DominatorTree *DT); + User *&UserChainTail); /// Looks for a constant offset from the given GEP index without extracting /// it. It returns the numeric value of the extracted constant offset (0 if /// failed). The meaning of the arguments are the same as Extract. 
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP, - const DominatorTree *DT); + static int64_t Find(Value *Idx, GetElementPtrInst *GEP); private: - ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT) - : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) { - } + ConstantOffsetExtractor(BasicBlock::iterator InsertionPt) + : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {} /// Searches the expression that computes V for a non-zero constant C s.t. /// V can be reassociated into the form V' + C. If the searching is @@ -333,10 +332,9 @@ private: SmallVector<CastInst *, 16> ExtInsts; /// Insertion position of cloned instructions. - Instruction *IP; + BasicBlock::iterator IP; const DataLayout &DL; - const DominatorTree *DT; }; /// A pass that tries to split every GEP in the function into a variadic @@ -393,6 +391,11 @@ private: /// and returns true if the splitting succeeds. bool splitGEP(GetElementPtrInst *GEP); + /// Tries to reorder the given GEP with the GEP that produces the base if + /// doing so results in producing a constant offset as the outermost + /// index. + bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI); + /// Lower a GEP with multiple indices into multiple GEPs with a single index. /// Function splitGEP already split the original GEP into a variadic part and /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the @@ -519,12 +522,10 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, } Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS - // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - // FIXME: this does not appear to be covered by any tests - // (with x86/aarch64 backends at least) + // Do not trace into "or" unless it is equivalent to "add". + // This is the case if the or's disjoint flag is set. if (BO->getOpcode() == Instruction::Or && - !haveNoCommonBitsSet(LHS, RHS, SimplifyQuery(DL, DT, /*AC*/ nullptr, BO))) + !cast<PossiblyDisjointInst>(BO)->isDisjoint()) return false; // FIXME: We don't currently support constants from the RHS of subs, @@ -669,7 +670,7 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { Instruction *Ext = I->clone(); Ext->setOperand(0, Current); - Ext->insertBefore(IP); + Ext->insertBefore(*IP->getParent(), IP); Current = Ext; } return Current; @@ -778,9 +779,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, - User *&UserChainTail, - const DominatorTree *DT) { - ConstantOffsetExtractor Extractor(GEP, DT); + User *&UserChainTail) { + ConstantOffsetExtractor Extractor(GEP->getIterator()); // Find a non-zero constant offset first. APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, @@ -795,10 +795,9 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, return IdxWithoutConstOffset; } -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP, - const DominatorTree *DT) { +int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. 
- return ConstantOffsetExtractor(GEP, DT) + return ConstantOffsetExtractor(GEP->getIterator()) .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, GEP->isInBounds()) .getSExtValue(); @@ -814,7 +813,8 @@ bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize( // Skip struct member indices which must be i32. if (GTI.isSequential()) { if ((*I)->getType() != PtrIdxTy) { - *I = CastInst::CreateIntegerCast(*I, PtrIdxTy, true, "idxprom", GEP); + *I = CastInst::CreateIntegerCast(*I, PtrIdxTy, true, "idxprom", + GEP->getIterator()); Changed = true; } } @@ -836,7 +836,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT); + ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); if (ConstantOffset != 0) { NeedsExtraction = true; // A GEP may have multiple indices. We accumulate the extracted @@ -970,6 +970,49 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, Variadic->eraseFromParent(); } +bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP, + TargetTransformInfo &TTI) { + auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand()); + if (!PtrGEP) + return false; + + bool NestedNeedsExtraction; + int64_t NestedByteOffset = + accumulateByteOffset(PtrGEP, NestedNeedsExtraction); + if (!NestedNeedsExtraction) + return false; + + unsigned AddrSpace = PtrGEP->getPointerAddressSpace(); + if (!TTI.isLegalAddressingMode(GEP->getResultElementType(), + /*BaseGV=*/nullptr, NestedByteOffset, + /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace)) + return false; + + bool GEPInBounds = GEP->isInBounds(); + bool PtrGEPInBounds = PtrGEP->isInBounds(); + bool IsChainInBounds = GEPInBounds && PtrGEPInBounds; + if (IsChainInBounds) { + auto IsKnownNonNegative = [this](Value *V) { + return isKnownNonNegative(V, *DL); + }; + IsChainInBounds &= all_of(GEP->indices(), IsKnownNonNegative); + if (IsChainInBounds) + IsChainInBounds &= all_of(PtrGEP->indices(), IsKnownNonNegative); + } + + IRBuilder<> Builder(GEP); + // For trivial GEP chains, we can swap the indices. + Value *NewSrc = Builder.CreateGEP( + GEP->getSourceElementType(), PtrGEP->getPointerOperand(), + SmallVector<Value *, 4>(GEP->indices()), "", IsChainInBounds); + Value *NewGEP = Builder.CreateGEP(PtrGEP->getSourceElementType(), NewSrc, + SmallVector<Value *, 4>(PtrGEP->indices()), + "", IsChainInBounds); + GEP->replaceAllUsesWith(NewGEP); + RecursivelyDeleteTriviallyDeadInstructions(GEP); + return true; +} + bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Skip vector GEPs. if (GEP->getType()->isVectorTy()) @@ -985,11 +1028,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { bool NeedsExtraction; int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); - if (!NeedsExtraction) - return Changed; - TargetTransformInfo &TTI = GetTTI(*GEP->getFunction()); + if (!NeedsExtraction) { + Changed |= reorderGEP(GEP, TTI); + return Changed; + } + // If LowerGEP is disabled, before really splitting the GEP, check whether the // backend supports the addressing mode we are about to produce. If no, this // splitting probably won't be beneficial. 
@@ -1026,7 +1071,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Value *OldIdx = GEP->getOperand(I); User *UserChainTail; Value *NewIdx = - ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT); + ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail); if (NewIdx != nullptr) { // Switches to the index with the constant offset removed. GEP->setOperand(I, NewIdx); @@ -1057,8 +1102,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. + // TODO(gep_nowrap): Preserve nuw at least. bool GEPWasInBounds = GEP->isInBounds(); - GEP->setIsInBounds(false); + GEP->setNoWrapFlags(GEPNoWrapFlags::none()); // Lowers a GEP to either GEPs with a single index or arithmetic operations. if (LowerGEP) { @@ -1133,7 +1179,7 @@ bool SeparateConstOffsetFromGEP::run(Function &F) { if (DisableSeparateConstOffsetFromGEP) return false; - DL = &F.getParent()->getDataLayout(); + DL = &F.getDataLayout(); bool Changed = false; for (BasicBlock &B : F) { if (!DT->isReachableFromEntry(&B)) @@ -1188,9 +1234,11 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { if (LHS->getType() == RHS->getType()) { ExprKey Key = createNormalizedCommutablePair(LHS, RHS); if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) { - Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + Instruction *NewSExt = + new SExtInst(Dom, I->getType(), "", I->getIterator()); NewSExt->takeName(I); I->replaceAllUsesWith(NewSExt); + NewSExt->setDebugLoc(I->getDebugLoc()); RecursivelyDeleteTriviallyDeadInstructions(I); return true; } @@ -1199,9 +1247,11 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { if (LHS->getType() == RHS->getType()) { if (auto *Dom = findClosestMatchingDominator({LHS, RHS}, I, DominatingSubs)) { - Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + Instruction *NewSExt = + new SExtInst(Dom, I->getType(), "", I->getIterator()); NewSExt->takeName(I); I->replaceAllUsesWith(NewSExt); + NewSExt->setDebugLoc(I->getDebugLoc()); RecursivelyDeleteTriviallyDeadInstructions(I); return true; } @@ -1321,7 +1371,7 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, Second->setOperand(1, Offset1); // We changed p+o+c to p+c+o, p+c may not be inbound anymore. - const DataLayout &DAL = First->getModule()->getDataLayout(); + const DataLayout &DAL = First->getDataLayout(); APInt Offset(DAL.getIndexSizeInBits( cast<PointerType>(First->getType())->getAddressSpace()), 0); @@ -1330,8 +1380,9 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, uint64_t ObjectSize; if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || Offset.ugt(ObjectSize)) { - First->setIsInBounds(false); - Second->setIsInBounds(false); + // TODO(gep_nowrap): Make flag preservation more precise. 
+ First->setNoWrapFlags(GEPNoWrapFlags::none()); + Second->setNoWrapFlags(GEPNoWrapFlags::none()); + } else First->setIsInBounds(true); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 7eb0ba1c2c17..c235d2fb2a5b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Use.h" @@ -133,6 +134,7 @@ static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( "not-taken 1/<this option> times or less."), cl::init(16)); +AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key; namespace { struct CompareDesc { BranchInst *Term; @@ -630,7 +632,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, } else { // Create a new unconditional branch that will continue the loop as a new // terminator. - BranchInst::Create(ContinueBB, ParentBB); + Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB); + NewBI->setDebugLoc(BI.getDebugLoc()); } BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); @@ -664,10 +667,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // Finish updating dominator tree and memory ssa for full unswitch. if (FullUnswitch) { if (MSSAU) { - // Remove the cloned branch instruction. - ParentBB->getTerminator()->eraseFromParent(); - // Create unconditional branch now. - BranchInst::Create(ContinueBB, ParentBB); + Instruction *Term = ParentBB->getTerminator(); + // Remove the cloned branch instruction and create unconditional branch + // now. + Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB); + NewBI->setDebugLoc(Term->getDebugLoc()); + Term->eraseFromParent(); MSSAU->removeEdge(ParentBB, LoopExitBB); } DT.deleteEdge(ParentBB, LoopExitBB); @@ -859,8 +864,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU); OldPH->getTerminator()->eraseFromParent(); - // Now add the unswitched switch. + // Now add the unswitched switch. This new switch instruction inherits the + // debug location of the old switch, because it semantically replaces the old + // one. auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); + NewSI->setDebugLoc(SIW->getDebugLoc()); SwitchInstProfUpdateWrapper NewSIW(*NewSI); // Rewrite the IR for the unswitched basic blocks. This requires two steps. @@ -970,8 +978,9 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, /*KeepOneInputPHIs*/ true); } // Now nuke the switch and replace it with a direct branch. 
+ Instruction *NewBI = BranchInst::Create(CommonSuccBB, BB); + NewBI->setDebugLoc(SIW->getDebugLoc()); SIW.eraseFromParent(); - BranchInst::Create(CommonSuccBB, BB); } else if (DefaultExitBB) { assert(SI.getNumCases() > 0 && "If we had no cases we'd have a common successor!"); @@ -1243,9 +1252,12 @@ static BasicBlock *buildClonedLoopBlocks( if (SE && isa<PHINode>(I)) SE->forgetValue(&I); + BasicBlock::iterator InsertPt = MergeBB->getFirstInsertionPt(); + auto *MergePN = PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi"); - MergePN->insertBefore(MergeBB->getFirstInsertionPt()); + MergePN->insertBefore(InsertPt); + MergePN->setDebugLoc(InsertPt->getDebugLoc()); I.replaceAllUsesWith(MergePN); MergePN->addIncoming(&I, ExitBB); MergePN->addIncoming(&ClonedI, ClonedExitBB); @@ -1260,8 +1272,8 @@ static BasicBlock *buildClonedLoopBlocks( Module *M = ClonedPH->getParent()->getParent(); for (auto *ClonedBB : NewBlocks) for (Instruction &I : *ClonedBB) { - RemapDPValueRange(M, I.getDbgValueRange(), VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + RemapDbgRecordRange(M, I.getDbgRecordRange(), VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); if (auto *II = dyn_cast<AssumeInst>(&I)) @@ -1304,8 +1316,9 @@ static BasicBlock *buildClonedLoopBlocks( else if (auto *SI = dyn_cast<SwitchInst>(ClonedTerminator)) ClonedConditionToErase = SI->getCondition(); + Instruction *BI = BranchInst::Create(ClonedSuccBB, ClonedParentBB); + BI->setDebugLoc(ClonedTerminator->getDebugLoc()); ClonedTerminator->eraseFromParent(); - BranchInst::Create(ClonedSuccBB, ClonedParentBB); if (ClonedConditionToErase) RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr, @@ -2332,23 +2345,27 @@ static void unswitchNontrivialInvariants( // nuke the initial terminator placed in the split block. SplitBB->getTerminator()->eraseFromParent(); if (FullUnswitch) { - // Splice the terminator from the original loop and rewrite its - // successors. - TI.moveBefore(*SplitBB, SplitBB->end()); - // Keep a clone of the terminator for MSSA updates. Instruction *NewTI = TI.clone(); NewTI->insertInto(ParentBB, ParentBB->end()); + // Splice the terminator from the original loop and rewrite its + // successors. + TI.moveBefore(*SplitBB, SplitBB->end()); + TI.dropLocation(); + // First wire up the moved terminator to the preheaders. if (BI) { BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); Value *Cond = skipTrivialSelect(BI->getCondition()); - if (InsertFreeze) - Cond = new FreezeInst( - Cond, Cond->getName() + ".fr", BI); + if (InsertFreeze) { + // We don't give any debug location to the new freeze, because the + // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted + // out of the loop. 
+ Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator()); + } BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { @@ -2365,8 +2382,9 @@ static void unswitchNontrivialInvariants( Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); if (InsertFreeze) - SI->setCondition(new FreezeInst( - SI->getCondition(), SI->getCondition()->getName() + ".fr", SI)); + SI->setCondition(new FreezeInst(SI->getCondition(), + SI->getCondition()->getName() + ".fr", + SI->getIterator())); // We need to use the set to populate domtree updates as even when there // are multiple cases pointing at the same successor we only want to @@ -2430,12 +2448,13 @@ static void unswitchNontrivialInvariants( DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); } - // After MSSAU update, remove the cloned terminator instruction NewTI. - ParentBB->getTerminator()->eraseFromParent(); - // Create a new unconditional branch to the continuing block (as opposed to // the one cloned). - BranchInst::Create(RetainedSuccBB, ParentBB); + Instruction *NewBI = BranchInst::Create(RetainedSuccBB, ParentBB); + NewBI->setDebugLoc(NewTI->getDebugLoc()); + + // After MSSAU update, remove the cloned terminator instruction NewTI. + NewTI->eraseFromParent(); } else { assert(BI && "Only branches have partial unswitching."); assert(UnswitchedSuccBBs.size() == 1 && @@ -2704,9 +2723,11 @@ static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, if (MSSAU) MSSAU->moveAllAfterSpliceBlocks(HeadBB, TailBB, SI); - PHINode *Phi = PHINode::Create(SI->getType(), 2, "unswitched.select", SI); + PHINode *Phi = + PHINode::Create(SI->getType(), 2, "unswitched.select", SI->getIterator()); Phi->addIncoming(SI->getTrueValue(), ThenBB); Phi->addIncoming(SI->getFalseValue(), HeadBB); + Phi->setDebugLoc(SI->getDebugLoc()); SI->replaceAllUsesWith(Phi); SI->eraseFromParent(); @@ -3092,7 +3113,7 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, // unswitching will break. Better optimize it away later. auto *InjectedCond = ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond", - Preheader->getTerminator()); + Preheader->getTerminator()->getIterator()); BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check", BB->getParent(), InLoopSucc); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 7017f6adf3a2..11de37f7a7c1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -77,6 +77,9 @@ static cl::opt<bool> UserSinkCommonInsts( "sink-common-insts", cl::Hidden, cl::init(false), cl::desc("Sink common instructions (default = false)")); +static cl::opt<bool> UserSpeculateUnpredictables( + "speculate-unpredictables", cl::Hidden, cl::init(false), + cl::desc("Speculate unpredictable branches (default = false)")); STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -142,8 +145,10 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs, // And turn BB into a block that just unconditionally branches // to the canonical block. 
+ Instruction *BI = BranchInst::Create(CanonicalBB, BB); + BI->setDebugLoc(Term->getDebugLoc()); Term->eraseFromParent(); - BranchInst::Create(CanonicalBB, BB); + if (Updates) Updates->push_back({DominatorTree::Insert, BB, CanonicalBB}); } @@ -323,6 +328,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.HoistCommonInsts = UserHoistCommonInsts; if (UserSinkCommonInsts.getNumOccurrences()) Options.SinkCommonInsts = UserSinkCommonInsts; + if (UserSpeculateUnpredictables.getNumOccurrences()) + Options.SpeculateUnpredictables = UserSpeculateUnpredictables; } SimplifyCFGPass::SimplifyCFGPass() { @@ -349,7 +356,9 @@ void SimplifyCFGPass::printPipeline( OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;"; OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;"; OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;"; - OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch"; + OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;"; + OS << (Options.SpeculateUnpredictables ? "" : "no-") + << "speculate-unpredictables"; OS << '>'; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index 7a5318d4404c..ed9c1828ce06 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -260,36 +260,47 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I, } } +// Do not hoist any debug info intrinsics. +// ... +// if (cond) { +// x = y * z; +// foo(); +// } +// ... +// -------- Which then becomes: +// ... +// if.then: +// %x = mul i32 %y, %z +// call void @llvm.dbg.value(%x, !"x", !DIExpression()) +// call void foo() +// +// SpeculativeExecution might decide to hoist the 'y * z' calculation +// out of the 'if' block, because it is more efficient that way, so the +// '%x = mul i32 %y, %z' moves to the block above. But it might also +// decide to hoist the 'llvm.dbg.value' call. +// This is incorrect, because even if we've moved the calculation of +// 'y * z', we should not see the value of 'x' change unless we +// actually go inside the 'if' block. + bool SpeculativeExecutionPass::considerHoistingFromTo( BasicBlock &FromBlock, BasicBlock &ToBlock) { SmallPtrSet<const Instruction *, 8> NotHoisted; - const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) { - // Debug variable has special operand to check it's not hoisted. - if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) { - return all_of(DVI->location_ops(), [&NotHoisted](Value *V) { - if (const auto *I = dyn_cast_or_null<Instruction>(V)) { - if (!NotHoisted.contains(I)) - return true; - } - return false; - }); - } - - // Usially debug label intrinsic corresponds to label in LLVM IR. In these - // cases we should not move it here. - // TODO: Possible special processing needed to detect it is related to a - // hoisted instruction. 
- if (isa<DbgLabelInst>(U)) - return false; - - for (const Value *V : U->operand_values()) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { + auto HasNoUnhoistedInstr = [&NotHoisted](auto Values) { + for (const Value *V : Values) { + if (const auto *I = dyn_cast_or_null<Instruction>(V)) if (NotHoisted.contains(I)) return false; - } } return true; }; + auto AllPrecedingUsesFromBlockHoisted = + [&HasNoUnhoistedInstr](const User *U) { + // Do not hoist any debug info intrinsics. + if (isa<DbgInfoIntrinsic>(U)) + return false; + + return HasNoUnhoistedInstr(U->operand_values()); + }; InstructionCost TotalSpeculationCost = 0; unsigned NotHoistedInstCount = 0; @@ -316,7 +327,8 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( auto Current = I; ++I; if (!NotHoisted.count(&*Current)) { - Current->moveBeforePreserving(ToBlock.getTerminator()); + Current->moveBefore(ToBlock.getTerminator()); + Current->dropLocation(); } } return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 75910d7b698a..75585fcc8026 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -425,14 +425,12 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd( // Returns true if A matches B + C where C is constant. static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) { - return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) || - match(A, m_Add(m_ConstantInt(C), m_Value(B)))); + return match(A, m_c_Add(m_Value(B), m_ConstantInt(C))); } // Returns true if A matches B | C where C is constant. static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) { - return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) || - match(A, m_Or(m_ConstantInt(C), m_Value(B)))); + return match(A, m_c_Or(m_Value(B), m_ConstantInt(C))); } void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( @@ -715,7 +713,7 @@ namespace llvm { PreservedAnalyses StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) { - const DataLayout *DL = &F.getParent()->getDataLayout(); + const DataLayout *DL = &F.getDataLayout(); auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); auto *TTI = &AM.getResult<TargetIRAnalysis>(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 7d96a3478858..9c711ec18382 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -772,7 +772,7 @@ void StructurizeCFG::simplifyAffectedPhis() { bool Changed; do { Changed = false; - SimplifyQuery Q(Func->getParent()->getDataLayout()); + SimplifyQuery Q(Func->getDataLayout()); Q.DT = DT; // Setting CanUseUndef to true might extend value liveness, set it to false // to achieve better register pressure. 
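The StraightLineStrengthReduce hunk above replaces each pair of hand-written matches (one per operand order) with a single commutative matcher. A minimal sketch of the equivalence, using only the public PatternMatch.h API; the helper name is illustrative and not part of the patch:

    // m_c_Add tries both operand orders, so one call covers "B + C" and
    // "C + B" where C is a constant integer.
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace PatternMatch;

    static bool matchesAddEitherOrder(Value *A, Value *&B, ConstantInt *&C) {
      return match(A, m_c_Add(m_Value(B), m_ConstantInt(C)));
    }

The same reasoning applies to m_c_Or in matchesOr: the behavior is unchanged, the pattern is just written once instead of twice.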
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c6e8505d5ab4..1b3e6d9549b8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -349,7 +349,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // does not write to memory and the load provably won't trap. // Writes to memory only matter if they may alias the pointer // being loaded from. - const DataLayout &DL = L->getModule()->getDataLayout(); + const DataLayout &DL = L->getDataLayout(); if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), L->getAlign(), DL, L)) @@ -509,8 +509,10 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB); NewEntry->takeName(HeaderBB); HeaderBB->setName("tailrecurse"); - BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry); - BI->setDebugLoc(CI->getDebugLoc()); + BranchInst::Create(HeaderBB, NewEntry); + // If the new branch preserves the debug location of CI, it could result in + // misleading stepping, if CI is located in a conditional branch. + // So, here we don't give any debug location to the new branch. // Move all fixed sized allocas from HeaderBB to NewEntry. for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), @@ -592,7 +594,7 @@ void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx) { Type *AggTy = CI->getParamByValType(OpndIdx); assert(AggTy); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); // Get alignment of byVal operand. Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne()); @@ -601,7 +603,7 @@ void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI, // Put alloca into the entry block. Value *NewAlloca = new AllocaInst( AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment, - CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin()); + CI->getArgOperand(OpndIdx)->getName(), F.getEntryBlock().begin()); IRBuilder<> Builder(CI); Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy)); @@ -619,7 +621,7 @@ void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments( CallInst *CI, int OpndIdx) { Type *AggTy = CI->getParamByValType(OpndIdx); assert(AggTy); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getDataLayout(); // Get alignment of byVal operand. Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne()); @@ -714,8 +716,9 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) { // We found a return value we want to use, insert a select instruction to // select it if we don't already know what our return value will be and // store the result in our return value PHI node. 
- SelectInst *SI = SelectInst::Create( - RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret); + SelectInst *SI = + SelectInst::Create(RetKnownPN, RetPN, Ret->getReturnValue(), + "current.ret.tr", Ret->getIterator()); RetSelects.push_back(SI); RetPN->addIncoming(SI, BB); @@ -728,7 +731,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) { // Now that all of the PHI nodes are in place, remove the call and // ret instructions, replacing them with an unconditional branch. - BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret); + BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret->getIterator()); NewBI->setDebugLoc(CI->getDebugLoc()); Ret->eraseFromParent(); // Remove return. @@ -746,7 +749,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { // call. for (PHINode *PN : ArgumentPHIs) { // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = simplifyInstruction(PN, F.getParent()->getDataLayout())) { + if (Value *PNV = simplifyInstruction(PN, F.getDataLayout())) { PN->replaceAllUsesWith(PNV); PN->eraseFromParent(); } @@ -776,6 +779,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN, RI->getOperand(0)); AccRecInstrNew->insertBefore(RI); + AccRecInstrNew->dropLocation(); RI->setOperand(0, AccRecInstrNew); } } @@ -787,8 +791,9 @@ void TailRecursionEliminator::cleanupAndFinalize() { if (!RI) continue; - SelectInst *SI = SelectInst::Create( - RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI); + SelectInst *SI = + SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0), + "current.ret.tr", RI->getIterator()); RetSelects.push_back(SI); RI->setOperand(0, SI); } @@ -803,6 +808,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN, SI->getFalseValue()); AccRecInstrNew->insertBefore(SI); + AccRecInstrNew->dropLocation(); SI->setFalseValue(AccRecInstrNew); } } |
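Two conventions recur throughout the hunks in this commit: insertion positions are passed as BasicBlock::iterator values (via getIterator()) rather than as raw Instruction pointers, and an instruction that is recreated or duplicated in a new block either receives an explicit debug location (setDebugLoc) or discards one (dropLocation) so source-level stepping is not misattributed. A minimal sketch of that pattern, with Orig and InsertPt standing in for whatever instruction and insertion point a caller already has; this is illustrative only, not code taken from the patch:

    // Clone Orig, place the copy before InsertPt using the iterator form,
    // and drop its source location so the debugger does not attribute the
    // copy to the original line.
    Instruction *Copy = Orig->clone();
    Copy->insertBefore(InsertPt->getIterator());
    Copy->dropLocation();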