Diffstat (limited to 'lib/Transforms')
20 files changed, 482 insertions, 209 deletions
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 3d57acf06e74..93eab680ca6b 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2026,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
       continue;
     }
 
+    // LLVM's definition of dominance allows instructions that are cyclic
+    // in unreachable blocks, e.g.:
+    // %pat = select i1 %condition, @global, i16* %pat
+    // because any instruction dominates an instruction in a block that's
+    // not reachable from entry.
+    // So, remove unreachable blocks from the function, because a) there's
+    // no point in analyzing them and b) GlobalOpt should otherwise grow
+    // some more complicated logic to break these cycles.
+    // Removing unreachable blocks might invalidate the dominator so we
+    // recalculate it.
+    if (!F->isDeclaration()) {
+      if (removeUnreachableBlocks(*F)) {
+        auto &DT = LookupDomTree(*F);
+        DT.recalculate(*F);
+        Changed = true;
+      }
+    }
+
     Changed |= processGlobal(*F, TLI, LookupDomTree);
 
     if (!F->hasLocalLinkage())
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 00ddb93df830..317770d133b3 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -909,7 +909,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
         // To check this we also need to nuke any dead constant uses (perhaps
         // made dead by this operation on other functions).
         Callee.removeDeadConstantUsers();
-        if (Callee.use_empty()) {
+        if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
           Calls.erase(
               std::remove_if(Calls.begin() + i + 1, Calls.end(),
                              [&Callee](const std::pair<CallSite, int> &Call) {
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index ac4765f96075..6baada2c1ae1 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -173,8 +173,10 @@ protected:
   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
   bool computeBlockWeights(Function &F);
   void findEquivalenceClasses(Function &F);
+  template <bool IsPostDom>
   void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
-                           DominatorTreeBase<BasicBlock> *DomTree);
+                           DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
+
   void propagateWeights(Function &F);
   uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
   void buildEdges(Function &F);
@@ -217,7 +219,7 @@ protected:
   /// \brief Dominance, post-dominance and loop information.
   std::unique_ptr<DominatorTree> DT;
-  std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT;
+  std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
   std::unique_ptr<LoopInfo> LI;
 
   AssumptionCacheTracker *ACT;
@@ -773,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions(
 /// \param DomTree  Opposite dominator tree. If \p Descendants is filled
 ///                 with blocks from \p BB1's dominator tree, then
 ///                 this is the post-dominator tree, and vice versa.
+template <bool IsPostDom>
 void SampleProfileLoader::findEquivalencesFor(
     BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
-    DominatorTreeBase<BasicBlock> *DomTree) {
+    DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
   const BasicBlock *EC = EquivalenceClass[BB1];
   uint64_t Weight = BlockWeights[EC];
   for (const auto *BB2 : Descendants) {
@@ -1283,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
   DT.reset(new DominatorTree);
   DT->recalculate(F);
 
-  PDT.reset(new DominatorTreeBase<BasicBlock>(true));
+  PDT.reset(new PostDomTreeBase<BasicBlock>());
   PDT->recalculate(F);
 
   LI.reset(new LoopInfo);
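The GlobalOpt change above hinges on a subtlety of LLVM's dominance rules: an instruction in a block unreachable from entry may legally use itself. A minimal standalone sketch of that situation (illustrative only, not part of the patch; assumes LLVM headers and libraries are available to build against):

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/Local.h"
#include <memory>

int main() {
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  // %pat uses itself; this only passes the verifier because %bad is
  // unreachable, where every definition "dominates" every use.
  std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(
      "@global = global i16 0\n"
      "define void @f() {\n"
      "entry:\n"
      "  ret void\n"
      "bad:\n"
      "  %pat = select i1 undef, i16* @global, i16* %pat\n"
      "  br label %bad\n"
      "}\n",
      Err, Ctx);
  llvm::Function &F = *M->getFunction("f");
  // Deletes %bad, and with it the self-referential select, so a later
  // analysis never has to break the cycle itself.
  llvm::removeUnreachableBlocks(F);
  return 0;
}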
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 773c86e23707..fdc9c373b95e 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1284,6 +1284,16 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   if (Value *V = SimplifyBSwap(I, Builder))
     return replaceInstUsesWith(I, V);
 
+  if (match(Op1, m_One())) {
+    // (1 << x) & 1 --> zext(x == 0)
+    // (1 >> x) & 1 --> zext(x == 0)
+    Value *X;
+    if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X))))) {
+      Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0));
+      return new ZExtInst(IsZero, I.getType());
+    }
+  }
+
   if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
     const APInt &AndRHSMask = AndRHS->getValue();
 
@@ -1315,23 +1325,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
         break;
       }
 
-      case Instruction::Sub:
-        // -x & 1 -> x & 1
-        if (AndRHSMask.isOneValue() && match(Op0LHS, m_Zero()))
-          return BinaryOperator::CreateAnd(Op0RHS, AndRHS);
-
-        break;
-
-      case Instruction::Shl:
-      case Instruction::LShr:
-        // (1 << x) & 1 --> zext(x == 0)
-        // (1 >> x) & 1 --> zext(x == 0)
-        if (AndRHSMask.isOneValue() && Op0LHS == AndRHS) {
-          Value *NewICmp =
-              Builder.CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType()));
-          return new ZExtInst(NewICmp, I.getType());
-        }
-        break;
       }
 
     // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
@@ -1417,12 +1410,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     }
   }
 
-  // (A&((~A)|B)) -> A&B
-  if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), m_Value(A))))
-    return BinaryOperator::CreateAnd(A, Op1);
-  if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A))))
-    return BinaryOperator::CreateAnd(A, Op0);
-
   // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
   if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
     if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
@@ -2020,18 +2007,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
 
   Value *A, *B;
 
-  // ((~A & B) | A) -> (A | B)
-  if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A))))
-    return BinaryOperator::CreateOr(A, Op1);
-  if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A))))
-    return BinaryOperator::CreateOr(Op0, A);
-
-  // ((A & B) | ~A) -> (~A | B)
-  // The NOT is guaranteed to be in the RHS by complexity ordering.
-  if (match(Op1, m_Not(m_Value(A))) &&
-      match(Op0, m_c_And(m_Specific(A), m_Value(B))))
-    return BinaryOperator::CreateOr(Op1, B);
-
   // (A & C)|(B & D)
   Value *C = nullptr, *D = nullptr;
   if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
@@ -2176,17 +2151,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     return BinaryOperator::CreateOr(Not, Op0);
   }
 
-  // (A & B) | (~A ^ B) -> (~A ^ B)
-  // (A & B) | (B ^ ~A) -> (~A ^ B)
-  // (B & A) | (~A ^ B) -> (~A ^ B)
-  // (B & A) | (B ^ ~A) -> (~A ^ B)
-  // The match order is important: match the xor first because the 'not'
-  // operation defines 'A'. We do not need to match the xor as Op0 because the
-  // xor was canonicalized to Op1 above.
-  if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
-      match(Op0, m_c_And(m_Specific(A), m_Specific(B))))
-    return BinaryOperator::CreateXor(Builder.CreateNot(A), B);
-
   if (SwappedForXor)
     std::swap(Op0, Op1);
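The relocated visitAnd fold is a pure bit identity: a logical shift of 1 has its low bit set only when the shift amount is zero. A quick standalone check (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // (1 << x) & 1 == zext(x == 0), and likewise (1 >> x) & 1,
  // for every shift amount that avoids undefined behavior.
  for (uint32_t x = 0; x < 32; ++x) {
    uint32_t expected = (x == 0) ? 1u : 0u;
    assert(((1u << x) & 1u) == expected);
    assert(((1u >> x) & 1u) == expected);
  }
  return 0;
}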
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 60d1cde971dd..a8faaecb5c34 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1814,9 +1814,21 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
         Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
     Value *CmpQ =
         Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
-    auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And
-                                                         : Instruction::Or;
-    return BinaryOperator::Create(LogicOpc, CmpP, CmpQ);
+    auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+    return BinaryOperator::Create(BOpc, CmpP, CmpQ);
+  }
+
+  // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
+  // a shorter form that has more potential to be folded even further.
+  Value *X1, *X2, *X3, *X4;
+  if (match(Or->getOperand(0), m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
+      match(Or->getOperand(1), m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
+    // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
+    // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
+    Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
+    Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
+    auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+    return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
   }
 
   return nullptr;
@@ -3737,6 +3749,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
         const APInt &CVal = CI->getValue();
         if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
           return nullptr;
+      } else {
+        // In this case we could have the operand of the binary operation
+        // being defined in another block, and performing the replacement
+        // could break the dominance relation.
+        return nullptr;
       }
     } else {
       // Other uses prohibit this transformation.
@@ -3856,18 +3873,17 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
       } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
         assert(BO->getOpcode() == Instruction::And);
         // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
-        Value *ShortMask =
-            Builder.CreateTrunc(BO->getOperand(1), Builder.getIntNTy(MulWidth));
+        ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+        APInt ShortMask = CI->getValue().trunc(MulWidth);
         Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
-        Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
-        if (auto *ZextI = dyn_cast<Instruction>(Zext))
-          IC.Worklist.Add(ZextI);
+        Instruction *Zext =
+            cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType()));
+        IC.Worklist.Add(Zext);
         IC.replaceInstUsesWith(*BO, Zext);
       } else {
         llvm_unreachable("Unexpected Binary operation");
       }
-      if (auto *UI = dyn_cast<Instruction>(U))
-        IC.Worklist.Add(UI);
+      IC.Worklist.Add(cast<Instruction>(U));
     }
   }
   if (isa<Instruction>(OtherVal))
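The xor-to-icmp rewrite rests on the fact that x ^ y is zero exactly when x == y, and an or of two values is zero exactly when both are. A throwaway exhaustive check over small operands (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x1 = 0; x1 < 8; ++x1)
    for (unsigned x2 = 0; x2 < 8; ++x2)
      for (unsigned x3 = 0; x3 < 8; ++x3)
        for (unsigned x4 = 0; x4 < 8; ++x4) {
          bool orIsZero = ((x1 ^ x2) | (x3 ^ x4)) == 0;
          // ((X1 ^ X2) | (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
          assert(orIsZero == (x1 == x2 && x3 == x4));
          // ((X1 ^ X2) | (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
          assert(!orIsZero == (x1 != x2 || x3 != x4));
        }
  return 0;
}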
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c59e1ce69ac2..451036545741 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -998,8 +998,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
       // that this code is not reachable.  We do this instead of inserting
       // an unreachable instruction directly because we cannot modify the
       // CFG.
-      new StoreInst(UndefValue::get(LI.getType()),
-                    Constant::getNullValue(Op->getType()), &LI);
+      StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
+                                    Constant::getNullValue(Op->getType()), &LI);
+      SI->setDebugLoc(LI.getDebugLoc());
       return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
     }
 
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 5689c0604239..a20f474cbf40 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -417,8 +417,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // the highest demanded bit, we just return the other side.
       if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
         return I->getOperand(0);
-      // We can't do this with the LHS for subtraction.
-      if (I->getOpcode() == Instruction::Add &&
+      // We can't do this with the LHS for subtraction, unless we are only
+      // demanding the LSB.
+      if ((I->getOpcode() == Instruction::Add ||
+           DemandedFromOps.isOneValue()) &&
           DemandedFromOps.isSubsetOf(LHSKnown.Zero))
         return I->getOperand(1);
     }
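The relaxed subtraction case works because addition and subtraction agree in bit 0: borrows, like carries, only affect higher bits. So when only the LSB is demanded and the LHS's demanded bits are known zero, a - b can be replaced by b just as a + b can. A standalone check of the underlying identity (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a = 0; a < 256; ++a)
    for (uint32_t b = 0; b < 256; ++b) {
      // Bit 0 of a - b always equals bit 0 of a + b.
      assert(((a - b) & 1u) == ((a + b) & 1u));
      // With the low bit of the LHS known zero, (a - b) & 1 == b & 1.
      uint32_t evenA = a & ~1u;
      assert(((evenA - b) & 1u) == (b & 1u));
    }
  return 0;
}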
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 90e232399155..c7766568fd9d 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -636,17 +636,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
     Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
     Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
 
+    Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+    Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I));
+
     // Do "A op C" and "B op C" both simplify?
-    if (Value *L =
-            SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)))
-      if (Value *R =
-              SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I))) {
-        // They do! Return "L op' R".
-        ++NumExpand;
-        C = Builder.CreateBinOp(InnerOpcode, L, R);
-        C->takeName(&I);
-        return C;
-      }
+    if (L && R) {
+      // They do! Return "L op' R".
+      ++NumExpand;
+      C = Builder.CreateBinOp(InnerOpcode, L, R);
+      C->takeName(&I);
+      return C;
+    }
+
+    // Does "A op C" simplify to the identity value for the inner opcode?
+    if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+      // It does! Return "B op C".
+      ++NumExpand;
+      C = Builder.CreateBinOp(TopLevelOpcode, B, C);
+      C->takeName(&I);
+      return C;
+    }
+
+    // Does "B op C" simplify to the identity value for the inner opcode?
+    if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+      // It does! Return "A op C".
+      ++NumExpand;
+      C = Builder.CreateBinOp(TopLevelOpcode, A, C);
+      C->takeName(&I);
+      return C;
+    }
   }
 
   if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
@@ -655,17 +673,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
     Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
     Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
 
+    Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I));
+    Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+
     // Do "A op B" and "A op C" both simplify?
-    if (Value *L =
-            SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)))
-      if (Value *R =
-              SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) {
-        // They do! Return "L op' R".
-        ++NumExpand;
-        A = Builder.CreateBinOp(InnerOpcode, L, R);
-        A->takeName(&I);
-        return A;
-      }
+    if (L && R) {
+      // They do! Return "L op' R".
+      ++NumExpand;
+      A = Builder.CreateBinOp(InnerOpcode, L, R);
+      A->takeName(&I);
+      return A;
+    }
+
+    // Does "A op B" simplify to the identity value for the inner opcode?
+    if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+      // It does! Return "A op C".
+      ++NumExpand;
+      A = Builder.CreateBinOp(TopLevelOpcode, A, C);
+      A->takeName(&I);
+      return A;
+    }
+
+    // Does "A op C" simplify to the identity value for the inner opcode?
+    if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+      // It does! Return "A op B".
+      ++NumExpand;
+      A = Builder.CreateBinOp(TopLevelOpcode, A, B);
+      A->takeName(&I);
+      return A;
+    }
   }
 
   // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0))
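Concretely, the new identity cases fire on patterns like (~C & B) | C: distributing gives (~C | C) & (B | C), the left factor simplifies to all-ones (the identity of 'and'), so the whole expression is just B | C. A standalone spot-check of that algebra (illustrative only):

#include <cassert>

int main() {
  for (unsigned b = 0; b <= 255; ++b)
    for (unsigned c = 0; c <= 255; ++c) {
      unsigned notC = ~c & 0xFFu;
      // (~C | C) is all-ones, the identity for 'and', so the inner
      // 'and' operand drops out after distribution.
      assert((((notC & b) | c) & 0xFFu) == ((b | c) & 0xFFu));
    }
  return 0;
}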
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 184940b7ea58..057f746e052d 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -22,9 +22,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
@@ -43,6 +45,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
@@ -192,6 +195,11 @@ static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
 static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
                                       cl::desc("Check stack-use-after-return"),
                                       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
+                                        cl::desc("Create redzones for byval "
+                                                 "arguments (extra copy "
+                                                 "required)"), cl::Hidden,
+                                        cl::init(true));
 static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
                                      cl::desc("Check stack-use-after-scope"),
                                      cl::Hidden, cl::init(false));
@@ -747,6 +755,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
   bool runOnFunction() {
     if (!ClStack) return false;
+
+    if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas();
+
     // Collect alloca, ret, lifetime instructions etc.
     for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
 
@@ -763,6 +774,11 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     return true;
   }
 
+  // Arguments marked with the "byval" attribute are implicitly copied without
+  // using an alloca instruction. To produce redzones for those arguments, we
+  // copy them a second time into memory allocated with an alloca instruction.
+  void copyArgsPassedByValToAllocas();
+
   // Finds all Alloca instructions and puts
   // poisoned red zones around all of them.
   // Then unpoison everything back before the function returns.
@@ -2528,6 +2544,28 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) {
   llvm_unreachable("impossible LocalStackSize");
 }
 
+void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
+  BasicBlock &FirstBB = *F.begin();
+  IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt());
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr()) {
+      Type *Ty = Arg.getType()->getPointerElementType();
+      unsigned Align = Arg.getParamAlignment();
+      if (Align == 0) Align = DL.getABITypeAlignment(Ty);
+
+      const std::string &Name = Arg.hasName() ? Arg.getName().str() :
+          "Arg" + llvm::to_string(Arg.getArgNo());
+      AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval");
+      AI->setAlignment(Align);
+      Arg.replaceAllUsesWith(AI);
+
+      uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+      IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
+    }
+  }
+}
+
 PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
                                           Value *ValueIfTrue,
                                           Instruction *ThenTerm,
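At the source level, the effect of copyArgsPassedByValToAllocas is roughly the following (a conceptual sketch only; the real instrumentation operates on LLVM IR, and the names here are made up): the by-value argument is duplicated into an ordinary stack slot that ASan can surround with redzones, and every use is redirected to the copy.

#include <cstring>

struct Payload { char buf[64]; };

// Before instrumentation: 'p' lives in the byval argument area,
// which ASan cannot poison.
int useBefore(Payload p) { return p.buf[0]; }

// Conceptually after instrumentation: a second copy lives in a normal
// alloca, which does get redzones.
int useAfter(Payload p) {
  Payload p_byval;                     // the new alloca ("<name>.byval")
  std::memcpy(&p_byval, &p, sizeof p); // the inserted memcpy
  return p_byval.buf[0];               // all uses rewritten to the copy
}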
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 1348e0ed0ed0..b7c6271869cd 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3039,7 +3039,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
   }
 
   void visitVAStartInst(VAStartInst &I) override {
-    if (F.getCallingConv() == CallingConv::X86_64_Win64)
+    if (F.getCallingConv() == CallingConv::Win64)
       return;
     IRBuilder<> IRB(&I);
     VAStartInstrumentationList.push_back(&I);
@@ -3053,7 +3053,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
   }
 
   void visitVACopyInst(VACopyInst &I) override {
-    if (F.getCallingConv() == CallingConv::X86_64_Win64)
+    if (F.getCallingConv() == CallingConv::Win64)
      return;
     IRBuilder<> IRB(&I);
     Value *VAListTag = I.getArgOperand(0);
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index e3c36c98ab0d..06fe07598374 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -281,6 +281,16 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   SanCovTraceSwitchFunction =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
           SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
 
+  // Make sure smaller parameters are zero-extended to i64 as required by the
+  // x86_64 ABI.
+  if (TargetTriple.getArch() == Triple::x86_64) {
+    for (int i = 0; i < 3; i++) {
+      SanCovTraceCmpFunction[i]->addParamAttr(0, Attribute::ZExt);
+      SanCovTraceCmpFunction[i]->addParamAttr(1, Attribute::ZExt);
+    }
+    SanCovTraceDivFunction[0]->addParamAttr(0, Attribute::ZExt);
+  }
+
   // We insert an empty inline asm after cov callbacks to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 7fd77a082b82..c5c9b2c185d6 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -562,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
   if (!MSSA)
     return false;
 
+  // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+  // read/write memory, then we can safely return true here.
+  // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+  // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+  // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+  // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+  // with the default optimization pipeline.
+  auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+  if (!EarlierMA)
+    return true;
+  auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+  if (!LaterMA)
+    return true;
+
   // Since we know LaterDef dominates LaterInst and EarlierInst dominates
   // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
   // EarlierInst and LaterInst and neither can any other write that potentially
   // clobbers LaterInst.
   MemoryAccess *LaterDef =
       MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
-  return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+  return MSSA->dominates(LaterDef, EarlierMA);
 }
 
 bool EarlyCSE::processNode(DomTreeNode *Node) {
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 0fe72f3f7331..ea28705e684d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -1168,6 +1168,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
                                  LI->isVolatile(), LI->getAlignment(),
                                  LI->getOrdering(), LI->getSyncScopeID(),
                                  UnavailablePred->getTerminator());
+  NewLoad->setDebugLoc(LI->getDebugLoc());
 
   // Transfer the old load's AA tags to the new load.
   AAMDNodes Tags;
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index a40c22c3fce9..99b4458ea0fa 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -805,6 +805,25 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
   ConstantInt *One = ConstantInt::get(IndVarTy, 1);
   // TODO: generalize the predicates here to also match their unsigned variants.
   if (IsIncreasing) {
+    bool DecreasedRightValueByOne = false;
+    // Try to turn eq/ne predicates to those we can work with.
+    if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+      // while (++i != len) {         while (++i < len) {
+      //   ...                 --->     ...
+      // }                            }
+      Pred = ICmpInst::ICMP_SLT;
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+             !CanBeSMin(SE, RightSCEV)) {
+      // while (true) {             while (true) {
+      //   if (++i == len)    --->    if (++i > len - 1)
+      //     break;                     break;
+      //   ...                        ...
+      // }                          }
+      Pred = ICmpInst::ICMP_SGT;
+      RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+      DecreasedRightValueByOne = true;
+    }
+
     bool FoundExpectedPred =
         (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
         (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
@@ -829,16 +848,41 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
         return None;
       }
 
-      IRBuilder<> B(Preheader->getTerminator());
-      RightValue = B.CreateAdd(RightValue, One);
+      // We need to increase the right value unless we have already decreased
+      // it virtually when we replaced EQ with SGT.
+      if (!DecreasedRightValueByOne) {
+        IRBuilder<> B(Preheader->getTerminator());
+        RightValue = B.CreateAdd(RightValue, One);
+      }
     } else {
       if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
                                        RightSCEV)) {
         FailureReason = "Induction variable start not bounded by upper limit";
         return None;
       }
+      assert(!DecreasedRightValueByOne &&
+             "Right value can be decreased only for LatchBrExitIdx == 0!");
     }
   } else {
+    bool IncreasedRightValueByOne = false;
+    // Try to turn eq/ne predicates to those we can work with.
+    if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+      // while (--i != len) {         while (--i > len) {
+      //   ...                 --->     ...
+      // }                            }
+      Pred = ICmpInst::ICMP_SGT;
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+             !CanBeSMax(SE, RightSCEV)) {
+      // while (true) {             while (true) {
+      //   if (--i == len)    --->    if (--i < len + 1)
+      //     break;                     break;
+      //   ...                        ...
+      // }                          }
+      Pred = ICmpInst::ICMP_SLT;
+      RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+      IncreasedRightValueByOne = true;
+    }
+
     bool FoundExpectedPred =
         (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
         (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
@@ -863,14 +907,20 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
         return None;
       }
 
-      IRBuilder<> B(Preheader->getTerminator());
-      RightValue = B.CreateSub(RightValue, One);
+      // We need to decrease the right value unless we have already increased
+      // it virtually when we replaced EQ with SLT.
+      if (!IncreasedRightValueByOne) {
+        IRBuilder<> B(Preheader->getTerminator());
+        RightValue = B.CreateSub(RightValue, One);
+      }
     } else {
       if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
                                        RightSCEV)) {
         FailureReason = "Induction variable start not bounded by lower limit";
         return None;
       }
+      assert(!IncreasedRightValueByOne &&
+             "Right value can be increased only for LatchBrExitIdx == 0!");
     }
   }
 
@@ -922,14 +972,18 @@ LoopConstrainer::calculateSubRanges() const {
 
   bool Increasing = MainLoopStructure.IndVarIncreasing;
 
-  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
-  // range of values the induction variable takes.
+  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+  // [Smallest, GreatestSeen] is the range of values the induction variable
+  // takes.
 
-  const SCEV *Smallest = nullptr, *Greatest = nullptr;
+  const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
 
+  const SCEV *One = SE.getOne(Ty);
   if (Increasing) {
     Smallest = Start;
     Greatest = End;
+    // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+    GreatestSeen = SE.getMinusSCEV(End, One);
   } else {
     // These two computations may sign-overflow.  Here is why that is okay:
     //
@@ -947,9 +1001,9 @@ LoopConstrainer::calculateSubRanges() const {
     // will be an empty range.  Returning an empty range is always safe.
     //
 
-    const SCEV *One = SE.getOne(Ty);
     Smallest = SE.getAddExpr(End, One);
     Greatest = SE.getAddExpr(Start, One);
+    GreatestSeen = Start;
   }
 
   auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
@@ -964,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const {
   Result.LowLimit = Clamp(Range.getBegin());
 
   bool ProvablyNoPostLoop =
-      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+      SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd());
   if (!ProvablyNoPostLoop)
     Result.HighLimit = Clamp(Range.getEnd());
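The eq-to-inequality rewrites above are only sound when the exit value cannot be "jumped over"; for the ++i != len case, the induction variable must reach len exactly, which the pass establishes separately. A quick standalone check that the rewritten increasing-loop exit condition traces the same iterations (illustrative only; exercises exactly the regime where the loop is entered with i < len):

#include <cassert>

// Count iterations of "while (++i != len)" vs. "while (++i < len)".
static int countNe(int i, int len) {
  int n = 0;
  while (++i != len) ++n;
  return n;
}
static int countLt(int i, int len) {
  int n = 0;
  while (++i < len) ++n;
  return n;
}

int main() {
  for (int len = 0; len <= 16; ++len)
    for (int i = -4; i < len; ++i)
      assert(countNe(i, len) == countLt(i, len));
  return 0;
}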
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ee3de51b1360..4056cc5cb346 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2168,11 +2168,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
   return false;
 }
 
-/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// same BB in the form
 /// bb:
 ///   %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-///   %s = select p, trueval, falseval
+///   %s = select %p, trueval, falseval
 ///
+/// or
+///
+/// bb:
+///   %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+///   %c = cmp %p, 0
+///   %s = select %c, trueval, falseval
+///
 /// And expand the select into a branch structure. This later enables
 /// jump-threading over bb in this pass.
 ///
@@ -2186,44 +2194,54 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
   if (LoopHeaders.count(BB))
     return false;
 
-  // Look for a Phi/Select pair in the same basic block.  The Phi feeds the
-  // condition of the Select and at least one of the incoming values is a
-  // constant.
   for (BasicBlock::iterator BI = BB->begin();
        PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
-    unsigned NumPHIValues = PN->getNumIncomingValues();
-    if (NumPHIValues == 0 || !PN->hasOneUse())
-      continue;
-
-    SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
-    if (!SI || SI->getParent() != BB)
-      continue;
-
-    Value *Cond = SI->getCondition();
-    if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
+    // Look for a Phi having at least one constant incoming value.
+    if (llvm::all_of(PN->incoming_values(),
+                     [](Value *V) { return !isa<ConstantInt>(V); }))
       continue;
 
-    bool HasConst = false;
-    for (unsigned i = 0; i != NumPHIValues; ++i) {
-      if (PN->getIncomingBlock(i) == BB)
+    auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+      // Check if SI is in BB and uses V as its condition.
+      if (SI->getParent() != BB)
         return false;
-      if (isa<ConstantInt>(PN->getIncomingValue(i)))
-        HasConst = true;
-    }
+      Value *Cond = SI->getCondition();
+      return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+    };
 
-    if (HasConst) {
-      // Expand the select.
-      TerminatorInst *Term =
-          SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
-      PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
-      NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
-      NewPN->addIncoming(SI->getFalseValue(), BB);
-      SI->replaceAllUsesWith(NewPN);
-      SI->eraseFromParent();
-      return true;
+    SelectInst *SI = nullptr;
+    for (Use &U : PN->uses()) {
+      if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+        // Look for an ICmp in BB that compares PN with a constant and is the
+        // condition of a Select.
+        if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+            isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+          if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+            if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+              SI = SelectI;
+              break;
+            }
+      } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+        // Look for a Select in BB that uses PN as its condition.
+        if (isUnfoldCandidate(SelectI, U.get())) {
+          SI = SelectI;
+          break;
+        }
+      }
     }
+
+    if (!SI)
+      continue;
+    // Expand the select.
+    TerminatorInst *Term =
+        SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+    PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+    NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+    NewPN->addIncoming(SI->getFalseValue(), BB);
+    SI->replaceAllUsesWith(NewPN);
+    SI->eraseFromParent();
+    return true;
   }
-
   return false;
 }
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 606136dc31a4..2e0d8e0374c0 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
 class LoopInterchangeLegality {
 public:
   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
+                          OptimizationRemarkEmitter *ORE)
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
-        PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+        PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
 
   /// Check if the loops can be interchanged.
   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -353,6 +355,8 @@ private:
   LoopInfo *LI;
   DominatorTree *DT;
   bool PreserveLCSSA;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
 
   bool InnerLoopHasReduction;
 };
@@ -361,8 +365,9 @@ private:
 /// loop.
 class LoopInterchangeProfitability {
 public:
-  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+                               OptimizationRemarkEmitter *ORE)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -376,6 +381,8 @@ private:
 
   /// Scev analysis.
   ScalarEvolution *SE;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass {
   DependenceInfo *DI;
   DominatorTree *DT;
   bool PreserveLCSSA;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
+
   LoopInterchange()
       : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
@@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass {
     AU.addRequired<DependenceAnalysisWrapperPass>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
@@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass {
     DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
     DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
 
     // Build up a worklist of loop pairs to analyze.
@@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass {
     Loop *OuterLoop = LoopList[OuterLoopId];
 
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
-                                PreserveLCSSA);
+                                PreserveLCSSA, ORE);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
       return false;
     }
     DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Interchanging loops not profitable\n");
       return false;
     }
 
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
+                                 InnerLoop->getStartLoc(),
+                                 InnerLoop->getHeader())
+              << "Loop interchanged with enclosing loop.");
+
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
                                  LoopNestExit, LIL.hasInnerLoopReduction());
     LIT.transform();
@@ -760,6 +777,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedPHIInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with induction or reduction PHI nodes can be"
+                 " interchanged currently.");
     return true;
   }
 
@@ -767,6 +790,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
                  << "Failed to interchange due to current limitation\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "MultiInductionInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with 1 induction variable can be "
+                 "interchanged currently.");
     return true;
   }
   if (Reductions.size() > 0)
@@ -777,6 +806,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedPHIOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with induction or reduction PHI nodes can be"
+                 " interchanged currently.");
     return true;
   }
 
@@ -785,18 +820,35 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!Reductions.empty()) {
     DEBUG(dbgs() << "Outer loops with reductions are not supported "
                  << "currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "ReductionsOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Outer loops with reductions cannot be interchanged "
+                 "currently.");
     return true;
   }
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
                  << "supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "MultiIndutionOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with 1 induction variable can be "
+                 "interchanged currently.");
     return true;
   }
 
   // TODO: Triangular loops are not handled for now.
   if (!isLoopStructureUnderstood(InnerInductionVar)) {
     DEBUG(dbgs() << "Loop structure not understood by pass\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedStructureInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Inner loop structure not understood currently.");
     return true;
   }
 
@@ -805,12 +857,24 @@ bool LoopInterchangeLegality::currentLimitations() {
       getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoLCSSAPHIOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with LCSSA PHIs can be interchanged "
+                 "currently.");
     return true;
   }
 
   LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoLCSSAPHIOuterInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with LCSSA PHIs can be interchanged "
+                 "currently.");
     return true;
   }
 
@@ -835,6 +899,11 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!InnerIndexVarInc) {
     DEBUG(dbgs() << "Did not find an instruction to increment the induction "
                  << "variable.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoIncrementInInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "The inner loop does not increment the induction variable.");
     return true;
   }
 
@@ -852,6 +921,12 @@ bool LoopInterchangeLegality::currentLimitations() {
     if (!I.isIdenticalTo(InnerIndexVarInc)) {
       DEBUG(dbgs() << "Found unsupported instructions between induction "
                    << "variable increment and branch.\n");
+      ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                         "UnsupportedInsBetweenInduction",
+                                         InnerLoop->getStartLoc(),
+                                         InnerLoop->getHeader())
+                << "Found unsupported instruction between induction variable "
+                   "increment and branch.");
       return true;
     }
 
@@ -862,6 +937,11 @@ bool LoopInterchangeLegality::currentLimitations() {
   // current limitation.
   if (!FoundInduction) {
     DEBUG(dbgs() << "Did not find the induction variable.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoIndutionVariable",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Did not find the induction variable.");
    return true;
   }
   return false;
@@ -875,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
     DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
                  << " and OuterLoopId = " << OuterLoopId
                  << " due to dependence\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "Dependence",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Cannot interchange loops due to dependences.");
     return false;
   }
 
@@ -910,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
   // Check if the loops are tightly nested.
   if (!tightlyNested(OuterLoop, InnerLoop)) {
     DEBUG(dbgs() << "Loops not tightly nested\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NotTightlyNested",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Cannot interchange loops because they are not tightly "
+                 "nested.");
     return false;
   }
 
@@ -1005,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
   // It is not profitable as per current cache profitability model. But check if
   // we can move this loop outside to improve parallelism.
-  bool ImprovesPar =
-      isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
-  return ImprovesPar;
+  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+    return true;
+
+  ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                     "InterchangeNotProfitable",
+                                     InnerLoop->getStartLoc(),
+                                     InnerLoop->getHeader())
+            << "Interchanging loops is too costly (cost="
+            << ore::NV("Cost", Cost) << ", threshold="
+            << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
+               ") and it does not improve parallelism.");
+  return false;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
@@ -1291,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 
 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
                     "Interchanges loops for cache reuse", false, false)
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 9397b87cdf56..90c5c243f464 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -68,6 +68,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
@@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
   // Because of PR962, we don't TRE dynamic allocas.
-  for (auto &BB : F) {
-    for (auto &I : BB) {
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
-        if (!AI->isStaticAlloca())
-          return false;
-      }
-    }
-  }
-
-  return true;
+  return llvm::all_of(instructions(F), [](Instruction &I) {
+    auto *AI = dyn_cast<AllocaInst>(&I);
+    return !AI || AI->isStaticAlloca();
+  });
 }
 
 namespace {
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5170c68e2915..d43ce7abb7cd 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -736,7 +737,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   // remainder are connected to the original Loop's exit blocks. The remaining
   // work is to update the phi nodes in the original loop, and take in the
   // values from the cloned region. Also update the dominator info for
-  // OtherExits, since we have new edges into OtherExits.
+  // OtherExits and their immediate successors, since we have new edges into
+  // OtherExits.
+
+  SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
   for (auto *BB : OtherExits) {
     for (auto &II : *BB) {
@@ -759,12 +762,35 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
             cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
       }
     }
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+    for (BasicBlock *SuccBB : successors(BB)) {
+      assert(!(any_of(OtherExits,
+                      [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
+               SuccBB == LatchExit) &&
+             "Breaks the definition of dedicated exits!");
+    }
+#endif
     // Update the dominator info because the immediate dominator is no longer the
     // header of the original Loop. BB has edges both from L and remainder code.
     // Since the preheader determines which loop is run (L or directly jump to
     // the remainder code), we set the immediate dominator as the preheader.
-    if (DT)
+    if (DT) {
       DT->changeImmediateDominator(BB, PreHeader);
+      // Also update the IDom for immediate successors of BB. If the current
+      // IDom is the header, update the IDom to be the preheader because that is
+      // the nearest common dominator of all predecessors of SuccBB. We need to
+      // check for IDom being the header because successors of exit blocks can
+      // have edges from outside the loop, and we should not incorrectly update
+      // the IDom in that case.
+      for (BasicBlock *SuccBB: successors(BB))
+        if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) {
+          if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) {
+            assert(!SuccBB->getSinglePredecessor() &&
+                   "BB should be the IDom then!");
+            DT->changeImmediateDominator(SuccBB, PreHeader);
+          }
+        }
+    }
   }
 
   // Loop structure should be the following:
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb82ee283d44..012b10c8a9b0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -574,11 +574,9 @@ protected:
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(Loop *NewLoop);
 
-  /// Emit a bypass check to see if the trip count would overflow, or we
-  /// wouldn't have enough iterations to execute one vector loop.
+  /// Emit a bypass check to see if the vector trip count is zero, including if
+  /// it overflows.
   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
-  /// Emit a bypass check to see if the vector trip count is nonzero.
-  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
   /// Emit a bypass check to see if all of the SCEV assumptions we've
   /// had to make are correct.
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
@@ -3289,37 +3287,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
-  // Generate code to check that the loop's trip count that we computed by
-  // adding one to the backedge-taken count will not overflow.
-  Value *CheckMinIters = Builder.CreateICmpULT(
-      Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+  // Generate code to check if the loop's trip count is less than VF * UF, or
+  // equal to it in case a scalar epilogue is required; this implies that the
+  // vector trip count is zero. This check also covers the case where adding one
+  // to the backedge-taken count overflowed leading to an incorrect trip count
+  // of zero. In this case we will also jump to the scalar loop.
+  auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+                                           : ICmpInst::ICMP_ULT;
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
 
-  BasicBlock *NewBB =
-      BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
-  LoopBypassBlocks.push_back(BB);
-}
-
-void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
-                                                     BasicBlock *Bypass) {
-  Value *TC = getOrCreateVectorTripCount(L);
-  BasicBlock *BB = L->getLoopPreheader();
-  IRBuilder<> Builder(BB->getTerminator());
-
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
-                                    "cmp.zero");
-
-  // Generate code to check that the loop's trip count that we computed by
-  // adding one to the backedge-taken count will not overflow.
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
   // LoopBypassBlock because SCEV expansions to generate loop bypass
@@ -3328,7 +3305,7 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   if (L->getParentLoop())
     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, Cmp));
+                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
   LoopBypassBlocks.push_back(BB);
 }
 
@@ -3477,14 +3454,13 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
 
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
 
-  // We need to test whether the backedge-taken count is uint##_max. Adding one
-  // to it will cause overflow and an incorrect loop trip count in the vector
-  // body. In case of overflow we want to directly jump to the scalar remainder
-  // loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
   // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, ScalarPH);
+  // jump to the scalar loop. This check also covers the case where the
+  // backedge-taken count is uint##_max: adding one to it will overflow leading
+  // to an incorrect trip count of zero. In this (rare) case we will also jump
+  // to the scalar loop.
+  emitMinimumIterationCountCheck(Lp, ScalarPH);
+
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
   emitSCEVChecks(Lp, ScalarPH);
@@ -3527,7 +3503,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       // We know what the end value is.
       EndValue = CountRoundDown;
     } else {
-      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
       Type *StepType = II.getStep()->getType();
       Instruction::CastOps CastOp =
           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
@@ -4168,7 +4144,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   // To do so, we need to generate the 'identity' vector and override
   // one of the elements with the incoming scalar reduction. We need
   // to do it in the vector-loop preheader.
-  Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
 
   // This is the vector-clone of the value that leaves the loop.
   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
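The merged minimum-iteration check works because an unsigned comparison against VF * UF also catches the overflow case: when the backedge-taken count is the maximum integer, adding one wraps the trip count to zero, which is trivially below the threshold. A standalone sketch of that logic with 8-bit counters (illustrative only; VF * UF = 4, no scalar epilogue, so the check is ICMP_ULT):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned VFxUF = 4; // vectorization factor * unroll factor
  for (unsigned BTC = 0; BTC <= 255; ++BTC) {
    // 8-bit trip count computed as backedge-taken count + 1;
    // wraps to 0 when BTC is the maximum value.
    uint8_t Count = static_cast<uint8_t>(BTC + 1);
    bool TakeScalarLoop = Count < VFxUF; // the single ULT check
    // The vector loop must be bypassed when there are fewer than
    // VF * UF iterations, and also in the wrapped (overflow) case.
    bool ShouldBypass = (BTC + 1 < VFxUF) || BTC == 255;
    assert(TakeScalarLoop == ShouldBypass);
  }
  return 0;
}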
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4425043ad39a..dcbcab459a6b 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -434,7 +434,7 @@ private:
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
-  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+  Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
 
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
@@ -857,7 +857,7 @@ private:
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
-    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
 
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1212,7 +1212,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Check that all of the users of the scalars that we want to vectorize are
   // schedulable.
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+  BasicBlock *BB = VL0->getParent();
 
   if (!DT->isReachableFromEntry(BB)) {
     // Don't go into unreachable blocks. They may contain instructions with
@@ -1237,7 +1237,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
   BlockScheduling &BS = *BSRef.get();
 
-  if (!BS.tryScheduleBundle(VL, this)) {
+  if (!BS.tryScheduleBundle(VL, this, VL0)) {
     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL[0]) ||
             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
@@ -2427,8 +2427,8 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   return Vec;
 }
 
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
-  if (const TreeEntry *En = getTreeEntry(VL[0])) {
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
+  if (const TreeEntry *En = getTreeEntry(OpValue)) {
     if (En->isSame(VL) && En->VectorizedValue)
       return En->VectorizedValue;
   }
@@ -2553,7 +2553,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       Value *InVec = vectorizeTree(INVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
       CastInst *CI = dyn_cast<CastInst>(VL0);
@@ -2575,7 +2575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *L = vectorizeTree(LHSV);
       Value *R = vectorizeTree(RHSV);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
@@ -2604,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *True = vectorizeTree(TrueVec);
       Value *False = vectorizeTree(FalseVec);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
       Value *V = Builder.CreateSelect(Cond, True, False);
@@ -2644,7 +2644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *LHS = vectorizeTree(LHSVL);
       Value *RHS = vectorizeTree(RHSVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
@@ -2806,7 +2806,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *LHS = vectorizeTree(LHSVL);
       Value *RHS = vectorizeTree(RHSVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
       // Create a vector of LHS op1 RHS
@@ -3097,8 +3097,8 @@ void BoUpSLP::optimizeGatherSequence() {
 // Groups the instructions to a bundle (which is then a single scheduling entity)
 // and schedules instructions until the bundle gets ready.
 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
-                                                 BoUpSLP *SLP) {
-  if (isa<PHINode>(VL[0]))
+                                                 BoUpSLP *SLP, Value *OpValue) {
+  if (isa<PHINode>(OpValue))
     return true;
 
   // Initialize the instruction bundle.
@@ -3106,7 +3106,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   ScheduleData *PrevInBundle = nullptr;
   ScheduleData *Bundle = nullptr;
   bool ReSchedule = false;
-  DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
+  DEBUG(dbgs() << "SLP:  bundle: " << *OpValue << "\n");
 
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
@@ -3177,7 +3177,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
     }
   }
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, VL[0]);
+    cancelScheduling(VL, OpValue);
     return false;
   }
   return true;