author     Dimitry Andric <dim@FreeBSD.org>  2022-07-14 18:50:02 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2022-07-14 18:50:02 +0000
commit     1f917f69ff07f09b6dbb670971f57f8efe718b84 (patch)
tree       99293cbc1411737cd995dac10a99b2c40ef0944c /llvm/lib/Transforms
parent     145449b1e420787bb99721a429341fa6be3adfb6 (diff)
Diffstat (limited to 'llvm/lib/Transforms')
63 files changed, 1923 insertions, 1418 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index d09607bb1c4c..51eb8ebf0369 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -881,16 +881,16 @@ static DIType *solveDIType(DIBuilder &Builder, Type *Ty, dwarf::DW_ATE_float, llvm::DINode::FlagArtificial); } else if (Ty->isPointerTy()) { - // Construct BasicType instead of PointerType to avoid infinite - // search problem. - // For example, we would be in trouble if we traverse recursively: + // Construct PointerType points to null (aka void *) instead of exploring + // pointee type to avoid infinite search problem. For example, we would be + // in trouble if we traverse recursively: // // struct Node { // Node* ptr; // }; - RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty), - dwarf::DW_ATE_address, - llvm::DINode::FlagArtificial); + RetType = Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty), + Layout.getABITypeAlignment(Ty), + /*DWARFAddressSpace=*/None, Name); } else if (Ty->isStructTy()) { auto *DIStruct = Builder.createStructType( Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty), @@ -914,13 +914,21 @@ static DIType *solveDIType(DIBuilder &Builder, Type *Ty, RetType = DIStruct; } else { - LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n";); - SmallString<32> Buffer; - raw_svector_ostream OS(Buffer); - OS << Name.str() << "_" << Layout.getTypeSizeInBits(Ty); - RetType = Builder.createBasicType(OS.str(), Layout.getTypeSizeInBits(Ty), - dwarf::DW_ATE_address, - llvm::DINode::FlagArtificial); + LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n"); + TypeSize Size = Layout.getTypeSizeInBits(Ty); + auto *CharSizeType = Builder.createBasicType( + Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial); + + if (Size <= 8) + RetType = CharSizeType; + else { + if (Size % 8 != 0) + Size = TypeSize::Fixed(Size + 8 - (Size % 8)); + + RetType = Builder.createArrayType( + Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType, + Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8))); + } } DITypeCache.insert({Ty, RetType}); @@ -971,7 +979,8 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, unsigned LineNum = PromiseDIVariable->getLine(); DICompositeType *FrameDITy = DBuilder.createStructType( - DIS, "__coro_frame_ty", DFile, LineNum, Shape.FrameSize * 8, + DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(), + DFile, LineNum, Shape.FrameSize * 8, Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray()); StructType *FrameTy = Shape.FrameTy; @@ -995,14 +1004,12 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, *IndexTy = FrameTy->getElementType(IndexIndex); DenseMap<unsigned, DIType *> TyCache; - TyCache.insert({ResumeIndex, - DBuilder.createBasicType("__resume_fn", - Layout.getTypeSizeInBits(ResumeFnTy), - dwarf::DW_ATE_address)}); TyCache.insert( - {DestroyIndex, DBuilder.createBasicType( - "__destroy_fn", Layout.getTypeSizeInBits(DestroyFnTy), - dwarf::DW_ATE_address)}); + {ResumeIndex, DBuilder.createPointerType( + nullptr, Layout.getTypeSizeInBits(ResumeFnTy))}); + TyCache.insert( + {DestroyIndex, DBuilder.createPointerType( + nullptr, Layout.getTypeSizeInBits(DestroyFnTy))}); /// FIXME: If we fill the field `SizeInBits` with the actual size of /// __coro_index in bits, then __coro_index wouldn't show in the debugger. 
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ead552d9be4e..9c1b247cdb39 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -389,7 +389,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { // Replace CoroSave with a store to Index: // %index.addr = getelementptr %f.frame... (index field number) - // store i32 0, i32* %index.addr1 + // store i32 %IndexVal, i32* %index.addr1 auto *Save = S->getCoroSave(); Builder.SetInsertPoint(Save); if (S->isFinal()) { diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index b05b7990e3f0..e5ff98e4f73f 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -718,8 +718,8 @@ Argument *IRPosition::getAssociatedArgument() const { } // If we found a unique callback candidate argument, return it. - if (CBCandidateArg && CBCandidateArg.getValue()) - return CBCandidateArg.getValue(); + if (CBCandidateArg && CBCandidateArg.value()) + return CBCandidateArg.value(); // If no callbacks were found, or none used the underlying call site operand // exclusively, use the direct callee argument if available. @@ -1048,11 +1048,11 @@ Attributor::getAssumedConstant(const IRPosition &IRP, recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL); return llvm::None; } - if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) { + if (isa_and_nonnull<UndefValue>(SimplifiedV.value())) { recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL); return UndefValue::get(IRP.getAssociatedType()); } - Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue()); + Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.value()); if (CI) CI = dyn_cast_or_null<Constant>( AA::getWithType(*CI, *IRP.getAssociatedType())); @@ -2697,8 +2697,8 @@ void InformationCache::initializeInformationCache(const Function &CF, Optional<short> &NumUses = AssumeUsesMap[I]; if (!NumUses) NumUses = I->getNumUses(); - NumUses = NumUses.getValue() - /* this assume */ 1; - if (NumUses.getValue() != 0) + NumUses = NumUses.value() - /* this assume */ 1; + if (NumUses.value() != 0) continue; AssumeOnlyValues.insert(I); for (const Value *Op : I->operands()) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 4d99ce7e3175..1ff54b78e27e 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -437,7 +437,7 @@ static bool genericValueTraversal( A.getAssumedSimplified(*V, QueryingAA, UsedAssumedInformation); if (!SimpleV) continue; - Value *NewV = SimpleV.getValue(); + Value *NewV = SimpleV.value(); if (NewV && NewV != V) { if ((VS & AA::Interprocedural) || !CtxI || AA::isValidInScope(*NewV, CtxI->getFunction())) { @@ -1891,14 +1891,14 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Check if we have an assumed unique return value that we could manifest. Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A); - if (!UniqueRV || !UniqueRV.getValue()) + if (!UniqueRV || !UniqueRV.value()) return Changed; // Bookkeeping. STATS_DECLTRACK(UniqueReturnValue, FunctionReturn, "Number of function with unique return"); // If the assumed unique return value is an argument, annotate it. 
- if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) { + if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.value())) { if (UniqueRVArg->getType()->canLosslesslyBitCastTo( getAssociatedFunction()->getReturnType())) { getIRPosition() = IRPosition::argument(*UniqueRVArg); @@ -2666,9 +2666,9 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { // Either we stopped and the appropriate action was taken, // or we got back a simplified value to continue. Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I); - if (!SimplifiedPtrOp || !SimplifiedPtrOp.getValue()) + if (!SimplifiedPtrOp || !SimplifiedPtrOp.value()) return true; - const Value *PtrOpVal = SimplifiedPtrOp.getValue(); + const Value *PtrOpVal = SimplifiedPtrOp.value(); // A memory access through a pointer is considered UB // only if the pointer has constant null value. @@ -2757,14 +2757,14 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { IRPosition::value(*ArgVal), *this, UsedAssumedInformation); if (UsedAssumedInformation) continue; - if (SimplifiedVal && !SimplifiedVal.getValue()) + if (SimplifiedVal && !SimplifiedVal.value()) return true; - if (!SimplifiedVal || isa<UndefValue>(*SimplifiedVal.getValue())) { + if (!SimplifiedVal || isa<UndefValue>(*SimplifiedVal.value())) { KnownUBInsts.insert(&I); continue; } if (!ArgVal->getType()->isPointerTy() || - !isa<ConstantPointerNull>(*SimplifiedVal.getValue())) + !isa<ConstantPointerNull>(*SimplifiedVal.value())) continue; auto &NonNullAA = A.getAAFor<AANonNull>(*this, CalleeArgumentIRP, DepClassTy::NONE); @@ -4101,11 +4101,11 @@ identifyAliveSuccessors(Attributor &A, const SwitchInst &SI, bool UsedAssumedInformation = false; Optional<Constant *> C = A.getAssumedConstant(*SI.getCondition(), AA, UsedAssumedInformation); - if (!C || isa_and_nonnull<UndefValue>(C.getValue())) { + if (!C || isa_and_nonnull<UndefValue>(C.value())) { // No value yet, assume all edges are dead. - } else if (isa_and_nonnull<ConstantInt>(C.getValue())) { + } else if (isa_and_nonnull<ConstantInt>(C.value())) { for (auto &CaseIt : SI.cases()) { - if (CaseIt.getCaseValue() == C.getValue()) { + if (CaseIt.getCaseValue() == C.value()) { AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front()); return UsedAssumedInformation; } @@ -5523,11 +5523,10 @@ struct AAValueSimplifyImpl : AAValueSimplify { if (!SimpleV) return PoisonValue::get(&Ty); Value *EffectiveV = &V; - if (SimpleV.getValue()) - EffectiveV = SimpleV.getValue(); + if (SimpleV.value()) + EffectiveV = SimpleV.value(); if (auto *C = dyn_cast<Constant>(EffectiveV)) - if (!C->canTrap()) - return C; + return C; if (CtxI && AA::isValidAtPosition(AA::ValueAndContext(*EffectiveV, *CtxI), A.getInfoCache())) return ensureType(A, *EffectiveV, Ty, CtxI, Check); @@ -5541,7 +5540,7 @@ struct AAValueSimplifyImpl : AAValueSimplify { /// nullptr if we don't have one that makes sense. Value *manifestReplacementValue(Attributor &A, Instruction *CtxI) const { Value *NewV = SimplifiedAssociatedValue - ? SimplifiedAssociatedValue.getValue() + ? 
SimplifiedAssociatedValue.value() : UndefValue::get(getAssociatedType()); if (NewV && NewV != &getAssociatedValue()) { ValueToValueMapTy VMap; @@ -5672,7 +5671,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { A.getAssumedConstant(ACSArgPos, *this, UsedAssumedInformation); if (!SimpleArgOp) return true; - if (!SimpleArgOp.getValue()) + if (!SimpleArgOp.value()) return false; if (!AA::isDynamicallyUnique(A, *this, **SimpleArgOp)) return false; @@ -5787,7 +5786,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -5796,7 +5795,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -5868,8 +5867,8 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { if (!SimplifiedOp) return true; - if (SimplifiedOp.getValue()) - NewOps[Idx] = SimplifiedOp.getValue(); + if (SimplifiedOp.value()) + NewOps[Idx] = SimplifiedOp.value(); else NewOps[Idx] = Op; @@ -6112,6 +6111,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /// but which is not in the deallocation infos. bool HasPotentiallyFreeingUnknownUses = false; + /// Flag to indicate that we should place the new alloca in the function + /// entry block rather than where the call site (CB) is. + bool MoveAllocaIntoEntry = true; + /// The set of free calls that use this allocation. SmallSetVector<CallBase *, 1> PotentialFreeCalls{}; }; @@ -6242,17 +6245,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Function *F = getAnchorScope(); const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); - LoopInfo *LI = - A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F); - Optional<bool> MayContainIrreducibleControl; - auto IsInLoop = [&](BasicBlock &BB) { - if (!MayContainIrreducibleControl.has_value()) - MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI); - if (MayContainIrreducibleControl.value()) - return true; - return LI->getLoopFor(&BB) != nullptr; - }; - for (auto &It : AllocationInfos) { AllocationInfo &AI = *It.second; if (AI.Status == AllocationInfo::INVALID) @@ -6294,25 +6286,25 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Size = SizeOffsetPair.first; } - Instruction *IP = (!SizeAPI.has_value() || IsInLoop(*AI.CB->getParent())) - ? AI.CB - : &F->getEntryBlock().front(); + Instruction *IP = + AI.MoveAllocaIntoEntry ? &F->getEntryBlock().front() : AI.CB; Align Alignment(1); if (MaybeAlign RetAlign = AI.CB->getRetAlign()) Alignment = std::max(Alignment, *RetAlign); if (Value *Align = getAllocAlignment(AI.CB, TLI)) { Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align); - assert(AlignmentAPI && AlignmentAPI.getValue().getZExtValue() > 0 && + assert(AlignmentAPI && AlignmentAPI.value().getZExtValue() > 0 && "Expected an alignment during manifest!"); Alignment = std::max( - Alignment, assumeAligned(AlignmentAPI.getValue().getZExtValue())); + Alignment, assumeAligned(AlignmentAPI.value().getZExtValue())); } // TODO: Hoist the alloca towards the function entry. 
unsigned AS = DL.getAllocaAddrSpace(); - Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, - Size, Alignment, "", IP); + Instruction *Alloca = + new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment, + AI.CB->getName() + ".h2s", IP); if (Alloca->getType() != AI.CB->getType()) Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( @@ -6354,7 +6346,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { A.getAssumedConstant(V, AA, UsedAssumedInformation); if (!SimpleV) return APInt(64, 0); - if (auto *CI = dyn_cast_or_null<ConstantInt>(SimpleV.getValue())) + if (auto *CI = dyn_cast_or_null<ConstantInt>(SimpleV.value())) return CI->getValue(); return llvm::None; } @@ -6400,6 +6392,21 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { bool StackIsAccessibleByOtherThreads = A.getInfoCache().stackIsAccessibleByOtherThreads(); + LoopInfo *LI = + A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F); + Optional<bool> MayContainIrreducibleControl; + auto IsInLoop = [&](BasicBlock &BB) { + if (&F->getEntryBlock() == &BB) + return false; + if (!MayContainIrreducibleControl.has_value()) + MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI); + if (MayContainIrreducibleControl.value()) + return true; + if (!LI) + return true; + return LI->getLoopFor(&BB) != nullptr; + }; + // Flag to ensure we update our deallocation information at most once per // updateImpl call and only if we use the free check reasoning. bool HasUpdatedFrees = false; @@ -6617,21 +6624,20 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { AI.Status = AllocationInfo::INVALID; Changed = ChangeStatus::CHANGED; continue; - } else { - if (APAlign->ugt(llvm::Value::MaximumAlignment) || - !APAlign->isPowerOf2()) { - LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign - << "\n"); - AI.Status = AllocationInfo::INVALID; - Changed = ChangeStatus::CHANGED; - continue; - } + } + if (APAlign->ugt(llvm::Value::MaximumAlignment) || + !APAlign->isPowerOf2()) { + LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign + << "\n"); + AI.Status = AllocationInfo::INVALID; + Changed = ChangeStatus::CHANGED; + continue; } } + Optional<APInt> Size = getSize(A, *this, AI); if (MaxHeapToStackSize != -1) { - Optional<APInt> Size = getSize(A, *this, AI); - if (!Size || Size.getValue().ugt(MaxHeapToStackSize)) { + if (!Size || Size.value().ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ if (!Size) dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n"; @@ -6649,18 +6655,23 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { switch (AI.Status) { case AllocationInfo::STACK_DUE_TO_USE: if (UsesCheck(AI)) - continue; + break; AI.Status = AllocationInfo::STACK_DUE_TO_FREE; LLVM_FALLTHROUGH; case AllocationInfo::STACK_DUE_TO_FREE: if (FreeCheck(AI)) - continue; + break; AI.Status = AllocationInfo::INVALID; Changed = ChangeStatus::CHANGED; - continue; + break; case AllocationInfo::INVALID: llvm_unreachable("Invalid allocations should never reach this point!"); }; + + // Check if we still think we can move it into the entry block. 
+ if (AI.MoveAllocaIntoEntry && + (!Size.has_value() || IsInLoop(*AI.CB->getParent()))) + AI.MoveAllocaIntoEntry = false; } return Changed; @@ -6748,8 +6759,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: "; - if (CSTy && CSTy.getValue()) - CSTy.getValue()->print(dbgs()); + if (CSTy && CSTy.value()) + CSTy.value()->print(dbgs()); else if (CSTy) dbgs() << "<nullptr>"; else @@ -6760,8 +6771,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << " : New Type: "; - if (Ty && Ty.getValue()) - Ty.getValue()->print(dbgs()); + if (Ty && Ty.value()) + Ty.value()->print(dbgs()); else if (Ty) dbgs() << "<nullptr>"; else @@ -6769,7 +6780,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { dbgs() << "\n"; }); - return !Ty || Ty.getValue(); + return !Ty || Ty.value(); }; if (!A.checkForAllCallSites(CallSiteCheck, *this, true, @@ -6783,7 +6794,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { PrivatizableType = identifyPrivatizableType(A); if (!PrivatizableType) return ChangeStatus::UNCHANGED; - if (!PrivatizableType.getValue()) + if (!PrivatizableType.value()) return indicatePessimisticFixpoint(); // The dependence is optional so we don't give up once we give up on the @@ -6871,7 +6882,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType(); if (!CBArgPrivTy) continue; - if (CBArgPrivTy.getValue() == PrivatizableType) + if (CBArgPrivTy.value() == PrivatizableType) continue; } @@ -6918,7 +6929,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType(); if (!DCArgPrivTy) return true; - if (DCArgPrivTy.getValue() == PrivatizableType) + if (DCArgPrivTy.value() == PrivatizableType) return true; } } @@ -7060,7 +7071,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { ChangeStatus manifest(Attributor &A) override { if (!PrivatizableType) return ChangeStatus::UNCHANGED; - assert(PrivatizableType.getValue() && "Expected privatizable type!"); + assert(PrivatizableType.value() && "Expected privatizable type!"); // Collect all tail calls in the function as we cannot allow new allocas to // escape into tail recursion. 
@@ -7093,9 +7104,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Instruction *IP = &*EntryBB.getFirstInsertionPt(); const DataLayout &DL = IP->getModule()->getDataLayout(); unsigned AS = DL.getAllocaAddrSpace(); - Instruction *AI = new AllocaInst(PrivatizableType.getValue(), AS, + Instruction *AI = new AllocaInst(PrivatizableType.value(), AS, Arg->getName() + ".priv", IP); - createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn, + createInitialization(PrivatizableType.value(), *AI, ReplacementFn, ArgIt->getArgNo(), *IP); if (AI->getType() != Arg->getType()) @@ -7203,7 +7214,7 @@ struct AAPrivatizablePtrCallSiteArgument final PrivatizableType = identifyPrivatizableType(A); if (!PrivatizableType) return ChangeStatus::UNCHANGED; - if (!PrivatizableType.getValue()) + if (!PrivatizableType.value()) return indicatePessimisticFixpoint(); const IRPosition &IRP = getIRPosition(); @@ -8664,7 +8675,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -8673,7 +8684,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -8717,7 +8728,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedOpV) return true; - if (!SimplifiedOpV.getValue()) + if (!SimplifiedOpV.value()) return false; OpV = *SimplifiedOpV; @@ -8747,7 +8758,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -8756,7 +8767,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -8821,7 +8832,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedOpV) return true; - if (!SimplifiedOpV.getValue()) + if (!SimplifiedOpV.value()) return false; Value *VPtr = *SimplifiedOpV; @@ -9182,7 +9193,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9191,7 +9202,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9265,7 +9276,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9274,7 +9285,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if 
(!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9340,7 +9351,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedSrc) return ChangeStatus::UNCHANGED; - if (!SimplifiedSrc.getValue()) + if (!SimplifiedSrc.value()) return indicatePessimisticFixpoint(); Src = *SimplifiedSrc; @@ -9373,7 +9384,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9382,7 +9393,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9441,7 +9452,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { UsedAssumedInformation); if (!SimplifiedIncomingValue) continue; - if (!SimplifiedIncomingValue.getValue()) + if (!SimplifiedIncomingValue.value()) return indicatePessimisticFixpoint(); IncomingValue = *SimplifiedIncomingValue; @@ -9930,7 +9941,7 @@ private: const Function &Fn) { Optional<bool> Cached = isCachedReachable(Fn); if (Cached) - return Cached.getValue(); + return Cached.value(); // The query was not cached, thus it is new. We need to request an update // explicitly to make sure this the information is properly run to a diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 1a1bde4f0668..1ad6e2b2a1d2 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1584,11 +1584,6 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } Value *StoredOnceValue = GS.getStoredOnceValue(); if (GS.StoredType == GlobalStatus::StoredOnce && StoredOnceValue) { - // Avoid speculating constant expressions that might trap (div/rem). - auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); - if (SOVConstant && SOVConstant->canTrap()) - return Changed; - Function &StoreFn = const_cast<Function &>(*GS.StoredOnceStore->getFunction()); bool CanHaveNonUndefGlobalInitializer = @@ -1601,6 +1596,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // This is restricted to address spaces that allow globals to have // initializers. NVPTX, for example, does not support initializers for // shared memory (AS 3). + auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); if (SOVConstant && isa<UndefValue>(GV->getInitializer()) && DL.getTypeAllocSize(SOVConstant->getType()) == DL.getTypeAllocSize(GV->getValueType()) && diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index d75d99e307fd..28bc43aa1633 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -555,7 +555,7 @@ collectRegionsConstants(OutlinableRegion &Region, for (Value *V : ID.OperVals) { Optional<unsigned> GVNOpt = C.getGVN(V); assert(GVNOpt && "Expected a GVN for operand?"); - unsigned GVN = GVNOpt.getValue(); + unsigned GVN = GVNOpt.value(); // Check if this global value has been found to not be the same already. 
if (NotSame.contains(GVN)) { @@ -570,7 +570,7 @@ collectRegionsConstants(OutlinableRegion &Region, // it is considered to not be the same value. Optional<bool> ConstantMatches = constantMatches(V, GVN, GVNToConstant); if (ConstantMatches) { - if (ConstantMatches.getValue()) + if (ConstantMatches.value()) continue; else ConstantsTheSame = false; @@ -651,7 +651,7 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, // Transfer the swifterr attribute to the correct function parameter. if (Group.SwiftErrorArgument) - Group.OutlinedFunction->addParamAttr(Group.SwiftErrorArgument.getValue(), + Group.OutlinedFunction->addParamAttr(Group.SwiftErrorArgument.value(), Attribute::SwiftError); Group.OutlinedFunction->addFnAttr(Attribute::OptimizeForSize); @@ -809,7 +809,7 @@ static void mapInputsToGVNs(IRSimilarityCandidate &C, if (OutputMappings.find(Input) != OutputMappings.end()) Input = OutputMappings.find(Input)->second; assert(C.getGVN(Input) && "Could not find a numbering for the given input"); - EndInputNumbers.push_back(C.getGVN(Input).getValue()); + EndInputNumbers.push_back(C.getGVN(Input).value()); } } @@ -948,11 +948,11 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, for (unsigned InputVal : InputGVNs) { Optional<unsigned> CanonicalNumberOpt = C.getCanonicalNum(InputVal); assert(CanonicalNumberOpt && "Canonical number not found?"); - unsigned CanonicalNumber = CanonicalNumberOpt.getValue(); + unsigned CanonicalNumber = CanonicalNumberOpt.value(); Optional<Value *> InputOpt = C.fromGVN(InputVal); assert(InputOpt && "Global value number not found?"); - Value *Input = InputOpt.getValue(); + Value *Input = InputOpt.value(); DenseMap<unsigned, unsigned>::iterator AggArgIt = Group.CanonicalNumberToAggArg.find(CanonicalNumber); @@ -1236,13 +1236,13 @@ static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region, Optional<unsigned> BBGVN = Cand.getGVN(PHIBB); assert(BBGVN && "Could not find GVN for the incoming block!"); - BBGVN = Cand.getCanonicalNum(BBGVN.getValue()); + BBGVN = Cand.getCanonicalNum(BBGVN.value()); assert(BBGVN && "Could not find canonical number for the incoming block!"); // Create a pair of the exit block canonical value, and the aggregate // argument location, connected to the canonical numbers stored in the // PHINode. PHINodeData TemporaryPair = - std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs); + std::make_pair(std::make_pair(BBGVN.value(), AggArgIdx), PHIGVNs); hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair); // Look for and create a new entry in our connection between canonical @@ -1516,8 +1516,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // Make sure that the argument in the new function has the SwiftError // argument. 
if (Group.SwiftErrorArgument) - Call->addParamAttr(Group.SwiftErrorArgument.getValue(), - Attribute::SwiftError); + Call->addParamAttr(Group.SwiftErrorArgument.value(), Attribute::SwiftError); return Call; } @@ -2082,9 +2081,9 @@ static void alignOutputBlockWithAggFunc( if (MatchingBB) { LLVM_DEBUG(dbgs() << "Set output block for region in function" << Region.ExtractedFunction << " to " - << MatchingBB.getValue()); + << MatchingBB.value()); - Region.OutputBlockNum = MatchingBB.getValue(); + Region.OutputBlockNum = MatchingBB.value(); for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) VtoBB.second->eraseFromParent(); return; @@ -2679,15 +2678,14 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region, if (!OutputIdx) return; - if (OutputMappings.find(Outputs[OutputIdx.getValue()]) == - OutputMappings.end()) { + if (OutputMappings.find(Outputs[OutputIdx.value()]) == OutputMappings.end()) { LLVM_DEBUG(dbgs() << "Mapping extracted output " << *LI << " to " - << *Outputs[OutputIdx.getValue()] << "\n"); - OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.getValue()])); + << *Outputs[OutputIdx.value()] << "\n"); + OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.value()])); } else { - Value *Orig = OutputMappings.find(Outputs[OutputIdx.getValue()])->second; + Value *Orig = OutputMappings.find(Outputs[OutputIdx.value()])->second; LLVM_DEBUG(dbgs() << "Mapping extracted output " << *Orig << " to " - << *Outputs[OutputIdx.getValue()] << "\n"); + << *Outputs[OutputIdx.value()] << "\n"); OutputMappings.insert(std::make_pair(LI, Orig)); } } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 227ad8501f25..8e0ca8c6c997 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3340,6 +3340,9 @@ struct AAKernelInfoFunction : AAKernelInfo { } bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { + if (!mayContainParallelRegion()) + return false; + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); if (!SPMDCompatibilityTracker.isAssumed()) { @@ -4428,10 +4431,10 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { if (!SimplifiedValue) return Str + std::string("none"); - if (!SimplifiedValue.getValue()) + if (!SimplifiedValue.value()) return Str + std::string("nullptr"); - if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue())) + if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.value())) return Str + std::to_string(CI->getSExtValue()); return Str + std::string("unknown"); @@ -4456,7 +4459,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional<Value *> { assert((isValidState() || - (SimplifiedValue && SimplifiedValue.getValue() == nullptr)) && + (SimplifiedValue && SimplifiedValue.value() == nullptr)) && "Unexpected invalid state!"); if (!isAtFixpoint()) { diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index ae787be40c55..8eef82675e86 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -898,183 +898,6 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createAnnotationRemarksLegacyPass()); } -void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { - // Load sample profile before running the LTO optimization pipeline. 
- if (!PGOSampleUse.empty()) { - PM.add(createPruneEHPass()); - PM.add(createSampleProfileLoaderPass(PGOSampleUse)); - } - - // Remove unused virtual tables to improve the quality of code generated by - // whole-program devirtualization and bitset lowering. - PM.add(createGlobalDCEPass()); - - // Provide AliasAnalysis services for optimizations. - addInitialAliasAnalysisPasses(PM); - - // Allow forcing function attributes as a debugging and tuning aid. - PM.add(createForceFunctionAttrsLegacyPass()); - - // Infer attributes about declarations if possible. - PM.add(createInferFunctionAttrsLegacyPass()); - - if (OptLevel > 1) { - // Split call-site with more constrained arguments. - PM.add(createCallSiteSplittingPass()); - - // Propage constant function arguments by specializing the functions. - if (EnableFunctionSpecialization && OptLevel > 2) - PM.add(createFunctionSpecializationPass()); - - // Propagate constants at call sites into the functions they call. This - // opens opportunities for globalopt (and inlining) by substituting function - // pointers passed as arguments to direct uses of functions. - PM.add(createIPSCCPPass()); - - // Attach metadata to indirect call sites indicating the set of functions - // they may target at run-time. This should follow IPSCCP. - PM.add(createCalledValuePropagationPass()); - - // Infer attributes on declarations, call sites, arguments, etc. - if (AttributorRun & AttributorRunOption::MODULE) - PM.add(createAttributorLegacyPass()); - } - - // Infer attributes about definitions. The readnone attribute in particular is - // required for virtual constant propagation. - PM.add(createPostOrderFunctionAttrsLegacyPass()); - PM.add(createReversePostOrderFunctionAttrsPass()); - - // Split globals using inrange annotations on GEP indices. This can help - // improve the quality of generated code when virtual constant propagation or - // control flow integrity are enabled. - PM.add(createGlobalSplitPass()); - - // Apply whole-program devirtualization and virtual constant propagation. - PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); - - // That's all we need at opt level 1. - if (OptLevel == 1) - return; - - // Now that we internalized some globals, see if we can hack on them! - PM.add(createGlobalOptimizerPass()); - // Promote any localized global vars. - PM.add(createPromoteMemoryToRegisterPass()); - - // Linking modules together can lead to duplicated global constants, only - // keep one copy of each constant. - PM.add(createConstantMergePass()); - - // Remove unused arguments from functions. - PM.add(createDeadArgEliminationPass()); - - // Reduce the code after globalopt and ipsccp. Both can open up significant - // simplification opportunities, and both can propagate functions through - // function pointers. When this happens, we often have to resolve varargs - // calls, etc, so let instcombine do this. - if (OptLevel > 2) - PM.add(createAggressiveInstCombinerPass()); - PM.add(createInstructionCombiningPass()); - addExtensionsToPM(EP_Peephole, PM); - - // Inline small functions - bool RunInliner = Inliner; - if (RunInliner) { - PM.add(Inliner); - Inliner = nullptr; - } - - PM.add(createPruneEHPass()); // Remove dead EH info. - - // Infer attributes on declarations, call sites, arguments, etc. for an SCC. - if (AttributorRun & AttributorRunOption::CGSCC) - PM.add(createAttributorCGSCCLegacyPass()); - - // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if - // there are no OpenMP runtime calls present in the module. 
- if (OptLevel > 1) - PM.add(createOpenMPOptCGSCCLegacyPass()); - - // Optimize globals again if we ran the inliner. - if (RunInliner) - PM.add(createGlobalOptimizerPass()); - PM.add(createGlobalDCEPass()); // Remove dead functions. - - // The IPO passes may leave cruft around. Clean up after them. - PM.add(createInstructionCombiningPass()); - addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass()); - - // Break up allocas - PM.add(createSROAPass()); - - // LTO provides additional opportunities for tailcall elimination due to - // link-time inlining, and visibility of nocapture attribute. - if (OptLevel > 1) - PM.add(createTailCallEliminationPass()); - - // Infer attributes on declarations, call sites, arguments, etc. - PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture. - // Run a few AA driven optimizations here and now, to cleanup the code. - PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, - /*AllowSpeculation=*/true)); - PM.add(NewGVN ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. - PM.add(createMemCpyOptPass()); // Remove dead memcpys. - - // Nuke dead stores. - PM.add(createDeadStoreEliminationPass()); - PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. - - // More loops are countable; try to optimize them. - if (EnableLoopFlatten) - PM.add(createLoopFlattenPass()); - PM.add(createIndVarSimplifyPass()); - PM.add(createLoopDeletionPass()); - if (EnableLoopInterchange) - PM.add(createLoopInterchangePass()); - - if (EnableConstraintElimination) - PM.add(createConstraintEliminationPass()); - - // Unroll small loops and perform peeling. - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopDistributePass()); - - addVectorPasses(PM, /* IsFullLTO */ true); - - addExtensionsToPM(EP_Peephole, PM); - - PM.add(createJumpThreadingPass()); -} - -void PassManagerBuilder::addLateLTOOptimizationPasses( - legacy::PassManagerBase &PM) { - // See comment in the new PM for justification of scheduling splitting at - // this stage (\ref buildLTODefaultPipeline). - if (EnableHotColdSplit) - PM.add(createHotColdSplittingPass()); - - // Delete basic blocks, which optimization passes may have killed. - PM.add( - createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true))); - - // Drop bodies of available externally objects to improve GlobalDCE. - PM.add(createEliminateAvailableExternallyPass()); - - // Now that we have optimized the program, discard unreachable functions. - PM.add(createGlobalDCEPass()); - - // FIXME: this is profitable (for compiler time) to do at -O0 too, but - // currently it damages debug info. 
- if (MergeFunctions) - PM.add(createMergeFunctionsPass()); -} - LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() { PassManagerBuilder *PMB = new PassManagerBuilder(); return wrap(PMB); diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 6859953de962..764fd57d245f 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -130,7 +130,7 @@ void ContextTrieNode::addFunctionSize(uint32_t FSize) { if (!FuncSize) FuncSize = 0; - FuncSize = FuncSize.getValue() + FSize; + FuncSize = FuncSize.value() + FSize; } LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; } diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 40de69bbf2cf..55fee213cd5f 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1350,14 +1350,14 @@ SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { Optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); - return Cost ? !!Cost.getValue() : false; + return Cost ? !!Cost.value() : false; } InlineCost SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { if (Optional<InlineCost> ReplayCost = getExternalInlineAdvisorCost(*Candidate.CallInstr)) - return ReplayCost.getValue(); + return ReplayCost.value(); // Adjust threshold based on call site hotness, only do this for callsite // prioritized inliner because otherwise cost-benefit check is done earlier. int SampleThreshold = SampleColdCallSiteThreshold; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index f4d8b79a5311..535a7736454c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1660,8 +1660,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { Constant *MulC; if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)), m_Deferred(X)))) { - MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0)); - return BinaryOperator::CreateFMulFMF(X, MulC, &I); + if (Constant *NewMulC = ConstantFoldBinaryOpOperands( + Instruction::FAdd, MulC, ConstantFP::get(I.getType(), 1.0), DL)) + return BinaryOperator::CreateFMulFMF(X, NewMulC, &I); } if (Value *V = FAddCombine(Builder).simplify(&I)) @@ -1750,6 +1751,52 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS, return Builder.CreateIntCast(Result, Ty, true); } +static Instruction *foldSubOfMinMax(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op1); + if (!MinMax) + return nullptr; + + // sub(add(X,Y), s/umin(X,Y)) --> s/umax(X,Y) + // sub(add(X,Y), s/umax(X,Y)) --> s/umin(X,Y) + Value *X = MinMax->getLHS(); + Value *Y = MinMax->getRHS(); + if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); + Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + return CallInst::Create(F, {X, Y}); + } + + // sub(add(X,Y),umin(Y,Z)) --> add(X,usub.sat(Y,Z)) + // sub(add(X,Z),umin(Y,Z)) --> add(X,usub.sat(Z,Y)) + Value *Z; + if (match(Op1, 
m_OneUse(m_UMin(m_Value(Y), m_Value(Z))))) { + if (match(Op0, m_OneUse(m_c_Add(m_Specific(Y), m_Value(X))))) { + Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, Ty, {Y, Z}); + return BinaryOperator::CreateAdd(X, USub); + } + if (match(Op0, m_OneUse(m_c_Add(m_Specific(Z), m_Value(X))))) { + Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, Ty, {Z, Y}); + return BinaryOperator::CreateAdd(X, USub); + } + } + + // sub Op0, smin((sub nsw Op0, Z), 0) --> smax Op0, Z + // sub Op0, smax((sub nsw Op0, Z), 0) --> smin Op0, Z + if (MinMax->isSigned() && match(Y, m_ZeroInt()) && + match(X, m_NSWSub(m_Specific(Op0), m_Value(Z)))) { + Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); + Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + return CallInst::Create(F, {Op0, Z}); + } + + return nullptr; +} + Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Value *V = simplifySubInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), @@ -1919,14 +1966,12 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2)); } + // If there's no chance any bit will need to borrow from an adjacent bit: + // sub C, X --> xor X, C const APInt *Op0C; - if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) { - // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known - // zero. - KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); - if ((*Op0C | RHSKnown.Zero).isAllOnes()) - return BinaryOperator::CreateXor(Op1, Op0); - } + if (match(Op0, m_APInt(Op0C)) && + (~computeKnownBits(Op1, 0, &I).Zero).isSubsetOf(*Op0C)) + return BinaryOperator::CreateXor(Op1, Op0); { Value *Y; @@ -2016,36 +2061,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { } } - if (auto *II = dyn_cast<MinMaxIntrinsic>(Op1)) { - { - // sub(add(X,Y), s/umin(X,Y)) --> s/umax(X,Y) - // sub(add(X,Y), s/umax(X,Y)) --> s/umin(X,Y) - Value *X = II->getLHS(); - Value *Y = II->getRHS(); - if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && - (Op0->hasOneUse() || Op1->hasOneUse())) { - Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID()); - Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y); - return replaceInstUsesWith(I, InvMaxMin); - } - } - - { - // sub(add(X,Y),umin(Y,Z)) --> add(X,usub.sat(Y,Z)) - // sub(add(X,Z),umin(Y,Z)) --> add(X,usub.sat(Z,Y)) - Value *X, *Y, *Z; - if (match(Op1, m_OneUse(m_UMin(m_Value(Y), m_Value(Z))))) { - if (match(Op0, m_OneUse(m_c_Add(m_Specific(Y), m_Value(X))))) - return BinaryOperator::CreateAdd( - X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(), - {Y, Z})); - if (match(Op0, m_OneUse(m_c_Add(m_Specific(Z), m_Value(X))))) - return BinaryOperator::CreateAdd( - X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(), - {Z, Y})); - } - } - } + if (Instruction *R = foldSubOfMinMax(I, Builder)) + return R; { // If we have a subtraction between some value and a select between @@ -2437,13 +2454,15 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { // (X * C) - X --> X * (C - 1.0) if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) { - Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0)); - return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I); + if (Constant *CSubOne = ConstantFoldBinaryOpOperands( + Instruction::FSub, C, ConstantFP::get(Ty, 1.0), DL)) + return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I); } // X - (X * C) --> X * (1.0 - C) if (match(Op1, 
m_FMul(m_Specific(Op0), m_Constant(C)))) { - Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C); - return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I); + if (Constant *OneSubC = ConstantFoldBinaryOpOperands( + Instruction::FSub, ConstantFP::get(Ty, 1.0), C, DL)) + return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I); } // Reassociate fsub/fadd sequences to create more fadd instructions and diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ae8865651ece..a8f2cd79830a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1771,6 +1771,16 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { return new ZExtInst(IsZero, Ty); } + // (-(X & 1)) & Y --> (X & 1) == 0 ? 0 : Y + Value *Neg; + if (match(&I, + m_c_And(m_CombineAnd(m_Value(Neg), + m_OneUse(m_Neg(m_And(m_Value(), m_One())))), + m_Value(Y)))) { + Value *Cmp = Builder.CreateIsNull(Neg); + return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Y); + } + const APInt *C; if (match(Op1, m_APInt(C))) { const APInt *XorC; @@ -1798,7 +1808,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { unsigned Width = Ty->getScalarSizeInBits(); const APInt *ShiftC; - if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) { + if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC))))) && + ShiftC->ult(Width)) { if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) { // We are clearing high bits that were potentially set by sext+ashr: // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 2540e545ae4d..0327efbf9614 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -61,7 +61,13 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) { /// equivalent to its value operand. bool isSaturating(AtomicRMWInst& RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) - switch(RMWI.getOperation()) { + switch (RMWI.getOperation()) { + case AtomicRMWInst::FMax: + // maxnum(x, +inf) -> +inf + return !CF->isNegative() && CF->isInfinity(); + case AtomicRMWInst::FMin: + // minnum(x, -inf) -> +inf + return CF->isNegative() && CF->isInfinity(); case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: return CF->isNaN(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 67ef2e895b6c..edfdf70c2b97 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1543,7 +1543,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { !ShAmtC->containsConstantExpression()) { // Canonicalize a shift amount constant operand to modulo the bit-width. 
Constant *WidthC = ConstantInt::get(Ty, BitWidth); - Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); + Constant *ModuloC = + ConstantFoldBinaryOpOperands(Instruction::URem, ShAmtC, WidthC, DL); + if (!ModuloC) + return nullptr; if (ModuloC != ShAmtC) return replaceOperand(*II, 2, ModuloC); @@ -2679,7 +2682,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Handle target specific intrinsics Optional<Instruction *> V = targetInstCombineIntrinsic(*II); if (V) - return V.getValue(); + return V.value(); break; } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index e9e779b8619b..a9a930555b3c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1756,11 +1756,12 @@ static bool isKnownExactCastIntToFP(CastInst &I, InstCombinerImpl &IC) { // TODO: // Try harder to find if the source integer type has less significant bits. - // For example, compute number of sign bits or compute low bit mask. + // For example, compute number of sign bits. KnownBits SrcKnown = IC.computeKnownBits(Src, 0, &I); - int LowBits = - (int)SrcTy->getScalarSizeInBits() - SrcKnown.countMinLeadingZeros(); - if (LowBits <= DestNumSigBits) + int SigBits = (int)SrcTy->getScalarSizeInBits() - + SrcKnown.countMinLeadingZeros() - + SrcKnown.countMinTrailingZeros(); + if (SigBits <= DestNumSigBits) return true; return false; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d1f89973caa1..9f6d36b85522 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1436,7 +1436,7 @@ Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) { // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...). Constant *C = dyn_cast<Constant>(Op1); - if (!C || C->canTrap()) + if (!C) return nullptr; if (auto *Phi = dyn_cast<PHINode>(Op0)) @@ -1777,11 +1777,16 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, return new ICmpInst(NewPred, X, Zero); } + APInt NewC2 = *C2; + KnownBits Know = computeKnownBits(And->getOperand(0), 0, And); + // Set high zeros of C2 to allow matching negated power-of-2. + NewC2 = *C2 + APInt::getHighBitsSet(C2->getBitWidth(), + Know.countMinLeadingZeros()); + // Restrict this fold only for single-use 'and' (PR10267). // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two. - if ((~(*C2) + 1).isPowerOf2()) { - Constant *NegBOC = - ConstantExpr::getNeg(cast<Constant>(And->getOperand(1))); + if (NewC2.isNegatedPowerOf2()) { + Constant *NegBOC = ConstantInt::get(And->getType(), -NewC2); auto NewPred = isICMP_NE ? 
ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; return new ICmpInst(NewPred, X, NegBOC); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 2a34edbf6cb8..8cb09cbac86f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -505,20 +505,23 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { Constant *C1; if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { // (C1 / X) * C --> (C * C1) / X - Constant *CC1 = ConstantExpr::getFMul(C, C1); - if (CC1->isNormalFP()) + Constant *CC1 = + ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL); + if (CC1 && CC1->isNormalFP()) return BinaryOperator::CreateFDivFMF(CC1, X, &I); } if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { // (X / C1) * C --> X * (C / C1) - Constant *CDivC1 = ConstantExpr::getFDiv(C, C1); - if (CDivC1->isNormalFP()) + Constant *CDivC1 = + ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C1, DL); + if (CDivC1 && CDivC1->isNormalFP()) return BinaryOperator::CreateFMulFMF(X, CDivC1, &I); // If the constant was a denormal, try reassociating differently. // (X / C1) * C --> X / (C1 / C) - Constant *C1DivC = ConstantExpr::getFDiv(C1, C); - if (Op0->hasOneUse() && C1DivC->isNormalFP()) + Constant *C1DivC = + ConstantFoldBinaryOpOperands(Instruction::FDiv, C1, C, DL); + if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP()) return BinaryOperator::CreateFDivFMF(X, C1DivC, &I); } @@ -527,15 +530,19 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { // further folds and (X * C) + C2 is 'fma'. if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) { // (X + C1) * C --> (X * C) + (C * C1) - Constant *CC1 = ConstantExpr::getFMul(C, C1); - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + if (Constant *CC1 = ConstantFoldBinaryOpOperands( + Instruction::FMul, C, C1, DL)) { + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + } } if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) { // (C1 - X) * C --> (C * C1) - (X * C) - Constant *CC1 = ConstantExpr::getFMul(C, C1); - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + if (Constant *CC1 = ConstantFoldBinaryOpOperands( + Instruction::FMul, C, C1, DL)) { + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + } } } @@ -1232,8 +1239,10 @@ static Instruction *foldFDivConstantDivisor(BinaryOperator &I) { // on all targets. // TODO: Use Intrinsic::canonicalize or let function attributes tell us that // denorms are flushed? - auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C); - if (!RecipC->isNormalFP()) + const DataLayout &DL = I.getModule()->getDataLayout(); + auto *RecipC = ConstantFoldBinaryOpOperands( + Instruction::FDiv, ConstantFP::get(I.getType(), 1.0), C, DL); + if (!RecipC || !RecipC->isNormalFP()) return nullptr; // X / C --> X * (1 / C) @@ -1256,12 +1265,13 @@ static Instruction *foldFDivConstantDividend(BinaryOperator &I) { // Try to reassociate C / X expressions where X includes another constant. 
Constant *C2, *NewC = nullptr; + const DataLayout &DL = I.getModule()->getDataLayout(); if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) { // C / (X * C2) --> (C / C2) / X - NewC = ConstantExpr::getFDiv(C, C2); + NewC = ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C2, DL); } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) { // C / (X / C2) --> (C * C2) / X - NewC = ConstantExpr::getFMul(C, C2); + NewC = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C2, DL); } // Disallow denormal constants because we don't know what would happen // on all targets. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 9d4c01ac03e2..febd0f51d25f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -925,7 +925,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic( *II, DemandedMask, Known, KnownBitsComputed); if (V) - return V.getValue(); + return V.value(); break; } } @@ -1636,7 +1636,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, *II, DemandedElts, UndefElts, UndefElts2, UndefElts3, simplifyAndSetOp); if (V) - return V.getValue(); + return V.value(); break; } } // switch on IntrinsicID diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 22659a8e4951..b80c58183dd5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -228,8 +228,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { // truncate a subset of scalar bits of an insert op. if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) { Value *Scalar; + Value *Vec; uint64_t InsIndexC; - if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar), + if (!match(X, m_InsertElt(m_Value(Vec), m_Value(Scalar), m_ConstantInt(InsIndexC)))) return nullptr; @@ -239,8 +240,19 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { // of elements 4-7 of the bitcasted vector. unsigned NarrowingRatio = NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue(); - if (ExtIndexC / NarrowingRatio != InsIndexC) + + if (ExtIndexC / NarrowingRatio != InsIndexC) { + // Remove insertelement, if we don't use the inserted element. + // extractelement (bitcast (insertelement (Vec, b)), a) -> + // extractelement (bitcast (Vec), a) + // FIXME: this should be removed to SimplifyDemandedVectorElts, + // once scale vectors are supported. + if (X->hasOneUse() && Ext.getVectorOperand()->hasOneUse()) { + Value *NewBC = Builder.CreateBitCast(Vec, Ext.getVectorOperandType()); + return ExtractElementInst::Create(NewBC, Ext.getIndexOperand()); + } return nullptr; + } // We are extracting part of the original scalar. How that scalar is // inserted into the vector depends on the endian-ness. 
Example: diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 0816a4a575d9..75520a0c8d5f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -523,11 +523,12 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" // if C1 and C2 are constants. Value *A, *B; - Constant *C1, *C2; + Constant *C1, *C2, *CRes; if (Op0 && Op1 && Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode && match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) && - match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) { + match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2)))) && + (CRes = ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL))) { bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0) && hasNoUnsignedWrap(*Op1); @@ -544,7 +545,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { InsertNewInstWith(NewBO, I); NewBO->takeName(Op1); replaceOperand(I, 0, NewBO); - replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2)); + replaceOperand(I, 1, CRes); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. ClearSubclassDataAfterReassociation(I); @@ -1324,6 +1325,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter)) return nullptr; + // Fold constants for the predecessor block with constant incoming values. + Constant *NewC = ConstantFoldBinaryOpOperands(BO.getOpcode(), C0, C1, DL); + if (!NewC) + return nullptr; + // Make a new binop in the predecessor block with the non-constant incoming // values. Builder.SetInsertPoint(PredBlockBranch); @@ -1333,9 +1339,6 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO)) NotFoldedNewBO->copyIRFlags(&BO); - // Fold constants for the predecessor block with constant incoming values. - Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1); - // Replace the binop with a phi of the new values. The old phis are dead. PHINode *NewPhi = PHINode::Create(BO.getType(), 2); NewPhi->addIncoming(NewBO, OtherBB); @@ -1774,9 +1777,10 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { // for target-independent shuffle creation. if (I >= SrcVecNumElts || ShMask[I] < 0) { Constant *MaybeUndef = - ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) - : ConstantExpr::get(Opcode, CElt, UndefScalar); - if (!match(MaybeUndef, m_Undef())) { + ConstOp1 + ? 
ConstantFoldBinaryOpOperands(Opcode, UndefScalar, CElt, DL) + : ConstantFoldBinaryOpOperands(Opcode, CElt, UndefScalar, DL); + if (!MaybeUndef || !match(MaybeUndef, m_Undef())) { MayChange = false; break; } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 7a5a74aa4fff..4fed4bd18fb1 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -101,6 +101,7 @@ static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL; static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000; static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44; static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52; +static const uint64_t kMIPS_ShadowOffsetN32 = 1ULL << 29; static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; @@ -476,6 +477,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, TargetTriple.getArch() == Triple::ppc64le; bool IsSystemZ = TargetTriple.getArch() == Triple::systemz; bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64; + bool IsMIPSN32ABI = TargetTriple.getEnvironment() == Triple::GNUABIN32; bool IsMIPS32 = TargetTriple.isMIPS32(); bool IsMIPS64 = TargetTriple.isMIPS64(); bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb(); @@ -496,6 +498,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, if (LongSize == 32) { if (IsAndroid) Mapping.Offset = kDynamicShadowSentinel; + else if (IsMIPSN32ABI) + Mapping.Offset = kMIPS_ShadowOffsetN32; else if (IsMIPS32) Mapping.Offset = kMIPS32_ShadowOffset32; else if (IsFreeBSD) diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp index b11b84d65d23..57c491436b93 100644 --- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -39,7 +39,8 @@ addModuleFlags(Module &M, Nodes.push_back(MDNode::get(Context, Vals)); } - M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes)); + M.addModuleFlag(Module::Append, "CG Profile", + MDTuple::getDistinct(Context, Nodes)); return true; } diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 218b4bbfb6c0..b01c74320380 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -180,11 +180,31 @@ static cl::opt<bool> ClWithTls( "platforms that support this"), cl::Hidden, cl::init(true)); -static cl::opt<bool> - ClRecordStackHistory("hwasan-record-stack-history", - cl::desc("Record stack frames with tagged allocations " - "in a thread-local ring buffer"), - cl::Hidden, cl::init(true)); +// Mode for selecting how to insert frame record info into the stack ring +// buffer. +enum RecordStackHistoryMode { + // Do not record frame record info. + none, + + // Insert instructions into the prologue for storing into the stack ring + // buffer directly. + instr, + + // Add a call to __hwasan_add_frame_record in the runtime. 
+ libcall, +}; + +static cl::opt<RecordStackHistoryMode> ClRecordStackHistory( + "hwasan-record-stack-history", + cl::desc("Record stack frames with tagged allocations in a thread-local " + "ring buffer"), + cl::values(clEnumVal(none, "Do not record stack ring history"), + clEnumVal(instr, "Insert instructions into the prologue for " + "storing into the stack ring buffer directly"), + clEnumVal(libcall, "Add a call to __hwasan_add_frame_record for " + "storing into the stack ring buffer")), + cl::Hidden, cl::init(instr)); + static cl::opt<bool> ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics", cl::desc("instrument memory intrinsics"), @@ -313,6 +333,7 @@ public: Value *getPC(IRBuilder<> &IRB); Value *getSP(IRBuilder<> &IRB); + Value *getFrameRecordInfo(IRBuilder<> &IRB); void instrumentPersonalityFunctions(); @@ -378,6 +399,7 @@ private: FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; + FunctionCallee HwasanRecordFrameRecordFunc; Constant *ShadowGlobal; @@ -629,6 +651,9 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HwasanGenerateTagFunc = M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty); + HwasanRecordFrameRecordFunc = M.getOrInsertFunction( + "__hwasan_add_frame_record", IRB.getVoidTy(), Int64Ty); + ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow", ArrayType::get(IRB.getInt8Ty(), 0)); @@ -1132,6 +1157,21 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { return CachedSP; } +Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) { + // Prepare ring buffer data. + Value *PC = getPC(IRB); + Value *SP = getSP(IRB); + + // Mix SP and PC. + // Assumptions: + // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) + // SP is 0xsssssssssssSSSS0 (4 lower bits are zero) + // We only really need ~20 lower non-zero bits (SSSS), so we mix like this: + // 0xSSSSPPPPPPPPPPPP + SP = IRB.CreateShl(SP, 44); + return IRB.CreateOr(PC, SP); +} + void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { if (!Mapping.InTls) ShadowBase = getShadowNonTls(IRB); @@ -1141,50 +1181,67 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { if (!WithFrameRecord && ShadowBase) return; - Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); - assert(SlotPtr); - - Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. - Value *ThreadLongMaybeUntagged = - TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); + Value *SlotPtr = nullptr; + Value *ThreadLong = nullptr; + Value *ThreadLongMaybeUntagged = nullptr; + + auto getThreadLongMaybeUntagged = [&]() { + if (!SlotPtr) + SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); + if (!ThreadLong) + ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); + // Extract the address field from ThreadLong. Unnecessary on AArch64 with + // TBI. + return TargetTriple.isAArch64() ? ThreadLong + : untagPointer(IRB, ThreadLong); + }; if (WithFrameRecord) { - StackBaseTag = IRB.CreateAShr(ThreadLong, 3); - - // Prepare ring buffer data. - Value *PC = getPC(IRB); - Value *SP = getSP(IRB); - - // Mix SP and PC. - // Assumptions: - // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) - // SP is 0xsssssssssssSSSS0 (4 lower bits are zero) - // We only really need ~20 lower non-zero bits (SSSS), so we mix like this: - // 0xSSSSPPPPPPPPPPPP - SP = IRB.CreateShl(SP, 44); - - // Store data to ring buffer. 
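A worked example of the frame-record packing computed by getFrameRecordInfo above, with made-up addresses: PC keeps the low 48 bits and the useful low bits of SP land in the top 16 bits, giving the 0xSSSSPPPPPPPPPPPP layout from the comment.

#include <cstdint>
#include <cstdio>
int main() {
  uint64_t PC = 0x00007ffff7a12345ULL; // 48 meaningful bits, rest zero
  uint64_t SP = 0x00007fffffffd9b0ULL; // 4 lowest bits are zero
  uint64_t Record = PC | (SP << 44);   // same mix as getFrameRecordInfo
  std::printf("frame record: 0x%016llx\n", (unsigned long long)Record);
  return 0;
}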
- Value *RecordPtr = - IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0)); - IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr); - - // Update the ring buffer. Top byte of ThreadLong defines the size of the - // buffer in pages, it must be a power of two, and the start of the buffer - // must be aligned by twice that much. Therefore wrap around of the ring - // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). - // The use of AShr instead of LShr is due to - // https://bugs.llvm.org/show_bug.cgi?id=39030 - // Runtime library makes sure not to use the highest bit. - Value *WrapMask = IRB.CreateXor( - IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), - ConstantInt::get(IntptrTy, (uint64_t)-1)); - Value *ThreadLongNew = IRB.CreateAnd( - IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask); - IRB.CreateStore(ThreadLongNew, SlotPtr); + switch (ClRecordStackHistory) { + case libcall: { + // Emit a runtime call into hwasan rather than emitting instructions for + // recording stack history. + Value *FrameRecordInfo = getFrameRecordInfo(IRB); + IRB.CreateCall(HwasanRecordFrameRecordFunc, {FrameRecordInfo}); + break; + } + case instr: { + ThreadLongMaybeUntagged = getThreadLongMaybeUntagged(); + + StackBaseTag = IRB.CreateAShr(ThreadLong, 3); + + // Store data to ring buffer. + Value *FrameRecordInfo = getFrameRecordInfo(IRB); + Value *RecordPtr = IRB.CreateIntToPtr(ThreadLongMaybeUntagged, + IntptrTy->getPointerTo(0)); + IRB.CreateStore(FrameRecordInfo, RecordPtr); + + // Update the ring buffer. Top byte of ThreadLong defines the size of the + // buffer in pages, it must be a power of two, and the start of the buffer + // must be aligned by twice that much. Therefore wrap around of the ring + // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). + // The use of AShr instead of LShr is due to + // https://bugs.llvm.org/show_bug.cgi?id=39030 + // Runtime library makes sure not to use the highest bit. + Value *WrapMask = IRB.CreateXor( + IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), + ConstantInt::get(IntptrTy, (uint64_t)-1)); + Value *ThreadLongNew = IRB.CreateAnd( + IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask); + IRB.CreateStore(ThreadLongNew, SlotPtr); + break; + } + case none: { + llvm_unreachable( + "A stack history recording mode should've been selected."); + } + } } if (!ShadowBase) { + if (!ThreadLongMaybeUntagged) + ThreadLongMaybeUntagged = getThreadLongMaybeUntagged(); + // Get shadow base address by aligning RecordPtr up. // Note: this is not correct if the pointer is already aligned. // Runtime library will make sure this never happens. 
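And a worked example of the ring-buffer advance kept in the instr path above, with assumed values: the top byte of ThreadLong holds the buffer size in pages, so ~((ThreadLong >> 56) << 12) masks the write pointer back to the start of the buffer once it runs past the end.

#include <cassert>
#include <cstdint>
int main() {
  // Assumed: a one-page (0x1000-byte) ring buffer starting at 0x2000, with
  // the size-in-pages byte (1) stored in the top byte of ThreadLong.
  uint64_t ThreadLong = (1ULL << 56) | 0x2FF8; // pointer at the last 8-byte slot
  uint64_t WrapMask = ~((ThreadLong >> 56) << 12);
  uint64_t ThreadLongNew = (ThreadLong + 8) & WrapMask;
  assert(ThreadLongNew == ((1ULL << 56) | 0x2000)); // wrapped back to the start
  return 0;
}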
@@ -1408,7 +1465,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F, Instruction *InsertPt = &*F.getEntryBlock().begin(); IRBuilder<> EntryIRB(InsertPt); emitPrologue(EntryIRB, - /*WithFrameRecord*/ ClRecordStackHistory && + /*WithFrameRecord*/ ClRecordStackHistory != none && Mapping.WithFrameRecord && !SInfo.AllocasToInstrument.empty()); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7843b1522830..3572cb3b50e2 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1244,6 +1244,7 @@ bool InstrProfiling::emitRuntimeHook() { auto *Var = new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index c33b1b3b1a5c..d4aa31db8337 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -486,7 +486,7 @@ static bool isTsanAtomic(const Instruction *I) { if (!SSID) return false; if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return SSID.getValue() != SyncScope::SingleThread; + return SSID.value() != SyncScope::SingleThread; return true; } diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 8a1761505d59..fe6f9486ab0c 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -611,9 +611,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, ConstCand->ConstInt->getValue()); if (Diff) { const InstructionCost ImmCosts = - TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty); + TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.value(), Ty); Cost -= ImmCosts; - LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " " + LLVM_DEBUG(dbgs() << "Offset " << Diff.value() << " " << "has penalty: " << ImmCosts << "\n" << "Adjusted cost: " << Cost << "\n"); } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 783301fe589e..b460637b7d88 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -748,14 +748,14 @@ void GVNPass::printPipeline( OS << "<"; if (Options.AllowPRE != None) - OS << (Options.AllowPRE.getValue() ? "" : "no-") << "pre;"; + OS << (Options.AllowPRE.value() ? "" : "no-") << "pre;"; if (Options.AllowLoadPRE != None) - OS << (Options.AllowLoadPRE.getValue() ? "" : "no-") << "load-pre;"; + OS << (Options.AllowLoadPRE.value() ? "" : "no-") << "load-pre;"; if (Options.AllowLoadPRESplitBackedge != None) - OS << (Options.AllowLoadPRESplitBackedge.getValue() ? "" : "no-") + OS << (Options.AllowLoadPRESplitBackedge.value() ? "" : "no-") << "split-backedge-load-pre;"; if (Options.AllowMemDep != None) - OS << (Options.AllowMemDep.getValue() ? "" : "no-") << "memdep"; + OS << (Options.AllowMemDep.value() ? 
"" : "no-") << "memdep"; OS << ">"; } @@ -1059,8 +1059,8 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo, if (DT->dominates(cast<Instruction>(OtherAccess), cast<Instruction>(U))) OtherAccess = U; else - assert(DT->dominates(cast<Instruction>(U), - cast<Instruction>(OtherAccess))); + assert(U == OtherAccess || DT->dominates(cast<Instruction>(U), + cast<Instruction>(OtherAccess))); } else OtherAccess = U; } @@ -1494,14 +1494,6 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, return false; } - // FIXME: Can we support the fallthrough edge? - if (isa<CallBrInst>(Pred->getTerminator())) { - LLVM_DEBUG( - dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '" - << Pred->getName() << "': " << *Load << '\n'); - return false; - } - if (LoadBB->isEHPad()) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" @@ -2875,11 +2867,6 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<IndirectBrInst>(PREPred->getTerminator())) return false; - // Don't do PRE across callbr. - // FIXME: Can we do this across the fallthrough edge? - if (isa<CallBrInst>(PREPred->getTerminator())) - return false; - // We can't do PRE safely on a critical edge, so instead we schedule // the edge to be split and perform the PRE the next time we iterate // on the function. diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index e977dd18be9f..a9ca0bdc8f7b 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -106,13 +106,18 @@ static cl::opt<bool> VerifyIndvars( static cl::opt<ReplaceExitVal> ReplaceExitValue( "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), - cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"), - clEnumValN(OnlyCheapRepl, "cheap", - "only replace exit value when the cost is cheap"), - clEnumValN(NoHardUse, "noharduse", - "only replace exit values when loop def likely dead"), - clEnumValN(AlwaysRepl, "always", - "always replace exit value whenever possible"))); + cl::values( + clEnumValN(NeverRepl, "never", "never replace exit value"), + clEnumValN(OnlyCheapRepl, "cheap", + "only replace exit value when the cost is cheap"), + clEnumValN( + UnusedIndVarInLoop, "unusedindvarinloop", + "only replace exit value when it is an unused " + "induction variable in the loop and has cheap replacement cost"), + clEnumValN(NoHardUse, "noharduse", + "only replace exit values when loop def likely dead"), + clEnumValN(AlwaysRepl, "always", + "always replace exit value whenever possible"))); static cl::opt<bool> UsePostIncrementRanges( "indvars-post-increment-ranges", cl::Hidden, @@ -1302,15 +1307,39 @@ static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, } static void replaceLoopPHINodesWithPreheaderValues( - Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { + LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); auto *LoopPreheader = L->getLoopPreheader(); auto *LoopHeader = L->getHeader(); + SmallVector<Instruction *> Worklist; for (auto &PN : LoopHeader->phis()) { auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader); + for (User *U : PN.users()) + Worklist.push_back(cast<Instruction>(U)); PN.replaceAllUsesWith(PreheaderIncoming); DeadInsts.emplace_back(&PN); } + + // Replacing with 
the preheader value will often allow IV users to simplify + // (especially if the preheader value is a constant). + SmallPtrSet<Instruction *, 16> Visited; + while (!Worklist.empty()) { + auto *I = cast<Instruction>(Worklist.pop_back_val()); + if (!Visited.insert(I).second) + continue; + + // Don't simplify instructions outside the loop. + if (!L->contains(I)) + continue; + + Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout()); + if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { + for (User *U : I->users()) + Worklist.push_back(cast<Instruction>(U)); + I->replaceAllUsesWith(Res); + DeadInsts.emplace_back(I); + } + } } static void replaceWithInvariantCond( @@ -1549,14 +1578,19 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { if (!BI) return true; - // If already constant, nothing to do. - if (isa<Constant>(BI->getCondition())) - return true; - // Likewise, the loop latch must be dominated by the exiting BB. if (!DT->dominates(ExitingBB, L->getLoopLatch())) return true; + if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { + // If already constant, nothing to do. However, if this is an + // unconditional exit, we can still replace header phis with their + // preheader value. + if (!L->contains(BI->getSuccessor(CI->isNullValue()))) + replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts); + return true; + } + return false; }); @@ -1640,7 +1674,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { // the header PHIs with values coming from the preheader. if (ExitCount->isZero()) { foldExit(L, ExitingBB, true, DeadInsts); - replaceLoopPHINodesWithPreheaderValues(L, DeadInsts); + replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts); Changed = true; continue; } diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 799669a19796..b54cf5e7cb20 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1710,7 +1710,7 @@ IntersectSignedRange(ScalarEvolution &SE, return None; if (!R1) return R2; - auto &R1Value = R1.getValue(); + auto &R1Value = R1.value(); // We never return empty ranges from this function, and R1 is supposed to be // a result of intersection. Thus, R1 is never empty. assert(!R1Value.isEmpty(SE, /* IsSigned */ true) && @@ -1739,7 +1739,7 @@ IntersectUnsignedRange(ScalarEvolution &SE, return None; if (!R1) return R2; - auto &R1Value = R1.getValue(); + auto &R1Value = R1.value(); // We never return empty ranges from this function, and R1 is supposed to be // a result of intersection. Thus, R1 is never empty. 
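The getValue()-to-value() renames that recur throughout this patch track llvm::Optional adopting the std::optional spelling; a minimal usage sketch (the function itself is made up):

#include "llvm/ADT/Optional.h"

int twiceOrZero(llvm::Optional<int> MaybeN) {
  if (!MaybeN)
    return 0;
  return 2 * MaybeN.value(); // previously spelled MaybeN.getValue()
}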
assert(!R1Value.isEmpty(SE, /* IsSigned */ false) && @@ -1950,13 +1950,12 @@ bool InductiveRangeCheckElimination::run( LS.IsSignedPredicate); if (Result) { auto MaybeSafeIterRange = - IntersectRange(SE, SafeIterRange, Result.getValue()); + IntersectRange(SE, SafeIterRange, Result.value()); if (MaybeSafeIterRange) { - assert( - !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && - "We should never return empty ranges!"); + assert(!MaybeSafeIterRange.value().isEmpty(SE, LS.IsSignedPredicate) && + "We should never return empty ranges!"); RangeChecksToEliminate.push_back(IRC); - SafeIterRange = MaybeSafeIterRange.getValue(); + SafeIterRange = MaybeSafeIterRange.value(); } } } @@ -1964,8 +1963,7 @@ bool InductiveRangeCheckElimination::run( if (!SafeIterRange) return false; - LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, - SafeIterRange.getValue()); + LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, SafeIterRange.value()); bool Changed = LC.run(); if (Changed) { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 5caefc422921..b31eab50c5ec 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1459,9 +1459,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { // If the predecessor is an indirect goto, we can't split the edge. - // Same for CallBr. - if (isa<IndirectBrInst>(P->getTerminator()) || - isa<CallBrInst>(P->getTerminator())) + if (isa<IndirectBrInst>(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) @@ -1685,9 +1683,8 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, } // If the predecessor ends with an indirect goto, we can't change its - // destination. Same for CallBr. - if (isa<IndirectBrInst>(Pred->getTerminator()) || - isa<CallBrInst>(Pred->getTerminator())) + // destination. + if (isa<IndirectBrInst>(Pred->getTerminator())) continue; PredToDestList.emplace_back(Pred, DestBB); @@ -1924,10 +1921,9 @@ bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) { } // If any of predecessors end with an indirect goto, we can't change its - // destination. Same for CallBr. + // destination. if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { - return isa<IndirectBrInst>(Pred->getTerminator()) || - isa<CallBrInst>(Pred->getTerminator()); + return isa<IndirectBrInst>(Pred->getTerminator()); })) return false; @@ -2173,6 +2169,9 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, BasicBlock *ZeroPred = nullptr; BasicBlock *OnePred = nullptr; for (BasicBlock *P : predecessors(PredBB)) { + // If PredPred ends with IndirectBrInst, we can't handle it. 
+ if (isa<IndirectBrInst>(P->getTerminator())) + continue; if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>( evaluateOnPredecessorEdge(BB, P, Cond))) { if (CI->isZero()) { diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 492f4e40395a..f54264b1dca6 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1508,8 +1508,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) return false; for (BasicBlock *BBPred : predecessors(BB)) { - if (isa<IndirectBrInst>(BBPred->getTerminator()) || - isa<CallBrInst>(BBPred->getTerminator())) + if (isa<IndirectBrInst>(BBPred->getTerminator())) return false; } return true; diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 03a10cb36bb6..b178bcae3b0e 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -602,7 +602,7 @@ private: : LLVMLoopDistributeFollowupCoincident}); if (PartitionID) { Loop *NewLoop = Part->getDistributedLoop(); - NewLoop->setLoopID(PartitionID.getValue()); + NewLoop->setLoopID(PartitionID.value()); } } }; @@ -826,7 +826,7 @@ public: {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback}, "llvm.loop.distribute.", true) - .getValue(); + .value(); LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 88d6a7aff3c9..d908c151d9f2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1483,7 +1483,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // anything where the alignment isn't at least the element size. assert((StoreAlign && LoadAlign) && "Expect unordered load/store to have align."); - if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize) + if (StoreAlign.value() < StoreSize || LoadAlign.value() < StoreSize) return Changed; // If the element.atomic memcpy is not lowered into explicit @@ -1497,7 +1497,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // Note that unordered atomic loads/stores are *required* by the spec to // have an alignment but non-atomic loads/stores may not. NewCall = Builder.CreateElementUnorderedAtomicMemCpy( - StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(), + StoreBasePtr, StoreAlign.value(), LoadBasePtr, LoadAlign.value(), NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 9959e408e2e2..4ef7809c6681 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5601,27 +5601,6 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, DeadInsts.emplace_back(OperandIsInstr); } -// Check if there are any loop exit values which are only used once within the -// loop which may potentially be optimized with a call to rewriteLoopExitValue. 
-static bool LoopExitValHasSingleUse(Loop *L) { - BasicBlock *ExitBB = L->getExitBlock(); - if (!ExitBB) - return false; - - for (PHINode &ExitPhi : ExitBB->phis()) { - if (ExitPhi.getNumIncomingValues() != 1) - break; - - BasicBlock *Pred = ExitPhi.getIncomingBlock(0); - Value *IVNext = ExitPhi.getIncomingValueForBlock(Pred); - // One use would be the exit phi node, and there should be only one other - // use for this to be considered. - if (IVNext->getNumUses() == 2) - return true; - } - return false; -} - /// Rewrite all the fixup locations with new values, following the chosen /// solution. void LSRInstance::ImplementSolution( @@ -6406,8 +6385,8 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, // less DWARF ops than an iteration count-based expression. if (Optional<APInt> Offset = SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) { - if (Offset.getValue().getMinSignedBits() <= 64) - SalvageExpr->createOffsetExpr(Offset.getValue().getSExtValue(), + if (Offset.value().getMinSignedBits() <= 64) + SalvageExpr->createOffsetExpr(Offset.value().getSExtValue(), LSRInductionVar); } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr, SE)) @@ -6627,12 +6606,12 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // When this is the case, if the exit value of the IV can be calculated using // SCEV, we can replace the exit block PHI with the final value of the IV and // skip the updates in each loop iteration. - if (L->isRecursivelyLCSSAForm(DT, LI) && LoopExitValHasSingleUse(L)) { + if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) { SmallVector<WeakTrackingVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Rewriter(SE, DL, "lsr", false); int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT, - OnlyCheapRepl, DeadInsts); + UnusedIndVarInLoop, DeadInsts); if (Rewrites) { Changed = true; RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 8c2868563227..64fcdfa15aa9 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -373,7 +373,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderInner}); if (NewInnerEpilogueLoopID) - SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); + SubLoop->setLoopID(NewInnerEpilogueLoopID.value()); // Find trip count and trip multiple BasicBlock *Latch = L->getLoopLatch(); @@ -403,14 +403,14 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderOuter}); if (NewOuterEpilogueLoopID) - EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue()); + EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.value()); } Optional<MDNode *> NewInnerLoopID = makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupInner}); if (NewInnerLoopID) - SubLoop->setLoopID(NewInnerLoopID.getValue()); + SubLoop->setLoopID(NewInnerLoopID.value()); else SubLoop->setLoopID(OrigSubLoopID); @@ -419,7 +419,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter}); if (NewOuterLoopID) { - 
L->setLoopID(NewOuterLoopID.getValue()); + L->setLoopID(NewOuterLoopID.value()); // Do not setLoopAlreadyUnrolled if a followup was given. return UnrollResult; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index fda86afe5f9d..de5833f60adc 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1324,7 +1324,7 @@ static LoopUnrollResult tryToUnrollLoop( makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); if (RemainderLoopID) - RemainderLoop->setLoopID(RemainderLoopID.getValue()); + RemainderLoop->setLoopID(RemainderLoopID.value()); } if (UnrollResult != LoopUnrollResult::FullyUnrolled) { @@ -1332,7 +1332,7 @@ static LoopUnrollResult tryToUnrollLoop( makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupUnrolled}); if (NewLoopID) { - L->setLoopID(NewLoopID.getValue()); + L->setLoopID(NewLoopID.value()); // Do not setLoopAlreadyUnrolled if loop attributes have been specified // explicitly. @@ -1645,15 +1645,15 @@ void LoopUnrollPass::printPipeline( OS, MapClassName2PassName); OS << "<"; if (UnrollOpts.AllowPartial != None) - OS << (UnrollOpts.AllowPartial.getValue() ? "" : "no-") << "partial;"; + OS << (UnrollOpts.AllowPartial.value() ? "" : "no-") << "partial;"; if (UnrollOpts.AllowPeeling != None) - OS << (UnrollOpts.AllowPeeling.getValue() ? "" : "no-") << "peeling;"; + OS << (UnrollOpts.AllowPeeling.value() ? "" : "no-") << "peeling;"; if (UnrollOpts.AllowRuntime != None) - OS << (UnrollOpts.AllowRuntime.getValue() ? "" : "no-") << "runtime;"; + OS << (UnrollOpts.AllowRuntime.value() ? "" : "no-") << "runtime;"; if (UnrollOpts.AllowUpperBound != None) - OS << (UnrollOpts.AllowUpperBound.getValue() ? "" : "no-") << "upperbound;"; + OS << (UnrollOpts.AllowUpperBound.value() ? "" : "no-") << "upperbound;"; if (UnrollOpts.AllowProfileBasedPeeling != None) - OS << (UnrollOpts.AllowProfileBasedPeeling.getValue() ? "" : "no-") + OS << (UnrollOpts.AllowProfileBasedPeeling.value() ? "" : "no-") << "profile-peeling;"; if (UnrollOpts.FullUnrollMaxCount != None) OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";"; diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index da1737979305..75f0896d4845 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" @@ -1929,11 +1930,23 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. + const DataLayout &DL = I->getModule()->getDataLayout(); Constant *Cst = nullptr; unsigned Opcode = I->getOpcode(); - while (!Ops.empty() && isa<Constant>(Ops.back().Op)) { - Constant *C = cast<Constant>(Ops.pop_back_val().Op); - Cst = Cst ? 
ConstantExpr::get(Opcode, C, Cst) : C; + while (!Ops.empty()) { + if (auto *C = dyn_cast<Constant>(Ops.back().Op)) { + if (!Cst) { + Ops.pop_back(); + Cst = C; + continue; + } + if (Constant *Res = ConstantFoldBinaryOpOperands(Opcode, C, Cst, DL)) { + Ops.pop_back(); + Cst = Res; + continue; + } + } + break; } // If there was nothing but constants then we are done. if (Ops.empty()) diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index e9983ff82176..079b2fc973b9 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -769,8 +769,7 @@ llvm::SplitAllCriticalEdges(Function &F, unsigned NumBroken = 0; for (BasicBlock &BB : F) { Instruction *TI = BB.getTerminator(); - if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) && - !isa<CallBrInst>(TI)) + if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (SplitCriticalEdge(TI, i, Options)) ++NumBroken; @@ -1132,9 +1131,7 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, // all BlockAddress uses would need to be updated. assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && "Cannot split an edge from an IndirectBrInst"); - assert(!isa<CallBrInst>(Preds[i]->getTerminator()) && - "Cannot split an edge from a CallBrInst"); - Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB); + Preds[i]->getTerminator()->replaceSuccessorWith(BB, NewBB); } // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 0b36e8708a03..9c595401ce29 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -129,8 +129,7 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, SmallVector<BasicBlock *, 4> LoopPreds; // Check if extra modifications will be required to preserve loop-simplify // form after splitting. If it would require splitting blocks with IndirectBr - // or CallBr terminators, bail out if preserving loop-simplify form is - // requested. + // terminators, bail out if preserving loop-simplify form is requested. if (LI) { if (Loop *TIL = LI->getLoopFor(TIBB)) { @@ -156,10 +155,7 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // Loop-simplify form can be preserved, if we can split all in-loop // predecessors. 
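A standalone sketch of the constant-accumulation loop introduced in Reassociate above (the helper name foldTrailingConstants is hypothetical): trailing constants are folded pairwise with ConstantFoldBinaryOpOperands, stopping at the first pair that does not fold instead of building a constant expression.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
using namespace llvm;

static Constant *foldTrailingConstants(unsigned Opcode,
                                       SmallVectorImpl<Constant *> &Ops,
                                       const DataLayout &DL) {
  Constant *Acc = nullptr;
  while (!Ops.empty()) {
    Constant *C = Ops.pop_back_val();
    if (!Acc) {
      Acc = C;
      continue;
    }
    if (Constant *Res = ConstantFoldBinaryOpOperands(Opcode, C, Acc, DL)) {
      Acc = Res;
      continue;
    }
    Ops.push_back(C); // could not fold this pair; put it back and stop
    break;
  }
  return Acc; // null if the list held no constants at all
}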
if (any_of(LoopPreds, [](BasicBlock *Pred) { - const Instruction *T = Pred->getTerminator(); - if (const auto *CBR = dyn_cast<CallBrInst>(T)) - return CBR->getDefaultDest() != Pred; - return isa<IndirectBrInst>(T); + return isa<IndirectBrInst>(Pred->getTerminator()); })) { if (Options.PreserveLoopSimplify) return nullptr; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f94d854f7ee8..421f1f329f07 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -927,6 +927,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::AlwaysInline: case Attribute::Cold: case Attribute::DisableSanitizerInstrumentation: + case Attribute::FnRetThunkExtern: case Attribute::Hot: case Attribute::NoRecurse: case Attribute::InlineHint: @@ -1777,7 +1778,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC, auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency()); if (Count) newFunction->setEntryCount( - ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME + ProfileCount(Count.value(), Function::PCT_Real)); // FIXME BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency()); } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 205f7a7d9ed2..24126b5ab67b 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -961,8 +961,13 @@ createDebugifyFunctionPass(enum DebugifyMode Mode, } PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - applyDebugifyMetadata(M, M.functions(), - "ModuleDebugify: ", /*ApplyToMF*/ nullptr); + if (Mode == DebugifyMode::SyntheticDebugInfo) + applyDebugifyMetadata(M, M.functions(), + "ModuleDebugify: ", /*ApplyToMF*/ nullptr); + else + collectDebugInfoMetadata(M, M.functions(), *DebugInfoBeforePass, + "ModuleDebugify (original debuginfo)", + NameOfWrappedPass); return PreservedAnalyses::all(); } @@ -992,8 +997,14 @@ FunctionPass *createCheckDebugifyFunctionPass( PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false, - nullptr); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, + "CheckModuleDebugify", Strip, StatsMap); + else + checkDebugInfoMetadata( + M, M.functions(), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", NameOfWrappedPass, + OrigDIVerifyBugsReportFilePath); return PreservedAnalyses::all(); } @@ -1006,13 +1017,15 @@ static bool isIgnoredPass(StringRef PassID) { void DebugifyEachInstrumentation::registerCallbacks( PassInstrumentationCallbacks &PIC) { - PIC.registerBeforeNonSkippedPassCallback([](StringRef P, Any IR) { + PIC.registerBeforeNonSkippedPassCallback([this](StringRef P, Any IR) { if (isIgnoredPass(P)) return; if (any_isa<const Function *>(IR)) - applyDebugify(*const_cast<Function *>(any_cast<const Function *>(IR))); + applyDebugify(*const_cast<Function *>(any_cast<const Function *>(IR)), + Mode, DebugInfoBeforePass, P); else if (any_isa<const Module *>(IR)) - applyDebugify(*const_cast<Module *>(any_cast<const Module *>(IR))); + applyDebugify(*const_cast<Module *>(any_cast<const Module *>(IR)), + Mode, DebugInfoBeforePass, P); }); PIC.registerAfterPassCallback([this](StringRef P, Any IR, const PreservedAnalyses &PassPA) { @@ -1022,12 +1035,24 @@ void 
DebugifyEachInstrumentation::registerCallbacks( auto &F = *const_cast<Function *>(any_cast<const Function *>(IR)); Module &M = *F.getParent(); auto It = F.getIterator(); - checkDebugifyMetadata(M, make_range(It, std::next(It)), P, - "CheckFunctionDebugify", /*Strip=*/true, &StatsMap); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, make_range(It, std::next(It)), P, + "CheckFunctionDebugify", /*Strip=*/true, DIStatsMap); + else + checkDebugInfoMetadata( + M, make_range(It, std::next(It)), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", + P, OrigDIVerifyBugsReportFilePath); } else if (any_isa<const Module *>(IR)) { auto &M = *const_cast<Module *>(any_cast<const Module *>(IR)); - checkDebugifyMetadata(M, M.functions(), P, "CheckModuleDebugify", - /*Strip=*/true, &StatsMap); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, M.functions(), P, "CheckModuleDebugify", + /*Strip=*/true, DIStatsMap); + else + checkDebugInfoMetadata( + M, M.functions(), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", + P, OrigDIVerifyBugsReportFilePath); } }); } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index cd3b6c1a095a..023a0afd329b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -402,7 +402,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Optional<MDNode *> NewLoopID = makeFollowupLoopID( LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); if (NewLoopID) { - NewLoop->setLoopID(NewLoopID.getValue()); + NewLoop->setLoopID(NewLoopID.value()); // Do not setLoopAlreadyUnrolled if loop attributes have been defined // explicitly. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index ec898c463574..82f993b4ceab 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -75,9 +75,6 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, if (isa<IndirectBrInst>(PredBB->getTerminator())) // We cannot rewrite exiting edges from an indirectbr. return false; - if (isa<CallBrInst>(PredBB->getTerminator())) - // We cannot rewrite exiting edges from a callbr. - return false; InLoopPredecessors.push_back(PredBB); } else { @@ -359,7 +356,7 @@ TransformationMode llvm::hasUnrollTransformation(const Loop *L) { Optional<int> Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count"); if (Count) - return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + return Count.value() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable")) return TM_ForcedByUser; @@ -380,7 +377,7 @@ TransformationMode llvm::hasUnrollAndJamTransformation(const Loop *L) { Optional<int> Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count"); if (Count) - return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + return Count.value() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable")) return TM_ForcedByUser; @@ -1246,6 +1243,20 @@ static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) return true; } +/// Checks if it is safe to call InductionDescriptor::isInductionPHI for \p Phi, +/// and returns true if this Phi is an induction phi in the loop. 
When +/// isInductionPHI returns true, \p ID will be also be set by isInductionPHI. +static bool checkIsIndPhi(PHINode *Phi, Loop *L, ScalarEvolution *SE, + InductionDescriptor &ID) { + if (!Phi) + return false; + if (!L->getLoopPreheader()) + return false; + if (Phi->getParent() != L->getHeader()) + return false; + return InductionDescriptor::isInductionPHI(Phi, L, SE, ID); +} + int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, @@ -1297,6 +1308,46 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, if (!L->contains(Inst)) continue; + // Find exit values which are induction variables in the loop, and are + // unused in the loop, with the only use being the exit block PhiNode, + // and the induction variable update binary operator. + // The exit value can be replaced with the final value when it is cheap + // to do so. + if (ReplaceExitValue == UnusedIndVarInLoop) { + InductionDescriptor ID; + PHINode *IndPhi = dyn_cast<PHINode>(Inst); + if (IndPhi) { + if (!checkIsIndPhi(IndPhi, L, SE, ID)) + continue; + // This is an induction PHI. Check that the only users are PHI + // nodes, and induction variable update binary operators. + if (llvm::any_of(Inst->users(), [&](User *U) { + if (!isa<PHINode>(U) && !isa<BinaryOperator>(U)) + return true; + BinaryOperator *B = dyn_cast<BinaryOperator>(U); + if (B && B != ID.getInductionBinOp()) + return true; + return false; + })) + continue; + } else { + // If it is not an induction phi, it must be an induction update + // binary operator with an induction phi user. + BinaryOperator *B = dyn_cast<BinaryOperator>(Inst); + if (!B) + continue; + if (llvm::any_of(Inst->users(), [&](User *U) { + PHINode *Phi = dyn_cast<PHINode>(U); + if (Phi != PN && !checkIsIndPhi(Phi, L, SE, ID)) + return true; + return false; + })) + continue; + if (B != ID.getInductionBinOp()) + continue; + } + } + // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. We prefer to start with @@ -1362,7 +1413,9 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, // Only do the rewrite when the ExitValue can be expanded cheaply. // If LoopCanBeDel is true, rewrite exit value aggressively. 
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) + if ((ReplaceExitValue == OnlyCheapRepl || + ReplaceExitValue == UnusedIndVarInLoop) && + !LoopCanBeDel && Phi.HighCost) continue; Value *ExitVal = Rewriter.expandCodeFor( diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp index 8641581c8039..9914a5ca6c5e 100644 --- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -74,6 +74,10 @@ Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op, return Builder.CreateFAdd(Loaded, Inc, "new"); case AtomicRMWInst::FSub: return Builder.CreateFSub(Loaded, Inc, "new"); + case AtomicRMWInst::FMax: + return Builder.CreateMaxNum(Loaded, Inc); + case AtomicRMWInst::FMin: + return Builder.CreateMinNum(Loaded, Inc); default: llvm_unreachable("Unknown atomic op"); } diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp index b73d68ebec7c..4414b04c7264 100644 --- a/llvm/lib/Transforms/Utils/MisExpect.cpp +++ b/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -221,7 +221,7 @@ void checkBackendInstrumentation(Instruction &I, auto ExpectedWeightsOpt = extractWeights(&I, I.getContext()); if (!ExpectedWeightsOpt) return; - auto ExpectedWeights = ExpectedWeightsOpt.getValue(); + auto ExpectedWeights = ExpectedWeightsOpt.value(); verifyMisExpect(I, RealWeights, ExpectedWeights); } @@ -230,7 +230,7 @@ void checkFrontendInstrumentation(Instruction &I, auto RealWeightsOpt = extractWeights(&I, I.getContext()); if (!RealWeightsOpt) return; - auto RealWeights = RealWeightsOpt.getValue(); + auto RealWeights = RealWeightsOpt.value(); verifyMisExpect(I, RealWeights, ExpectedWeights); } diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 5120ade70e16..9e1492b97a86 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -255,7 +255,7 @@ void VFABI::setVectorVariantNames(CallInst *CI, LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n"); Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M); assert(VI && "Cannot add an invalid VFABI name."); - assert(M->getNamedValue(VI.getValue().VectorName) && + assert(M->getNamedValue(VI.value().VectorName) && "Cannot add variant to attribute: " "vector function declaration is missing."); } @@ -275,5 +275,13 @@ void llvm::embedBufferInModule(Module &M, MemoryBufferRef Buf, GV->setSection(SectionName); GV->setAlignment(Alignment); + LLVMContext &Ctx = M.getContext(); + NamedMDNode *MD = M.getOrInsertNamedMetadata("llvm.embedded.objects"); + Metadata *MDVals[] = {ConstantAsMetadata::get(GV), + MDString::get(Ctx, SectionName)}; + + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); + GV->setMetadata(LLVMContext::MD_exclude, llvm::MDNode::get(Ctx, {})); + appendToCompilerUsed(M, GV); } diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index aff692b36288..bec1db896efb 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -488,31 +488,33 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoresByIndex, std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)), less_first()); + Value *ReplVal; if (I == StoresByIndex.begin()) { if (StoresByIndex.empty()) // If there are no stores, the load takes the undef value. 
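The AtomicRMWInst::FMax/FMin cases added to buildAtomicRMWValue above lower to llvm.maxnum/llvm.minnum, whose NaN behaviour matches libm fmax/fmin; a small standard-C++ illustration of that semantics:

#include <cassert>
#include <cmath>
int main() {
  double Loaded = 2.0;
  double Inc = std::nan(""); // quiet NaN
  // maxnum/minnum return the non-NaN operand when exactly one input is NaN.
  assert(std::fmax(Loaded, Inc) == 2.0);
  assert(std::fmin(Loaded, Inc) == 2.0);
  return 0;
}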
- LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + ReplVal = UndefValue::get(LI->getType()); else // There is no store before this load, bail out (load may be affected // by the following stores - see main comment). return false; } else { - // Otherwise, there was a store before this load, the load takes its value. - // Note, if the load was marked as nonnull we don't want to lose that - // information when we erase it. So we preserve it with an assume. - Value *ReplVal = std::prev(I)->second->getOperand(0); - if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && - !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT)) - addAssumeNonNull(AC, LI); + // Otherwise, there was a store before this load, the load takes its + // value. + ReplVal = std::prev(I)->second->getOperand(0); + } - // If the replacement value is the load, this must occur in unreachable - // code. - if (ReplVal == LI) - ReplVal = PoisonValue::get(LI->getType()); + // Note, if the load was marked as nonnull we don't want to lose that + // information when we erase it. So we preserve it with an assume. + if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && + !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT)) + addAssumeNonNull(AC, LI); - LI->replaceAllUsesWith(ReplVal); - } + // If the replacement value is the load, this must occur in unreachable + // code. + if (ReplVal == LI) + ReplVal = PoisonValue::get(LI->getType()); + LI->replaceAllUsesWith(ReplVal); LI->eraseFromParent(); LBI.deleteValue(LI); } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index eee91e70292e..09a83f1ea094 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -208,8 +208,6 @@ private: if (!Elt) LV.markOverdefined(); // Unknown sort of constant. - else if (isa<UndefValue>(Elt)) - ; // Undef values remain unknown. else LV.markConstant(Elt); // Constants are constant. } @@ -356,8 +354,7 @@ public: // We only track the contents of scalar globals. if (GV->getValueType()->isSingleValueType()) { ValueLatticeElement &IV = TrackedGlobals[GV]; - if (!isa<UndefValue>(GV->getInitializer())) - IV.markConstant(GV->getInitializer()); + IV.markConstant(GV->getInitializer()); } } @@ -822,9 +819,6 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (Constant *OpC = getConstant(OpSt)) { // Fold the constant as we build. Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL); - if (isa<UndefValue>(C)) - return; - // Propagate constant value markConstant(&I, C); } else if (I.getDestTy()->isIntegerTy()) { auto &LV = getValueState(&I); @@ -959,19 +953,15 @@ void SCCPInstVisitor::visitUnaryOperator(Instruction &I) { if (isOverdefined(IV)) return (void)markOverdefined(&I); - if (isConstant(V0State)) { - Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State)); - - // op Y -> undef. - if (isa<UndefValue>(C)) - return; - return (void)markConstant(IV, &I, C); - } - - // If something is undef, wait for it to resolve. - if (!isOverdefined(V0State)) + // If something is unknown/undef, wait for it to resolve. + if (V0State.isUnknownOrUndef()) return; + if (isConstant(V0State)) + if (Constant *C = ConstantFoldUnaryOpOperand(I.getOpcode(), + getConstant(V0State), DL)) + return (void)markConstant(IV, &I, C); + markOverdefined(&I); } @@ -999,9 +989,6 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL)); auto *C = dyn_cast_or_null<Constant>(R); if (C) { - // X op Y -> undef. 
- if (isa<UndefValue>(C)) - return; // Conservatively assume that the result may be based on operands that may // be undef. Note that we use mergeInValue to combine the constant with // the existing lattice value for I, as different constants might be found @@ -1050,6 +1037,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State); if (C) { + // TODO: getCompare() currently has incorrect handling for unknown/undef. if (isa<UndefValue>(C)) return; ValueLatticeElement CV; @@ -1095,8 +1083,6 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end()); Constant *C = ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices); - if (isa<UndefValue>(C)) - return; markConstant(&I, C); } @@ -1174,11 +1160,8 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) { - if (isa<UndefValue>(C)) - return; + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) return (void)markConstant(IV, &I, C); - } } // Fall back to metadata. @@ -1223,12 +1206,8 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) { - // call -> undef. - if (isa<UndefValue>(C)) - return; + if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) return (void)markConstant(&CB, C); - } } // Fall back to metadata. diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 401f1ee5a55d..0c8bf3827256 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -220,7 +220,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, // Fold a binop with constant operands. if (Constant *CLHS = dyn_cast<Constant>(LHS)) if (Constant *CRHS = dyn_cast<Constant>(RHS)) - return ConstantExpr::get(Opcode, CLHS, CRHS); + if (Constant *Res = ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, DL)) + return Res; // Do a quick scan to see if we have this binop nearby. If so, reuse it. unsigned ScanLimit = 6; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 567b866f7777..4b5ade99767b 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -377,18 +377,12 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, /// expensive. static InstructionCost computeSpeculationCost(const User *I, const TargetTransformInfo &TTI) { - assert(isSafeToSpeculativelyExecute(I) && + assert((!isa<Instruction>(I) || + isSafeToSpeculativelyExecute(cast<Instruction>(I))) && "Instruction is not safe to speculatively execute!"); return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency); } -/// Check whether this is a potentially trapping constant. -static bool canTrap(const Value *V) { - if (auto *C = dyn_cast<Constant>(V)) - return C->canTrap(); - return false; -} - /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. 
We /// don't handle the true generality of domination here, just a special case @@ -421,9 +415,9 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *I = dyn_cast<Instruction>(V); if (!I) { - // Non-instructions all dominate instructions, but not all constantexprs - // can be executed unconditionally. - return !canTrap(V); + // Non-instructions dominate all instructions and can be executed + // unconditionally. + return true; } BasicBlock *PBB = I->getParent(); @@ -1473,10 +1467,7 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, while (isa<DbgInfoIntrinsic>(I2)) I2 = &*BB2_Itr++; } - // FIXME: Can we define a safety predicate for CallBr? - if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) || - (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) || - isa<CallBrInst>(I1)) + if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2)) return false; BasicBlock *BIParent = BI->getParent(); @@ -1609,11 +1600,6 @@ HoistTerminator: if (passingValueIsAlwaysUndefined(BB1V, &PN) || passingValueIsAlwaysUndefined(BB2V, &PN)) return Changed; - - if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) - return Changed; - if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) - return Changed; } } @@ -2679,9 +2665,6 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB, passingValueIsAlwaysUndefined(ThenV, &PN)) return false; - if (canTrap(OrigV) || canTrap(ThenV)) - return false; - HaveRewritablePHIs = true; ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV); ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV); @@ -2979,10 +2962,8 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { return true; } -static ConstantInt * -getKnownValueOnEdge(Value *V, BasicBlock *From, BasicBlock *To, - SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, - ConstantInt *> &Visited) { +static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From, + BasicBlock *To) { // Don't look past the block defining the value, we might get the value from // a previous loop iteration. auto *I = dyn_cast<Instruction>(V); @@ -2996,23 +2977,7 @@ getKnownValueOnEdge(Value *V, BasicBlock *From, BasicBlock *To, return BI->getSuccessor(0) == To ? ConstantInt::getTrue(BI->getContext()) : ConstantInt::getFalse(BI->getContext()); - // Limit the amount of blocks we inspect. - if (Visited.size() >= 8) - return nullptr; - - auto Pair = Visited.try_emplace({From, To}, nullptr); - if (!Pair.second) - return Pair.first->second; - - // Check whether the known value is the same for all predecessors. 
- ConstantInt *Common = nullptr; - for (BasicBlock *Pred : predecessors(From)) { - ConstantInt *C = getKnownValueOnEdge(V, Pred, From, Visited); - if (!C || (Common && Common != C)) - return nullptr; - Common = C; - } - return Visited[{From, To}] = Common; + return nullptr; } /// If we have a conditional branch on something for which we know the constant @@ -3022,7 +2987,7 @@ static Optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL, AssumptionCache *AC) { - SmallMapVector<BasicBlock *, ConstantInt *, 8> KnownValues; + SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues; BasicBlock *BB = BI->getParent(); Value *Cond = BI->getCondition(); PHINode *PN = dyn_cast<PHINode>(Cond); @@ -3035,12 +3000,11 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, for (Use &U : PN->incoming_values()) if (auto *CB = dyn_cast<ConstantInt>(U)) - KnownValues.insert({PN->getIncomingBlock(U), CB}); + KnownValues[CB].insert(PN->getIncomingBlock(U)); } else { - SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, ConstantInt *> Visited; for (BasicBlock *Pred : predecessors(BB)) { - if (ConstantInt *CB = getKnownValueOnEdge(Cond, Pred, BB, Visited)) - KnownValues.insert({Pred, CB}); + if (ConstantInt *CB = getKnownValueOnEdge(Cond, Pred, BB)) + KnownValues[CB].insert(Pred); } } @@ -3056,29 +3020,34 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, for (const auto &Pair : KnownValues) { // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. - ConstantInt *CB = Pair.second; - BasicBlock *PredBB = Pair.first; + ConstantInt *CB = Pair.first; + ArrayRef<BasicBlock *> PredBBs = Pair.second.getArrayRef(); BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); if (RealDest == BB) continue; // Skip self loops. + // Skip if the predecessor's terminator is an indirect branch. - if (isa<IndirectBrInst>(PredBB->getTerminator())) + if (any_of(PredBBs, [](BasicBlock *PredBB) { + return isa<IndirectBrInst>(PredBB->getTerminator()); + })) continue; - SmallVector<DominatorTree::UpdateType, 3> Updates; + LLVM_DEBUG({ + dbgs() << "Condition " << *Cond << " in " << BB->getName() + << " has value " << *Pair.first << " in predecessors:\n"; + for (const BasicBlock *PredBB : Pair.second) + dbgs() << " " << PredBB->getName() << "\n"; + dbgs() << "Threading to destination " << RealDest->getName() << ".\n"; + }); + + // Split the predecessors we are threading into a new edge block. We'll + // clone the instructions into this block, and then redirect it to RealDest. + BasicBlock *EdgeBB = SplitBlockPredecessors(BB, PredBBs, ".critedge", DTU); - // The dest block might have PHI nodes, other predecessors and other - // difficult cases. Instead of being smart about this, just insert a new - // block that jumps to the destination block, effectively splitting - // the edge we are about to create. - BasicBlock *EdgeBB = - BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge", - RealDest->getParent(), RealDest); - BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB); - if (DTU) - Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest}); - CritEdgeBranch->setDebugLoc(BI->getDebugLoc()); + // TODO: These just exist to reduce test diff, we can drop them if we like. + EdgeBB->setName(RealDest->getName() + ".critedge"); + EdgeBB->moveBefore(RealDest); // Update PHI nodes. 
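For readers following the FoldCondBranchOnValueKnownInPredecessor rework here, a purely illustrative source-level before/after (not part of the patch) of what threading a branch over a known condition buys:

// Before threading: every path re-tests `flag` in the join block.
int before(bool flag, int a, int b) {
  int r;
  if (flag)
    r = a + 1;   // on this edge `flag` is known true ...
  else
    r = b + 1;   // ... and here it is known false
  if (flag)      // so this re-test is redundant on both edges
    return r * 2;
  return r * 3;
}

// After threading each predecessor straight to the successor that is known
// for its edge, the shared re-test disappears.
int after(bool flag, int a, int b) {
  if (flag)
    return (a + 1) * 2;
  return (b + 1) * 3;
}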
AddPredecessorToBlock(RealDest, EdgeBB, BB); @@ -3086,12 +3055,12 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, // BB may have instructions that are being threaded over. Clone these // instructions into EdgeBB. We know that there will be no uses of the // cloned instructions outside of EdgeBB. - BasicBlock::iterator InsertPt = EdgeBB->begin(); + BasicBlock::iterator InsertPt = EdgeBB->getFirstInsertionPt(); DenseMap<Value *, Value *> TranslateMap; // Track translated values. - TranslateMap[Cond] = Pair.second; + TranslateMap[Cond] = CB; for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (PHINode *PN = dyn_cast<PHINode>(BBI)) { - TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB); + TranslateMap[PN] = PN->getIncomingValueForBlock(EdgeBB); continue; } // Clone the instruction. @@ -3129,19 +3098,15 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, } } - // Loop over all of the edges from PredBB to BB, changing them to branch - // to EdgeBB instead. - Instruction *PredBBTI = PredBB->getTerminator(); - for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i) - if (PredBBTI->getSuccessor(i) == BB) { - BB->removePredecessor(PredBB); - PredBBTI->setSuccessor(i, EdgeBB); - } + BB->removePredecessor(EdgeBB); + BranchInst *EdgeBI = cast<BranchInst>(EdgeBB->getTerminator()); + EdgeBI->setSuccessor(0, RealDest); + EdgeBI->setDebugLoc(BI->getDebugLoc()); if (DTU) { - Updates.push_back({DominatorTree::Insert, PredBB, EdgeBB}); - Updates.push_back({DominatorTree::Delete, PredBB, BB}); - + SmallVector<DominatorTree::UpdateType, 2> Updates; + Updates.push_back({DominatorTree::Delete, EdgeBB, BB}); + Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest}); DTU->applyUpdates(Updates); } @@ -3599,13 +3564,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, Cond->getParent() != BB || !Cond->hasOneUse()) return false; - // Cond is known to be a compare or binary operator. Check to make sure that - // neither operand is a potentially-trapping constant expression. - if (canTrap(Cond->getOperand(0))) - return false; - if (canTrap(Cond->getOperand(1))) - return false; - // Finally, don't infinitely unroll conditional loops. if (is_contained(successors(BB), BB)) return false; @@ -4113,9 +4071,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (tryWidenCondBranchToCondBranch(PBI, BI, DTU)) return true; - if (canTrap(BI->getCondition())) - return false; - // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. @@ -4157,10 +4112,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // insertion of a large number of select instructions. For targets // without predication/cmovs, this is a big pessimization. - // Also do not perform this transformation if any phi node in the common - // destination block can trap when reached by BB or PBB (PR17073). In that - // case, it would be unsafe to hoist the operation into a select instruction. - BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); BasicBlock *RemovedDest = PBI->getSuccessor(PBIOp ^ 1); unsigned NumPhis = 0; @@ -4168,16 +4119,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, ++II, ++NumPhis) { if (NumPhis > 2) // Disable this xform. 
return false; - - PHINode *PN = cast<PHINode>(II); - Value *BIV = PN->getIncomingValueForBlock(BB); - if (canTrap(BIV)) - return false; - - unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent()); - Value *PBIV = PN->getIncomingValue(PBBIdx); - if (canTrap(PBIV)) - return false; } // Finally, if everything is ok, fold the branches to logical ops. @@ -6174,6 +6115,23 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, return isSwitchDense(SI->getNumCases(), TableSize); } +static bool ShouldUseSwitchConditionAsTableIndex( + ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal, + bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes, + const DataLayout &DL, const TargetTransformInfo &TTI) { + if (MinCaseVal.isNullValue()) + return true; + if (MinCaseVal.isNegative() || + MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() || + !HasDefaultResults) + return false; + return all_of(ResultTypes, [&](const auto &KV) { + return SwitchLookupTable::WouldFitInRegister( + DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, + KV.second /* ResultType */); + }); +} + /// Try to reuse the switch table index compare. Following pattern: /// \code /// if (idx < tablesize) @@ -6329,9 +6287,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, } uint64_t NumResults = ResultLists[PHIs[0]].size(); - APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); - uint64_t TableSize = RangeSpread.getLimitedValue() + 1; - bool TableHasHoles = (NumResults < TableSize); // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. @@ -6340,6 +6295,22 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL, TTI); + for (const auto &I : DefaultResultsList) { + PHINode *PHI = I.first; + Constant *Result = I.second; + DefaultResults[PHI] = Result; + } + + bool UseSwitchConditionAsTableIndex = ShouldUseSwitchConditionAsTableIndex( + *MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI); + uint64_t TableSize; + if (UseSwitchConditionAsTableIndex) + TableSize = MaxCaseVal->getLimitedValue() + 1; + else + TableSize = + (MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1; + + bool TableHasHoles = (NumResults < TableSize); bool NeedMask = (TableHasHoles && !HasDefaultResults); if (NeedMask) { // As an extra penalty for the validity test we require more cases. @@ -6349,12 +6320,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, return false; } - for (const auto &I : DefaultResultsList) { - PHINode *PHI = I.first; - Constant *Result = I.second; - DefaultResults[PHI] = Result; - } - if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; @@ -6368,11 +6333,15 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Compute the table index value. 
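The new ShouldUseSwitchConditionAsTableIndex helper above decides when the lookup table can be indexed by the switch condition itself: when the smallest case value is already zero, or when the case values are non-negative, a constant default result exists, and every result type would still fit in a register for a table of size max+1. A rough source-level picture (illustrative C++ only, not generated code):

#include <cstdint>

// Original switch with a smallest case value of 2.
uint32_t classify(uint32_t x) {
  switch (x) {
  case 2:  return 10;
  case 3:  return 20;
  case 4:  return 30;
  default: return 0;
  }
}

// Table of size max+1 = 5, indexed by `x` directly; the holes at 0 and 1 are
// filled with the default result, so no `x - 2` adjustment is needed.
uint32_t classifyTable(uint32_t x) {
  static const uint32_t Table[5] = {0, 0, 10, 20, 30};
  return x < 5 ? Table[x] : 0;
}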
Builder.SetInsertPoint(SI); Value *TableIndex; - if (MinCaseVal->isNullValue()) + ConstantInt *TableIndexOffset; + if (UseSwitchConditionAsTableIndex) { + TableIndexOffset = ConstantInt::get(MaxCaseVal->getType(), 0); TableIndex = SI->getCondition(); - else - TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal, - "switch.tableidx"); + } else { + TableIndexOffset = MinCaseVal; + TableIndex = + Builder.CreateSub(SI->getCondition(), TableIndexOffset, "switch.tableidx"); + } // Compute the maximum table size representable by the integer type we are // switching upon. @@ -6424,7 +6393,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Build bitmask; fill in a 1 bit for every case. const ResultListTy &ResultList = ResultLists[PHIs[0]]; for (size_t I = 0, E = ResultList.size(); I != E; ++I) { - uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue()) + uint64_t Idx = (ResultList[I].first->getValue() - TableIndexOffset->getValue()) .getLimitedValue(); MaskInt |= One << Idx; } @@ -6463,8 +6432,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If using a bitmask, use any value to fill the lookup table holes. Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI]; StringRef FuncName = Fn->getName(); - SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL, - FuncName); + SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, + DL, FuncName); Value *Result = Table.BuildLookup(TableIndex, Builder); diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index dbef1ff2e739..af15e0c31b75 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -79,21 +79,23 @@ namespace { bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); bool replaceIVUserWithLoopInvariant(Instruction *UseInst); + bool replaceFloatIVWithIntegerIV(Instruction *UseInst); bool eliminateOverflowIntrinsic(WithOverflowInst *WO); bool eliminateSaturatingIntrinsic(SaturatingInst *SI); bool eliminateTrunc(TruncInst *TI); bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand); - void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand, + bool makeIVComparisonInvariant(ICmpInst *ICmp, Instruction *IVOperand); + void eliminateIVComparison(ICmpInst *ICmp, Instruction *IVOperand); + void simplifyIVRemainder(BinaryOperator *Rem, Instruction *IVOperand, bool IsSigned); void replaceRemWithNumerator(BinaryOperator *Rem); void replaceRemWithNumeratorOrZero(BinaryOperator *Rem); void replaceSRemWithURem(BinaryOperator *Rem); bool eliminateSDiv(BinaryOperator *SDiv); - bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand); - bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand); + bool strengthenOverflowingOperation(BinaryOperator *OBO, + Instruction *IVOperand); + bool strengthenRightShift(BinaryOperator *BO, Instruction *IVOperand); }; } @@ -192,7 +194,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) } bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp, - Value *IVOperand) { + Instruction *IVOperand) { unsigned IVOperIdx = 0; ICmpInst::Predicate Pred = ICmp->getPredicate(); if (IVOperand != ICmp->getOperand(0)) { @@ -261,7 +263,8 @@ bool 
SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp, /// SimplifyIVUsers helper for eliminating useless /// comparisons against an induction variable. -void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { +void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, + Instruction *IVOperand) { unsigned IVOperIdx = 0; ICmpInst::Predicate Pred = ICmp->getPredicate(); ICmpInst::Predicate OriginalPred = Pred; @@ -372,7 +375,8 @@ void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) { /// SimplifyIVUsers helper for eliminating useless remainder operations /// operating on an induction variable or replacing srem by urem. -void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand, +void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, + Instruction *IVOperand, bool IsSigned) { auto *NValue = Rem->getOperand(0); auto *DValue = Rem->getOperand(1); @@ -673,6 +677,35 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) { return true; } +/// Eliminate redundant type cast between integer and float. +bool SimplifyIndvar::replaceFloatIVWithIntegerIV(Instruction *UseInst) { + if (UseInst->getOpcode() != CastInst::SIToFP) + return false; + + Value *IVOperand = UseInst->getOperand(0); + // Get the symbolic expression for this instruction. + ConstantRange IVRange = SE->getSignedRange(SE->getSCEV(IVOperand)); + unsigned DestNumSigBits = UseInst->getType()->getFPMantissaWidth(); + if (IVRange.getActiveBits() <= DestNumSigBits) { + for (User *U : UseInst->users()) { + // Match for fptosi of sitofp and with same type. + auto *CI = dyn_cast<FPToSIInst>(U); + if (!CI || IVOperand->getType() != CI->getType()) + continue; + + CI->replaceAllUsesWith(IVOperand); + DeadInsts.push_back(CI); + LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *CI + << " with: " << *IVOperand << '\n'); + + ++NumFoldedUser; + Changed = true; + } + } + + return Changed; +} + /// Eliminate any operation that SCEV can prove is an identity function. bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand) { @@ -718,18 +751,16 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, /// Annotate BO with nsw / nuw if it provably does not signed-overflow / /// unsigned-overflow. Returns true if anything changed, false otherwise. bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, - Value *IVOperand) { - SCEV::NoWrapFlags Flags; - bool Deduced; - std::tie(Flags, Deduced) = SE->getStrengthenedNoWrapFlagsFromBinOp( + Instruction *IVOperand) { + auto Flags = SE->getStrengthenedNoWrapFlagsFromBinOp( cast<OverflowingBinaryOperator>(BO)); - if (!Deduced) - return Deduced; + if (!Flags) + return false; - BO->setHasNoUnsignedWrap(ScalarEvolution::maskFlags(Flags, SCEV::FlagNUW) == + BO->setHasNoUnsignedWrap(ScalarEvolution::maskFlags(*Flags, SCEV::FlagNUW) == SCEV::FlagNUW); - BO->setHasNoSignedWrap(ScalarEvolution::maskFlags(Flags, SCEV::FlagNSW) == + BO->setHasNoSignedWrap(ScalarEvolution::maskFlags(*Flags, SCEV::FlagNSW) == SCEV::FlagNSW); // The getStrengthenedNoWrapFlagsFromBinOp() check inferred additional nowrap @@ -737,14 +768,14 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, // forgetValue() here to make sure those flags also propagate to any other // SCEV expressions based on the addrec. However, this can have pathological // compile-time impact, see https://bugs.llvm.org/show_bug.cgi?id=50384. 
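The replaceFloatIVWithIntegerIV helper added above removes sitofp/fptosi round-trips of an induction variable once its signed range is known to fit in the destination's mantissa. A hedged source-level illustration (the pass works on IR, and the 1 << 20 bound is just an example comfortably inside float's 24 significant bits):

int sumRoundTrip(int n) {
  int sum = 0;
  for (int i = 0; i < n && i < (1 << 20); ++i) {
    float f = static_cast<float>(i);   // sitofp i32 -> float
    int j = static_cast<int>(f);       // fptosi float -> i32, same type as i
    sum += j;                          // after the fold this is simply sum += i
  }
  return sum;
}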
- return Deduced; + return true; } /// Annotate the Shr in (X << IVOperand) >> C as exact using the /// information from the IV's range. Returns true if anything changed, false /// otherwise. bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO, - Value *IVOperand) { + Instruction *IVOperand) { using namespace llvm::PatternMatch; if (BO->getOpcode() == Instruction::Shl) { @@ -896,6 +927,13 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { } } + // Try to use integer induction for FPToSI of float induction directly. + if (replaceFloatIVWithIntegerIV(UseInst)) { + // Re-queue the potentially new direct uses of IVOperand. + pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers); + continue; + } + CastInst *Cast = dyn_cast<CastInst>(UseInst); if (V && Cast) { V->visitCast(Cast); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index f4306bb43dfd..b359717424a6 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -75,7 +75,8 @@ static bool callHasFP128Argument(const CallInst *CI) { }); } -static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { +static Value *convertStrToNumber(CallInst *CI, StringRef &Str, Value *EndPtr, + int64_t Base, IRBuilderBase &B) { if (Base < 2 || Base > 36) // handle special zero base if (Base != 0) @@ -97,6 +98,15 @@ static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result)) return nullptr; + if (EndPtr) { + // Store the pointer to the end. + uint64_t ILen = End - nptr.c_str(); + Value *Off = B.getInt64(ILen); + Value *StrBeg = CI->getArgOperand(0); + Value *StrEnd = B.CreateInBoundsGEP(B.getInt8Ty(), StrBeg, Off, "endptr"); + B.CreateStore(StrEnd, EndPtr); + } + return ConstantInt::get(CI->getType(), Result); } @@ -295,31 +305,69 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B)); } +// Helper to transform memchr(S, C, N) == S to N && *S == C and, when +// NBytes is null, strchr(S, C) to *S == C. A precondition of the function +// is that either S is dereferenceable or the value of N is nonzero. +static Value* memChrToCharCompare(CallInst *CI, Value *NBytes, + IRBuilderBase &B, const DataLayout &DL) +{ + Value *Src = CI->getArgOperand(0); + Value *CharVal = CI->getArgOperand(1); + + // Fold memchr(A, C, N) == A to N && *A == C. + Type *CharTy = B.getInt8Ty(); + Value *Char0 = B.CreateLoad(CharTy, Src); + CharVal = B.CreateTrunc(CharVal, CharTy); + Value *Cmp = B.CreateICmpEQ(Char0, CharVal, "char0cmp"); + + if (NBytes) { + Value *Zero = ConstantInt::get(NBytes->getType(), 0); + Value *And = B.CreateICmpNE(NBytes, Zero); + Cmp = B.CreateLogicalAnd(And, Cmp); + } + + Value *NullPtr = Constant::getNullValue(CI->getType()); + return B.CreateSelect(Cmp, Src, NullPtr); +} + Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); + Value *CharVal = CI->getArgOperand(1); annotateNonNullNoUndefBasedOnAccess(CI, 0); + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + return memChrToCharCompare(CI, nullptr, B, DL); + // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. 
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CharC = dyn_cast<ConstantInt>(CharVal); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); if (Len) annotateDereferenceableBytes(CI, 0, Len); else return nullptr; + + Function *Callee = CI->getCalledFunction(); + FunctionType *FT = Callee->getFunctionType(); if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return copyFlags( *CI, - emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + emitMemChr(SrcStr, CharVal, // include nul. ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, DL, TLI)); } + if (CharC->isZero()) { + Value *NullPtr = Constant::getNullValue(CI->getType()); + if (isOnlyUsedInEqualityComparison(CI, NullPtr)) + // Pre-empt the transformation to strlen below and fold + // strchr(A, '\0') == null to false. + return B.CreateIntToPtr(B.getTrue(), CI->getType()); + } + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; @@ -1008,8 +1056,12 @@ Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, DL)) + + if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, 0); + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + return memChrToCharCompare(CI, Size, B, DL); + } Value *CharVal = CI->getArgOperand(1); ConstantInt *CharC = dyn_cast<ConstantInt>(CharVal); @@ -1099,9 +1151,16 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { return B.CreateSelect(And, SrcStr, Sel1, "memchr.sel2"); } - if (!LenC) + if (!LenC) { + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + // S is dereferenceable so it's safe to load from it and fold + // memchr(S, C, N) == S to N && *S == C for any C and N. + // TODO: This is safe even even for nonconstant S. + return memChrToCharCompare(CI, Size, B, DL); + // From now on we need a constant length and constant array. return nullptr; + } // If the char is variable but the input str and length are not we can turn // this memchr call into a simple bit field test. Of course this only works @@ -1589,31 +1648,6 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, return nullptr; } -static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) { - // Multiplications calculated using Addition Chains. - // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html - - assert(Exp != 0 && "Incorrect exponent 0 not handled"); - - if (InnerChain[Exp]) - return InnerChain[Exp]; - - static const unsigned AddChain[33][2] = { - {0, 0}, // Unused. - {0, 0}, // Unused (base case = pow1). - {1, 1}, // Unused (pre-computed). - {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, - {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, - {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, - {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, - {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, - }; - - InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), - getPow(InnerChain, AddChain[Exp][1], B)); - return InnerChain[Exp]; -} - // Return a properly extended integer (DstWidth bits wide) if the operation is // an itofp. 
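In source terms, the memChrToCharCompare-based folds introduced above rewrite a result-equals-pointer test into a single character compare (illustrative standard C++ mirroring the comments, not the emitted IR):

#include <cstddef>
#include <cstring>

// memchr(S, C, N) == S  ==>  N != 0 && *S == (char)C
// (the fold requires S to be dereferenceable or N to be known nonzero)
bool memchrHitsFront(const char *S, int C, std::size_t N) {
  return std::memchr(S, C, N) == S;
}
bool memchrHitsFrontFolded(const char *S, int C, std::size_t N) {
  return N != 0 && *S == static_cast<char>(C);
}

// strchr(S, C) == S  ==>  *S == (char)C
bool strchrHitsFront(const char *S, int C) {
  return std::strchr(S, C) == S;
}
bool strchrHitsFrontFolded(const char *S, int C) {
  return *S == static_cast<char>(C);
}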
static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { @@ -1914,70 +1948,52 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { if (Value *Sqrt = replacePowWithSqrt(Pow, B)) return Sqrt; - // pow(x, n) -> x * x * x * ... + // pow(x, n) -> powi(x, n) * sqrt(x) if n has exactly a 0.5 fraction const APFloat *ExpoF; - if (AllowApprox && match(Expo, m_APFloat(ExpoF)) && - !ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)) { - // We limit to a max of 7 multiplications, thus the maximum exponent is 32. - // If the exponent is an integer+0.5 we generate a call to sqrt and an - // additional fmul. - // TODO: This whole transformation should be backend specific (e.g. some - // backends might prefer libcalls or the limit for the exponent might - // be different) and it should also consider optimizing for size. - APFloat LimF(ExpoF->getSemantics(), 33), - ExpoA(abs(*ExpoF)); - if (ExpoA < LimF) { - // This transformation applies to integer or integer+0.5 exponents only. - // For integer+0.5, we create a sqrt(Base) call. - Value *Sqrt = nullptr; - if (!ExpoA.isInteger()) { - APFloat Expo2 = ExpoA; - // To check if ExpoA is an integer + 0.5, we add it to itself. If there - // is no floating point exception and the result is an integer, then - // ExpoA == integer + 0.5 - if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK) - return nullptr; - - if (!Expo2.isInteger()) - return nullptr; - - Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(), - Pow->doesNotAccessMemory(), M, B, TLI); - if (!Sqrt) - return nullptr; - } - - // We will memoize intermediate products of the Addition Chain. - Value *InnerChain[33] = {nullptr}; - InnerChain[1] = Base; - InnerChain[2] = B.CreateFMul(Base, Base, "square"); - - // We cannot readily convert a non-double type (like float) to a double. - // So we first convert it to something which could be converted to double. - ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored); - Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B); + if (match(Expo, m_APFloat(ExpoF)) && !ExpoF->isExactlyValue(0.5) && + !ExpoF->isExactlyValue(-0.5)) { + APFloat ExpoA(abs(*ExpoF)); + APFloat ExpoI(*ExpoF); + Value *Sqrt = nullptr; + if (AllowApprox && !ExpoA.isInteger()) { + APFloat Expo2 = ExpoA; + // To check if ExpoA is an integer + 0.5, we add it to itself. If there + // is no floating point exception and the result is an integer, then + // ExpoA == integer + 0.5 + if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK) + return nullptr; - // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x). - if (Sqrt) - FMul = B.CreateFMul(FMul, Sqrt); + if (!Expo2.isInteger()) + return nullptr; - // If the exponent is negative, then get the reciprocal. 
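Arithmetically, the rewritten pow handling above splits a half-integer exponent into an integer power times one square root, rounding the exponent toward negative infinity first (a hedged sketch of the math under the approximate-FP conditions checked in that hunk, not the calls it emits):

#include <cmath>

// pow(x, 3.5)  ~= powi(x, 3)  * sqrt(x)
// pow(x, -3.5) ~= powi(x, -4) * sqrt(x)   since x^-4 * x^0.5 == x^-3.5
double powiTimesSqrt(double x, int n) {
  double p = 1.0;
  long long m = n < 0 ? -static_cast<long long>(n) : n;
  for (long long i = 0; i < m; ++i)
    p *= x;                      // naive powi; the real lowering calls llvm.powi
  if (n < 0)
    p = 1.0 / p;
  return p * std::sqrt(x);       // x^n * x^0.5 == x^(n + 0.5)
}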
- if (ExpoF->isNegative()) - FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal"); + if (ExpoI.roundToIntegral(APFloat::rmTowardNegative) != + APFloat::opInexact) + return nullptr; + if (!ExpoI.isInteger()) + return nullptr; + ExpoF = &ExpoI; - return FMul; + Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(), + Pow->doesNotAccessMemory(), M, B, TLI); + if (!Sqrt) + return nullptr; } + // pow(x, n) -> powi(x, n) if n is a constant signed integer value APSInt IntExpo(TLI->getIntSize(), /*isUnsigned=*/false); - // powf(x, n) -> powi(x, n) if n is a constant signed integer value if (ExpoF->isInteger() && ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) == APFloat::opOK) { - return copyFlags( + Value *PowI = copyFlags( *Pow, createPowWithIntegerExponent( Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B)); + + if (PowI && Sqrt) + return B.CreateFMul(PowI, Sqrt); + + return PowI; } } @@ -2517,7 +2533,7 @@ Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; - return convertStrToNumber(CI, Str, 10); + return convertStrToNumber(CI, Str, nullptr, 10, B); } Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { @@ -2525,11 +2541,14 @@ Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; - if (!isa<ConstantPointerNull>(CI->getArgOperand(1))) + Value *EndPtr = CI->getArgOperand(1); + if (isa<ConstantPointerNull>(EndPtr)) + EndPtr = nullptr; + else if (!isKnownNonZero(EndPtr, DL)) return nullptr; if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) { - return convertStrToNumber(CI, Str, CInt->getSExtValue()); + return convertStrToNumber(CI, Str, EndPtr, CInt->getSExtValue(), B); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6242d9a93fc1..183ba86abcb4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -386,20 +386,6 @@ static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { return true; } -/// Check whether it is safe to if-convert this phi node. -/// -/// Phi nodes with constant expressions that can trap are not safe to if -/// convert. -static bool canIfConvertPHINodes(BasicBlock *BB) { - for (PHINode &Phi : BB->phis()) { - for (Value *V : Phi.incoming_values()) - if (auto *C = dyn_cast<Constant>(V)) - if (C->canTrap()) - return false; - } - return true; -} - static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -993,7 +979,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } } - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getPredicate()); return true; } @@ -1098,13 +1083,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( SmallPtrSetImpl<const Instruction *> &MaskedOp, SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { for (Instruction &I : *BB) { - // Check that we don't have a constant expression that can trap as operand. 
- for (Value *Operand : I.operands()) { - if (auto *C = dyn_cast<Constant>(Operand)) - if (C->canTrap()) - return false; - } - // We can predicate blocks with calls to assume, as long as we drop them in // case we flatten the CFG via predication. if (match(&I, m_Intrinsic<Intrinsic::assume>())) { @@ -1190,7 +1168,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // Collect the blocks that need predication. - BasicBlock *Header = TheLoop->getHeader(); for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { @@ -1212,13 +1189,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { BB->getTerminator()); return false; } - } else if (BB != Header && !canIfConvertPHINodes(BB)) { - reportVectorizationFailure( - "Control flow cannot be substituted for a select", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); - return false; } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 0cb2032fa45a..2e9a9fe0640e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -33,7 +33,6 @@ class LoopInfo; class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class LoopVectorizationRequirements; class LoopVectorizeHints; class OptimizationRemarkEmitter; class TargetTransformInfo; @@ -46,8 +45,9 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands, DebugLoc DL) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); + ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name); if (BB) BB->insert(Instr, InsertPt); return Instr; @@ -55,8 +55,8 @@ class VPBuilder { VPInstruction *createInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); + DebugLoc DL, const Twine &Name = "") { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name); } public: @@ -124,34 +124,37 @@ public: /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as /// its underlying Instruction. 
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, - Instruction *Inst = nullptr) { + Instruction *Inst = nullptr, const Twine &Name = "") { DebugLoc DL; if (Inst) DL = Inst->getDebugLoc(); - VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, - DebugLoc DL) { - return createInstruction(Opcode, Operands, DL); + DebugLoc DL, const Twine &Name = "") { + return createInstruction(Opcode, Operands, DL, Name); } - VPValue *createNot(VPValue *Operand, DebugLoc DL) { - return createInstruction(VPInstruction::Not, {Operand}, DL); + VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") { + return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL, + const Twine &Name = "") { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); } - VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL, + const Twine &Name = "") { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name); } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); + DebugLoc DL, const Twine &Name = "") { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, + Name); } //===--------------------------------------------------------------------===// @@ -191,6 +194,10 @@ struct VectorizationFactor { /// Cost of the scalar loop. InstructionCost ScalarCost; + /// The minimum trip count required to make vectorization profitable, e.g. due + /// to runtime checks. + ElementCount MinProfitableTripCount; + VectorizationFactor(ElementCount Width, InstructionCost Cost, InstructionCost ScalarCost) : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} @@ -268,8 +275,6 @@ class LoopVectorizationPlanner { const LoopVectorizeHints &Hints; - LoopVectorizationRequirements &Requirements; - OptimizationRemarkEmitter *ORE; SmallVector<VPlanPtr, 4> VPlans; @@ -285,10 +290,9 @@ public: InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements, OptimizationRemarkEmitter *ORE) : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {} + PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. @@ -332,11 +336,6 @@ public: bool requiresTooManyRuntimeChecks() const; protected: - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. 
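The MinProfitableTripCount field added to VectorizationFactor above carries the smallest trip count for which the one-off runtime-check cost is recovered; it later feeds the minimum-iteration check in front of the vector loop. A back-of-the-envelope sketch of that break-even calculation (illustrative names, not the planner's exact formula):

#include <cstdint>

// Smallest TC with TC * ScalarCostPerIter > RTCheckCost + TC * VecCostPerIter,
// i.e. the first trip count where vectorizing plus its checks is a net win.
uint64_t minProfitableTripCount(uint64_t RTCheckCost, uint64_t ScalarCostPerIter,
                                uint64_t VecCostPerIter) {
  if (ScalarCostPerIter <= VecCostPerIter)
    return UINT64_MAX;                       // vectorizing never pays off
  uint64_t SavingPerIter = ScalarCostPerIter - VecCostPerIter;
  return RTCheckCost / SavingPerIter + 1;    // strictly past the break-even point
}

For example, with a 60-unit check block and a 3-unit per-iteration saving, the vector loop is only worth entering from 21 iterations upward.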
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b637b2d5ddae..0777a1385916 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +static cl::opt<unsigned> VectorizeMemoryCheckThreshold( + "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks")); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the @@ -442,6 +441,7 @@ public: const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) @@ -453,6 +453,11 @@ public: // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + + if (MinProfitableTripCount.isZero()) + this->MinProfitableTripCount = VecWidth; + else + this->MinProfitableTripCount = MinProfitableTripCount; } virtual ~InnerLoopVectorizer() = default; @@ -656,6 +661,8 @@ protected: /// vector elements. ElementCount VF; + ElementCount MinProfitableTripCount; + /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -735,6 +742,7 @@ public: LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} @@ -783,8 +791,8 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, - Checks), + EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, + CM, BFI, PSI, Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || isa<VPScalarIVStepsRecipe>(CurRec) || - isa<VPCanonicalIVPHIRecipe>(CurRec)) + isa<VPCanonicalIVPHIRecipe>(CurRec) || + isa<VPActiveLaneMaskPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1503,6 +1512,13 @@ public: /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns true if were tail-folding and want to use the active lane mask + /// for vector loop control flow. 
+ bool useActiveLaneMaskForControlFlow() const { + return FoldTailByMasking && + TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -1551,14 +1567,14 @@ public: Scalars.clear(); } -private: - unsigned NumPredStores = 0; - /// Convenience function that returns the value of vscale_range iff /// vscale_range.min == vscale_range.max or otherwise returns the value /// returned by the corresponding TLI method. Optional<unsigned> getVScaleForTuning() const; +private: + unsigned NumPredStores = 0; + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is @@ -1661,7 +1677,8 @@ private: /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. - SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> + PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at /// least once. This may be needed as a fallback loop in case runtime @@ -1849,14 +1866,17 @@ class GeneratedRTChecks { DominatorTree *DT; LoopInfo *LI; + TargetTransformInfo *TTI; SCEVExpander SCEVExp; SCEVExpander MemCheckExp; + bool CostTooHigh = false; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - const DataLayout &DL) - : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), + TargetTransformInfo *TTI, const DataLayout &DL) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), MemCheckExp(SE, DL, "scev.check") {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can @@ -1867,6 +1887,15 @@ public: void Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + // Hard cutoff to limit compile-time increase in case a very large number of + // runtime checks needs to be generated. + // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to + // profile info. 
+ CostTooHigh = + LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; + if (CostTooHigh) + return; + BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -1938,6 +1967,44 @@ public: } } + InstructionCost getCost() { + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); + + if (CostTooHigh) { + InstructionCost Cost; + Cost.setInvalid(); + LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); + return Cost; + } + + InstructionCost RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost + << "\n"); + + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - Value *Step = createStepForVF(Builder, CountTy, VF, UF); + auto CreateStep = [&]() { + // Create step with max(MinProTripCount, UF * VF). + if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) + return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + return createStepForVF(Builder, CountTy, VF, UF); + }; + if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + CheckMinIters = + Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an @@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); } + // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { MaxVScale = TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + MaxVScale ? 
(MaxSafeElements / MaxVScale.value()) : 0); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable( unsigned EstimatedWidthB = B.Width.getKnownMinValue(); if (Optional<unsigned> VScale = getVScaleForTuning()) { if (A.Width.isScalable()) - EstimatedWidthA *= VScale.getValue(); + EstimatedWidthA *= VScale.value(); if (B.Width.isScalable()) - EstimatedWidthB *= VScale.getValue(); + EstimatedWidthB *= VScale.value(); } // Assume vscale may be larger than 1 (or the value being tuned for), @@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; - return TTI.getRegUsageForType(VectorType::get(Ty, VF)); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // map will indicate that we've analyzed it already. ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + PredicatedBBsAfterVectorization[VF].clear(); + // Find all the instructions that are scalar with predication in the loop and // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. @@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. - PredicatedBBsAfterVectorization.insert(BB); + PredicatedBBsAfterVectorization[VF].insert(BB); } } } @@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF.isVector() && BI->isConditional() && - (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { @@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - return (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold && - !Hints.allowReordering()) || - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; -} - Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. 
- return CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) - L->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.value()); else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -void LoopVectorizationPlanner::collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions) { - - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector<BasicBlock*> ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. - // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa<TruncInst>(Op) && Op->hasOneUse()) - DeadInstructions.insert(cast<Instruction>(Op)); - } - } - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - auto *Latch = OrigLoop->getLoopLatch(); - for (auto &Induction : Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - - // If the tail is to be folded by masking, the primary induction variable, - // if exists, isn't dead: it will be used for masking. Don't kill it. - if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) - continue; - - if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); - })) - DeadInstructions.insert(IndUpdate); - } -} - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } //===--------------------------------------------------------------------===// @@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. 
Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != PredicationStyle::None) { VPValue *TC = Plan->getOrCreateTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); } else { VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); @@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return RegSucc; } -VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe, - VPlanPtr &Plan) { +VPRegionBlock *VPRecipeBuilder::createReplicateRegion( + Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) { // Instructions marked for predication are replicated and placed under an // if-then construct to prevent side-effects. @@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); auto *PHIRecipe = Instr->getType()->isVoidTy() ? nullptr - : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + : new VPPredInstPHIRecipe(PredRecipe); if (PHIRecipe) { Plan->removeVPValueFor(Instr); Plan->addVPValue(Instr, PHIRecipe); @@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. + SmallPtrSet<Instruction *, 4> DeadInstructions; auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); @@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a -// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a -// BranchOnCount VPInstruction to the latch. +// Add the necessary canonical IV and branch recipes required to control the +// loop. 
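A scalar C++ model of the loop control this sets up, where the branch back is driven by the next iteration's active lane mask rather than by comparing the counter against the trip count (names such as VF and anyLaneActive are illustrative stand-ins for the real vector factor and the active-lane-mask intrinsic):

#include <algorithm>
#include <cstddef>

constexpr std::size_t VF = 4;                  // stand-in vectorization factor

// Lane i of get_active_lane_mask(Base, N) is set iff Base + i < N; the loop
// only needs to know whether any lane is set.
static bool anyLaneActive(std::size_t Base, std::size_t N) { return Base < N; }

void maskedLoop(float *Dst, const float *Src, std::size_t N) {
  std::size_t I = 0;
  bool Any = anyLaneActive(I, N);              // entry mask, built in the preheader
  while (Any) {
    std::size_t Lanes = std::min(VF, N - I);   // lanes enabled by the mask
    for (std::size_t L = 0; L < Lanes; ++L)    // the body runs under the mask
      Dst[I + L] = 2.0f * Src[I + L];
    I += VF;                                   // index.next
    Any = anyLaneActive(I, N);                 // active.lane.mask.next decides the branch
  }
}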
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW) { + bool HasNUW, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); + // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); + // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar + // IV by VF * UF. auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL); + {CanonicalIVPHI}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + if (UseLaneMaskForLoopControlFlow) { + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {StartV}, DL, "index.part.next"); + Preheader->appendRecipe(CanonicalIVIncrementParts); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getOrCreateTripCount(); + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.entry"); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + + // Create the active lane mask for the next iteration of the loop. + CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.next"); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + // We have to invert the mask here because a true condition means jumping + // to the exit block. + auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); + EB->appendRecipe(NotMask); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); + EB->appendRecipe(BranchBack); + } else { + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); + } } // Add exit values to \p Plan. 
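Summarizing the second latch shape this helper can now emit, here is a scalar C++ model of a loop controlled by the active lane mask (a simplified sketch: UF = 1 so the per-part CanonicalIVIncrementForPart offsets collapse to zero, llvm.get.active.lane.mask is modeled by a plain function, and all names and numbers are illustrative):

#include <array>
#include <cstdio>

template <unsigned VF>
std::array<bool, VF> getActiveLaneMask(unsigned Base, unsigned TC) {
  std::array<bool, VF> M{};
  for (unsigned I = 0; I < VF; ++I)
    M[I] = Base + I < TC;          // lane I is active iff Base + I < TC
  return M;
}

int main() {
  constexpr unsigned VF = 4;       // hypothetical vectorization factor, UF = 1
  const unsigned TC = 10;          // hypothetical trip count
  unsigned Index = 0;
  auto Mask = getActiveLaneMask<VF>(0, TC);      // entry mask, built in the preheader
  do {
    for (unsigned I = 0; I < VF; ++I)            // predicated body: only active lanes do work
      if (Mask[I])
        std::printf("lane %u handles iteration %u\n", I, Index + I);
    Index += VF;                                 // CanonicalIVIncrement
    Mask = getActiveLaneMask<VF>(Index, TC);     // active.lane.mask.next
  } while (Mask[0]);                             // inverted BranchOnCond: leave once no lane is active
  return 0;
}

Because the active lane mask is a prefix mask (lane I can only be true if every lower lane is), testing its first lane is enough to decide whether another iteration is needed, which is what the inverted branch condition boils down to.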
VPLiveOuts are added for each LCSSA phi in the @@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking()); + !CM.foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); // Fold Exit block into its predecessor if possible. @@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true); + true, CM.useActiveLaneMaskForControlFlow()); return Plan; } @@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); - WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + // Append the recipe to the end of the VPBasicBlock because we need to + // ensure that it comes after all of it's inputs, including CondOp. + WidenRecipe->getParent()->appendRecipe(RedRecipe); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); @@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { *this, State); } -void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = - InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); - Value *Op0 = State.get(getOperand(1), Part); - Value *Op1 = State.get(getOperand(2), Part); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel, Part); - State.addMetadata(Sel, &I); - } -} - -void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); - auto &Builder = State.Builder; - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) { - VecOp->copyIRFlags(&I); - - // If the instruction is vectorized and was in a basic block that needed - // predication, we can't propagate poison-generating flags (nuw/nsw, - // exact, etc.). The control flow has been linearized and the - // instruction is no longer guarded by the predicate, which could make - // the flag properties to no longer hold. - if (State.MayGeneratePoisonRecipes.contains(this)) - VecOp->dropPoisonGeneratingFlags(); - } - - // Use this vector value for all users of the original instruction. - State.set(this, V, Part); - State.addMetadata(V, &I); - } - - break; - } - case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Op = State.get(getOperand(0), Part); - - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze, Part); - } - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(this, C, Part); - State.addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - State.setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = (State.VF.isScalar()) - ? 
CI->getType() - : VectorType::get(CI->getType(), State.VF); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(this, Cast, Part); - State.addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - -void VPWidenGEPRecipe::execute(VPTransformState &State) { - auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = State.Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); - State.set(this, EntryPart, Part); - State.addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < State.UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? State.get(getOperand(0), VPIteration(0, 0)) - : State.get(getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // If the GEP instruction is vectorized and was in a basic block that - // needed predication, we can't propagate the poison-generating 'inbounds' - // flag. The control flow has been linearized and the GEP is no longer - // guarded by the predicate, which could make the 'inbounds' properties to - // no longer hold. 
- bool IsInBounds = - GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, - Indices, "", IsInBounds); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP, Part); - State.addMetadata(NewGEP, GEP); - } - } -} - void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); @@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { } } -void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // SELECT(Mask1, In1, - // In0))) - // Note that Mask0 is never used: lanes for which no path reaches this phi and - // are essentially undef are taken from In0. - InnerLoopVectorizer::VectorParts Entry(State.UF); - for (unsigned In = 0; In < NumIncoming; ++In) { - for (unsigned Part = 0; Part < State.UF; ++Part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), Part); - if (In == 0) - Entry[Part] = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Value *Cond = State.get(getMask(In), Part); - Entry[Part] = - State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); - } - } - } - for (unsigned Part = 0; Part < State.UF; ++Part) - State.set(this, Entry[Part], Part); -} - void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), @@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State); } -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); - - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane.getKnownLane(); - - Value *ConditionBit = nullptr; - VPValue *BlockInMask = getMask(); - if (BlockInMask) { - ConditionBit = State.get(BlockInMask, Part); - if (ConditionBit->getType()->isVectorTy()) - ConditionBit = State.Builder.CreateExtractElement( - ConditionBit, State.Builder.getInt32(Lane)); - } else // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. 
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); -} - void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = @@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { } } +static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, + VectorizationFactor &VF, + Optional<unsigned> VScale, Loop *L, + ScalarEvolution &SE) { + InstructionCost CheckCost = Checks.getCost(); + if (!CheckCost.isValid()) + return false; + + // When interleaving only scalar and vector cost will be equal, which in turn + // would lead to a divide by 0. Fall back to hard threshold. + if (VF.Width.isScalar()) { + if (CheckCost > VectorizeMemoryCheckThreshold) { + LLVM_DEBUG( + dbgs() + << "LV: Interleaving only is not profitable due to runtime checks\n"); + return false; + } + return true; + } + + // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. + double ScalarC = *VF.ScalarCost.getValue(); + if (ScalarC == 0) + return true; + + // First, compute the minimum iteration count required so that the vector + // loop outperforms the scalar loop. + // The total cost of the scalar loop is + // ScalarC * TC + // where + // * TC is the actual trip count of the loop. + // * ScalarC is the cost of a single scalar iteration. + // + // The total cost of the vector loop is + // RtC + VecC * (TC / VF) + EpiC + // where + // * RtC is the cost of the generated runtime checks + // * VecC is the cost of a single vector iteration. + // * TC is the actual trip count of the loop + // * VF is the vectorization factor + // * EpiCost is the cost of the generated epilogue, including the cost + // of the remaining scalar operations. 
+ // + // Vectorization is profitable once the total vector cost is less than the + // total scalar cost: + // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC + // + // Now we can compute the minimum required trip count TC as + // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // + // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that + // the computations are performed on doubles, not integers and the result + // is rounded up, hence we get an upper estimate of the TC. + unsigned IntVF = VF.Width.getKnownMinValue(); + if (VF.Width.isScalable()) { + unsigned AssumedMinimumVscale = 1; + if (VScale) + AssumedMinimumVscale = *VScale; + IntVF *= AssumedMinimumVscale; + } + double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; + double RtC = *CheckCost.getValue(); + double MinTC1 = RtC / (ScalarC - VecCOverVF); + + // Second, compute a minimum iteration count so that the cost of the + // runtime checks is only a fraction of the total scalar loop cost. This + // adds a loop-dependent bound on the overhead incurred if the runtime + // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC + // * TC. To bound the runtime check to be a fraction 1/X of the scalar + // cost, compute + // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC + double MinTC2 = RtC * 10 / ScalarC; + + // Now pick the larger minimum. If it is not a multiple of VF, choose the + // next closest multiple of VF. This should partly compensate for ignoring + // the epilogue cost. + uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + + LLVM_DEBUG( + dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" + << VF.MinProfitableTripCount << "\n"); + + // Skip vectorization if the expected trip count is less than the minimum + // required trip count. + if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), + VF.MinProfitableTripCount)) { + LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " + "trip count < minimum profitable VF (" + << *ExpectedTC << " < " << VF.MinProfitableTripCount + << ")\n"); + + return false; + } + } + return true; +} + LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || !EnableLoopInterleaving), @@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectElementTypesForWidening(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); if (MaybeVF) { - if (LVP.requiresTooManyRuntimeChecks()) { + VF = *MaybeVF; + // Select the interleave count. + IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. 
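To see what the two bounds derived above amount to, here is a standalone sketch with made-up costs (ScalarC, VecC, RtC and VF mirror the variables in areRuntimeChecksProfitable, but the numbers are hypothetical and the factor of 10 is the same hard-coded fraction used in the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical costs; only the formulas mirror the patch.
  const double ScalarC = 4.0;   // cost of one scalar iteration
  const double VecC = 10.0;     // cost of one vector iteration
  const unsigned VF = 8;        // vectorization factor
  const double RtC = 36.0;      // cost of the generated runtime checks

  // Break-even bound: RtC / (ScalarC - VecC / VF) < TC.
  double MinTC1 = RtC / (ScalarC - VecC / VF);
  // Overhead bound: keep the checks to a tenth of the scalar cost, RtC * 10 / ScalarC < TC.
  double MinTC2 = RtC * 10 / ScalarC;
  // Take the larger bound, round up, and align to the next multiple of VF.
  uint64_t MinTC = (uint64_t)std::ceil(std::max(MinTC1, MinTC2));
  uint64_t MinProfitableTC = (MinTC + VF - 1) / VF * VF;

  // With these numbers MinTC1 is about 13.1 and MinTC2 is 90, so the loop
  // needs a trip count of at least 96 before the checks pay for themselves.
  std::printf("minimum profitable trip count: %llu\n",
              (unsigned long long)MinProfitableTC);
  return 0;
}

For scalable VFs the patch additionally multiplies VF by an assumed minimum vscale (getVScaleForTuning) before performing this computation.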
+ if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + + // Check if it is profitable to vectorize with runtime checks. + bool ForceVectorization = + Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (!ForceVectorization && + !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.emitRemarkWithHints(); return false; } - VF = *MaybeVF; - // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); - - unsigned SelectedIC = std::max(IC, UserIC); - // Optimistically generate runtime checks if they are needed. Drop them if - // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, + PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); @@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); if (RemainderLoopID) { - L->setLoopID(RemainderLoopID.getValue()); + L->setLoopID(RemainderLoopID.value()); } else { if (DisableRuntimeUnroll) AddRuntimeUnrollDisableMetaData(L); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 019a09665a67..e136cd9aedac 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2637,7 +2637,7 @@ private: AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional<bool> &result = AliasCache[key]; if (result) { - return result.getValue(); + return result.value(); } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) @@ -4592,7 +4592,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, }; InstructionsState S = getSameOpcode(VL); - if (Depth == RecursionMaxDepth) { + + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of + // a load), in which case peek through to include it in the tree, without + // ballooning over-budget. 
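For intuition about the bundles this exception targets (an illustrative source-level example, not taken from the patch): once the tree is RecursionMaxDepth levels deep, a node whose members are all loads, or single-use sign/zero extensions of single-use loads with matching opcodes, is still worth building, since such leaves end the recursion anyway. A reduction over widened loads is a typical producer of that shape:

#include <cstdint>

// Hypothetical example: the sext(load) operands of the multiplies can sit at
// the bottom of a deep SLP tree and should not be turned into gathers there.
int32_t widen_dot(const int8_t *a, const int8_t *b) {
  int32_t sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += int32_t(a[i]) * int32_t(b[i]);
  return sum;
}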
+ if (Depth >= RecursionMaxDepth && + !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp && + VL.size() >= 4 && + (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { + return match(I, + m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && + cast<Instruction>(I)->getOpcode() == + cast<Instruction>(S.MainOp)->getOpcode(); + })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -11217,7 +11229,7 @@ public: return OptimizationRemarkMissed( SV_NAME, "HorSLPNotBeneficial", ReducedValsToOps.find(VL[0])->second.front()) - << "Vectorizing horizontal reduction is possible" + << "Vectorizing horizontal reduction is possible " << "but not beneficial with cost " << ore::NV("Cost", Cost) << " and threshold " << ore::NV("Threshold", -SLPCostThreshold); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 97f2b1a93815..c7949c42c03e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -159,7 +159,8 @@ public: /// Create a replicating region for instruction \p I that requires /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. - VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, + VPRegionBlock *createReplicateRegion(Instruction *I, + VPReplicateRecipe *PredRecipe, VPlanPtr &Plan); /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4d709097c306..30032dda7f60 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -248,25 +248,27 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { } void VPTransformState::setDebugLocFromInst(const Value *V) { - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { - const DILocation *DIL = Inst->getDebugLoc(); - - // When a FSDiscriminator is enabled, we don't need to add the multiply - // factors to the discriminators. - if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { - // FIXME: For scalable vectors, assume vscale=1. - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); - if (NewDIL) - Builder.SetCurrentDebugLocation(*NewDIL); - else - LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " - << DIL->getFilename() << " Line: " << DIL->getLine()); - } else - Builder.SetCurrentDebugLocation(DIL); - } else + const Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) { Builder.SetCurrentDebugLocation(DebugLoc()); + return; + } + + const DILocation *DIL = Inst->getDebugLoc(); + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. 
+ auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else + Builder.SetCurrentDebugLocation(DIL); } BasicBlock * @@ -566,6 +568,24 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa<VPActiveLaneMaskPHIRecipe>(&R)) + return cast<VPActiveLaneMaskPHIRecipe>(&R); + } + return nullptr; +} + +static bool canSimplifyBranchOnCond(VPInstruction *Term) { + VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0)); + if (!Not || Not->getOpcode() != VPInstruction::Not) + return false; + + VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0)); + return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask; +} + void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, @@ -573,11 +593,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); - // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when - // preparing to execute the plan for the main vector loop. - if (!IsEpilogueVectorization && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && - isa<ConstantInt>(TripCountV)) { + // Try to simplify the branch condition if TC <= VF * UF when preparing to + // execute the plan for the main vector loop. We only do this if the + // terminator is: + // 1. BranchOnCount, or + // 2. BranchOnCond where the input is Not(ActiveLaneMask). + if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) && + (Term->getOpcode() == VPInstruction::BranchOnCount || + (Term->getOpcode() == VPInstruction::BranchOnCond && + canSimplifyBranchOnCond(Term)))) { ConstantInt *C = cast<ConstantInt>(TripCountV); uint64_t TCVal = C->getZExtValue(); if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { @@ -697,7 +721,8 @@ void VPlan::execute(VPTransformState *State) { // generated. bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || - cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 09da4a545d0d..f009a7ee6b4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -784,6 +784,10 @@ public: ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, BranchOnCond }; @@ -794,6 +798,9 @@ private: FastMathFlags FMF; DebugLoc DL; + /// An optional name that can be used for the generated IR instruction. + const std::string Name; + /// Utility method serving execute(): generates a single instance of the /// modeled instruction. 
void generateInstruction(VPTransformState &State, unsigned Part); @@ -802,14 +809,15 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), - DL(DL) {} + DL(DL), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +826,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL); + return new VPInstruction(Opcode, Operands, DL, Name); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -897,6 +905,8 @@ public: case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: return true; }; @@ -1125,6 +1135,7 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1132,6 +1143,7 @@ public: } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1861,6 +1873,42 @@ public: } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +/// remove VPActiveLaneMaskPHIRecipe. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2656,6 +2704,10 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there + /// be only one at most. If there isn't one, then return nullptr. + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void clearLiveOuts() { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 92422b17457c..fdd901a4a70d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -26,13 +26,19 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> using namespace llvm; +using VectorParts = SmallVector<Value *, 2>; + extern cl::opt<bool> EnableVPlanNativePath; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { @@ -186,7 +192,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + Value *V = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); State.set(this, V, Part); return; } @@ -194,14 +201,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, switch (getOpcode()) { case VPInstruction::Not: { Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); + Value *V = Builder.CreateNot(A, Name); State.set(this, V, Part); break; } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); + Value *V = Builder.CreateICmpULE(IV, TC, Name); State.set(this, V, Part); break; } @@ -209,7 +216,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); + Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); State.set(this, V, Part); break; } @@ -223,7 +230,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + {VIVElem0, ScalarTC}, nullptr, Name); State.set(this, Call, Part); break; } @@ -247,7 +254,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, PartMinus1, Part); } else { Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), + Part); } break; } @@ -261,7 +269,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, // elements) times 
the unroll factor (num of SIMD instructions). Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); } @@ -269,6 +277,23 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, Next, Part); break; } + + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + if (Part == 0) { + State.set(this, IV, Part); + break; + } + + // The canonical IV is incremented by the vectorization factor (num of SIMD + // elements) times the unroll part. + Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); + State.set(this, Next, Part); + break; + } case VPInstruction::BranchOnCond: { if (Part != 0) break; @@ -370,6 +395,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; @@ -431,7 +462,158 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(2)->printAsOperand(O, SlotTracker); O << (InvariantCond ? " (condition is loop invariant)" : ""); } +#endif + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.addMetadata(Sel, &I); + } +} + +void VPWidenRecipe::execute(VPTransformState &State) { + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. 
+ State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.contains(this)) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.addMetadata(V, &I); + } + + break; + } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.addMetadata(C, &I); + } + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; @@ -487,7 +669,82 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "= SCALAR-STEPS "; printOperands(O, SlotTracker); } +#endif + +void VPWidenGEPRecipe::execute(VPTransformState &State) { + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. 
A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. 
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.addMetadata(NewGEP, GEP); + } + } +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -501,7 +758,48 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, O << " = getelementptr "; printOperands(O, SlotTracker); } +#endif +void VPBlendRecipe::execute(VPTransformState &State) { + State.setDebugLocFromInst(Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. + VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; ++In) { + for (unsigned Part = 0; Part < State.UF; ++Part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = State.get(getIncomingValue(In), Part); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Value *Cond = State.get(getMask(In), Part); + Entry[Part] = + State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); + } + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.set(this, Entry[Part], Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; @@ -566,7 +864,35 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, if (AlsoPack) O << " (S->V)"; } +#endif +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane.getKnownLane(); + + Value *ConditionBit = nullptr; + VPValue *BlockInMask = getMask(); + if (BlockInMask) { + ConditionBit = State.get(BlockInMask, Part); + if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. 
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; @@ -838,3 +1164,28 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +// remove VPActiveLaneMaskPHIRecipe. +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "ACTIVE-LANE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5fc676834331..c99fae1b2ab4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ public: // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ public: // Phi-like recipes. Need to be kept together. VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index f917883145c0..3501de6ab38e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -133,32 +133,48 @@ void VPlanVerifier::verifyHierarchicalCFG( verifyRegionRec(TopRegion); } -bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); - for (const VPBasicBlock *VPBB : - VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { - // Verify that phi-like recipes are at the beginning of the block, with no - // other recipes in between. - auto RecipeI = VPBB->begin(); - auto End = VPBB->end(); - while (RecipeI != End && RecipeI->isPhi()) - RecipeI++; +static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) { + // Verify that phi-like recipes are at the beginning of the block, with no + // other recipes in between. 
+ auto RecipeI = VPBB->begin(); + auto End = VPBB->end(); + unsigned NumActiveLaneMaskPhiRecipes = 0; + while (RecipeI != End && RecipeI->isPhi()) { + if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI)) + NumActiveLaneMaskPhiRecipes++; + RecipeI++; + } - while (RecipeI != End) { - if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { - errs() << "Found phi-like recipe after non-phi recipe"; + if (NumActiveLaneMaskPhiRecipes > 1) { + errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe"; + return false; + } + + while (RecipeI != End) { + if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { + errs() << "Found phi-like recipe after non-phi recipe"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - errs() << ": "; - RecipeI->dump(); - errs() << "after\n"; - std::prev(RecipeI)->dump(); + errs() << ": "; + RecipeI->dump(); + errs() << "after\n"; + std::prev(RecipeI)->dump(); #endif - return false; - } - RecipeI++; + return false; } + RecipeI++; + } + + return true; +} + +bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); + for (const VPBasicBlock *VPBB : + VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { + if (!verifyVPBasicBlock(VPBB)) + return false; } const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); @@ -181,15 +197,16 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount " - "VPInstruction but is empty\n"; + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; return false; } auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 90598937affc..d12624ffb824 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -414,6 +414,10 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilder<> &Builder) { + // Shufflevectors can only be created for fixed-width vectors. + if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) + return nullptr; + // If the extract can be constant-folded, this code is unsimplified. Defer // to other passes to handle that. 
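The new fixed-width guard in translateExtract is needed because a shufflevector mask is a fixed-length list of constants: for a scalable operand such as <vscale x 4 x float> only splat-like (zeroinitializer) masks are expressible, so the index-shifting shuffle that createShiftShuffle builds cannot be formed, and bailing out early is the safe choice.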
Value *X = ExtElt->getVectorOperand(); @@ -1249,14 +1253,20 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || VT != Op0->getType()) return false; - auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0)); - auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1)); - auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0)); - auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1)); + auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); + auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); + auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); + auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); + SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); auto checkSVNonOpUses = [&](Instruction *I) { if (!I || I->getOperand(0)->getType() != VT) return true; - return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + return any_of(I->users(), [&](User *U) { + return U != Op0 && U != Op1 && + !(isa<ShuffleVectorInst>(U) && + (InputShuffles.contains(cast<Instruction>(U)) || + isInstructionTriviallyDead(cast<Instruction>(U)))); + }); }; if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) @@ -1271,6 +1281,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SV = dyn_cast<ShuffleVectorInst>(U); if (!SV || SV->getType() != VT) return false; + if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || + (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) + return false; if (!llvm::is_contained(Shuffles, SV)) Shuffles.push_back(SV); } @@ -1283,13 +1296,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (FromReduction && Shuffles.size() > 1) return false; + // Add any shuffle uses for the shuffles we have found, to include them in our + // cost calculations. + if (!FromReduction) { + for (ShuffleVectorInst *SV : Shuffles) { + for (auto U : SV->users()) { + ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); + if (SSV && isa<UndefValue>(SSV->getOperand(1))) + Shuffles.push_back(SSV); + } + } + } + // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the // end. If the binops are legalized to smaller vectors, this may reduce total // number of binops. We compute the ReconstructMask mask needed to convert // back to the original lane order. - SmallVector<int> V1, V2; - SmallVector<SmallVector<int>> ReconstructMasks; + SmallVector<std::pair<int, int>> V1, V2; + SmallVector<SmallVector<int>> OrigReconstructMasks; int MaxV1Elt = 0, MaxV2Elt = 0; unsigned NumElts = VT->getNumElements(); for (ShuffleVectorInst *SVN : Shuffles) { @@ -1300,6 +1325,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // case we need to commute the mask). Value *SVOp0 = SVN->getOperand(0); Value *SVOp1 = SVN->getOperand(1); + if (isa<UndefValue>(SVOp1)) { + auto *SSV = cast<ShuffleVectorInst>(SVOp0); + SVOp0 = SSV->getOperand(0); + SVOp1 = SSV->getOperand(1); + for (unsigned I = 0, E = Mask.size(); I != E; I++) { + if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) + return false; + Mask[I] = Mask[I] < 0 ? 
Mask[I] : SSV->getMaskValue(Mask[I]); + } + } if (SVOp0 == Op1 && SVOp1 == Op0) { std::swap(SVOp0, SVOp1); ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); @@ -1316,21 +1351,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { ReconstructMask.push_back(-1); } else if (Mask[I] < static_cast<int>(NumElts)) { MaxV1Elt = std::max(MaxV1Elt, Mask[I]); - auto It = find(V1, Mask[I]); + auto It = find_if(V1, [&](const std::pair<int, int> &A) { + return Mask[I] == A.first; + }); if (It != V1.end()) ReconstructMask.push_back(It - V1.begin()); else { ReconstructMask.push_back(V1.size()); - V1.push_back(Mask[I]); + V1.emplace_back(Mask[I], V1.size()); } } else { MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); - auto It = find(V2, Mask[I] - NumElts); + auto It = find_if(V2, [&](const std::pair<int, int> &A) { + return Mask[I] - static_cast<int>(NumElts) == A.first; + }); if (It != V2.end()) ReconstructMask.push_back(NumElts + It - V2.begin()); else { ReconstructMask.push_back(NumElts + V2.size()); - V2.push_back(Mask[I] - NumElts); + V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); } } } @@ -1339,7 +1378,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // result. In-order can help simplify the shuffle away. if (FromReduction) sort(ReconstructMask); - ReconstructMasks.push_back(ReconstructMask); + OrigReconstructMasks.push_back(std::move(ReconstructMask)); } // If the Maximum element used from V1 and V2 are not larger than the new @@ -1351,16 +1390,68 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { MaxV2Elt == static_cast<int>(V2.size()) - 1)) return false; + // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a + // shuffle of another shuffle, or not a shuffle (that is treated like a + // identity shuffle). + auto GetBaseMaskValue = [&](Instruction *I, int M) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return M; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getMaskValue(SV->getMaskValue(M)); + return SV->getMaskValue(M); + }; + + // Attempt to sort the inputs my ascending mask values to make simpler input + // shuffles and push complex shuffles down to the uses. We sort on the first + // of the two input shuffle orders, to try and get at least one input into a + // nice order. + auto SortBase = [&](Instruction *A, std::pair<int, int> X, + std::pair<int, int> Y) { + int MXA = GetBaseMaskValue(A, X.first); + int MYA = GetBaseMaskValue(A, Y.first); + return MXA < MYA; + }; + stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI0A, A, B); + }); + stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI1A, A, B); + }); + // Calculate our ReconstructMasks from the OrigReconstructMasks and the + // modified order of the input shuffles. 
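// Sketch of the two ideas used above, on plain integer masks and with
// invented values: looking through a shuffle-of-shuffle composes masks (the
// lane an outer mask selects from an inner shuffle is Inner[Outer[i]]), and
// the demanded source lanes are kept as (sourceLane, slot) pairs that are
// stable-sorted by source lane so the rebuilt input shuffle is as close to an
// identity as possible. This assumes plain shuffle inputs, so the base mask
// value is the stored lane itself; composeMasks is an invented helper.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Compose an outer mask with the mask of the shuffle it reads from.
// -1 stands for an undef lane, as in shufflevector masks.
static std::vector<int> composeMasks(const std::vector<int> &Outer,
                                     const std::vector<int> &Inner) {
  std::vector<int> Result;
  for (int M : Outer)
    Result.push_back(M < 0 ? -1 : Inner[M]);
  return Result;
}

int main() {
  // The outer shuffle picks lanes {2,0,3,1} of an inner shuffle whose own
  // mask is {4,5,6,7}, so the effective lanes are {6,4,7,5}.
  std::vector<int> Composed = composeMasks({2, 0, 3, 1}, {4, 5, 6, 7});
  for (int M : Composed)
    std::printf("%d ", M); // 6 4 7 5
  std::printf("\n");

  // Lanes demanded from one input, paired with the slot they were given in
  // insertion order, then sorted by source lane.
  std::vector<std::pair<int, int>> V1 = {{6, 0}, {4, 1}, {7, 2}, {5, 3}};
  std::stable_sort(V1.begin(), V1.end(), [](const std::pair<int, int> &A,
                                            const std::pair<int, int> &B) {
    return A.first < B.first;
  });
  for (const auto &P : V1)
    std::printf("(%d,%d) ", P.first, P.second); // (4,1) (5,3) (6,0) (7,2)
  std::printf("\n");
}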
+ SmallVector<SmallVector<int>> ReconstructMasks; + for (auto Mask : OrigReconstructMasks) { + SmallVector<int> ReconstructMask; + for (int M : Mask) { + auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { + auto It = find_if(V, [M](auto A) { return A.second == M; }); + assert(It != V.end() && "Expected all entries in Mask"); + return std::distance(V.begin(), It); + }; + if (M < 0) + ReconstructMask.push_back(-1); + else if (M < static_cast<int>(NumElts)) { + ReconstructMask.push_back(FindIndex(V1, M)); + } else { + ReconstructMask.push_back(NumElts + FindIndex(V2, M)); + } + } + ReconstructMasks.push_back(std::move(ReconstructMask)); + } + // Calculate the masks needed for the new input shuffles, which get padded // with undef SmallVector<int> V1A, V1B, V2A, V2B; for (unsigned I = 0; I < V1.size(); I++) { - V1A.push_back(SVI0A->getMaskValue(V1[I])); - V1B.push_back(SVI0B->getMaskValue(V1[I])); + V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); + V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); } for (unsigned I = 0; I < V2.size(); I++) { - V2A.push_back(SVI1A->getMaskValue(V2[I])); - V2B.push_back(SVI1B->getMaskValue(V2[I])); + V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); + V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); } while (V1A.size() < NumElts) { V1A.push_back(UndefMaskElem); @@ -1371,9 +1462,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(UndefMaskElem); } - auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { - return C + - TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) + ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + VT, SV->getShuffleMask()); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); @@ -1386,9 +1482,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - // This set helps us only cost each unique shuffle once. - SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles( - {SVI0A, SVI0B, SVI1A, SVI1B}); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), InstructionCost(0), AddShuffleCost); @@ -1408,22 +1501,35 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), InstructionCost(0), AddShuffleMaskCost); + LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); + LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore + << " vs CostAfter: " << CostAfter << "\n"); if (CostBefore <= CostAfter) return false; // The cost model has passed, create the new instructions. 
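// Sketch of the remapping performed above, continuing the sorted example from
// the previous sketch and using invented values: the original reconstruct
// masks refer to inputs by the slot they were given before sorting
// (pair.second), so after sorting each entry is replaced by the new position
// of the pair carrying that slot. findIndex is an invented helper mirroring
// the FindIndex lambda.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

static int findIndex(const std::vector<std::pair<int, int>> &V, int OldSlot) {
  for (std::size_t I = 0; I < V.size(); ++I)
    if (V[I].second == OldSlot)
      return static_cast<int>(I);
  assert(false && "Expected all entries in Mask");
  return -1;
}

int main() {
  // (sourceLane, slot before sorting), already stable-sorted by source lane.
  std::vector<std::pair<int, int>> V1 = {{4, 1}, {5, 3}, {6, 0}, {7, 2}};

  std::vector<int> OrigReconstruct = {0, 1, 2, 3}; // old slots, in lane order
  std::vector<int> Reconstruct;
  for (int M : OrigReconstruct)
    Reconstruct.push_back(findIndex(V1, M));
  // Old slot 0 now lives at position 2, slot 1 at 0, slot 2 at 3, slot 3 at 1.
  assert((Reconstruct == std::vector<int>{2, 0, 3, 1}));
  return 0;
}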
-  Builder.SetInsertPoint(SVI0A);
-  Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
-                                             SVI0A->getOperand(1), V1A);
-  Builder.SetInsertPoint(SVI0B);
-  Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
-                                             SVI0B->getOperand(1), V1B);
-  Builder.SetInsertPoint(SVI1A);
-  Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
-                                             SVI1A->getOperand(1), V2A);
-  Builder.SetInsertPoint(SVI1B);
-  Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
-                                             SVI1B->getOperand(1), V2B);
+  auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
+    auto *SV = dyn_cast<ShuffleVectorInst>(I);
+    if (!SV)
+      return I;
+    if (isa<UndefValue>(SV->getOperand(1)))
+      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
+        if (InputShuffles.contains(SSV))
+          return SSV->getOperand(Op);
+    return SV->getOperand(Op);
+  };
+  Builder.SetInsertPoint(SVI0A->getNextNode());
+  Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
+                                             GetShuffleOperand(SVI0A, 1), V1A);
+  Builder.SetInsertPoint(SVI0B->getNextNode());
+  Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
+                                             GetShuffleOperand(SVI0B, 1), V1B);
+  Builder.SetInsertPoint(SVI1A->getNextNode());
+  Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
+                                             GetShuffleOperand(SVI1A, 1), V2A);
+  Builder.SetInsertPoint(SVI1B->getNextNode());
+  Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
+                                             GetShuffleOperand(SVI1B, 1), V2B);
   Builder.SetInsertPoint(Op0);
   Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
                                     NSV0A, NSV0B);
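// Data-level sketch of the invariant the rewrite above relies on, with
// invented lane values and masks: applying the rebuilt (sorted) input shuffle
// first and the reconstruct mask second selects exactly the same lanes as the
// original output shuffle, because the two masks compose to the original one.
// applyMask is an invented helper that models shufflevector on one input.
#include <vector>

static std::vector<int> applyMask(const std::vector<int> &Src,
                                  const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    Out.push_back(M < 0 ? 0 : Src[M]); // model undef lanes as 0 for the demo
  return Out;
}

int main() {
  std::vector<int> A = {10, 11, 12, 13};

  // Original output shuffle reads lanes 2, 0, 3, 0 of A.
  std::vector<int> OrigMask = {2, 0, 3, 0};
  std::vector<int> Direct = applyMask(A, OrigMask); // {12, 10, 13, 10}

  // Rewritten form: gather the demanded lanes in ascending order, then undo
  // the reordering with a reconstruct mask; the composition equals OrigMask.
  std::vector<int> SortedInput = {0, 2, 3, -1};
  std::vector<int> Reconstruct = {1, 0, 2, 0};
  std::vector<int> TwoStep = applyMask(applyMask(A, SortedInput), Reconstruct);

  return Direct == TwoStep ? 0 : 1; // returns 0: the two forms agree
}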