author    | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit    | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/CodeGen/CodeGenPrepare.cpp
parent    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/CodeGen/CodeGenPrepare.cpp | 915
1 file changed, 668 insertions, 247 deletions
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index f05afd058746..e8b8e6c93cf0 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -43,7 +43,6 @@ #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -61,7 +60,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" @@ -178,6 +176,17 @@ static cl::opt<bool> ProfileGuidedSectionPrefix( "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("Use profile info to add section prefix for hot/cold functions")); +static cl::opt<bool> ProfileUnknownInSpecialSection( + "profile-unknown-in-special-section", cl::Hidden, cl::init(false), + cl::ZeroOrMore, + cl::desc("In profiling mode like sampleFDO, if a function doesn't have " + "profile, we cannot tell the function is cold for sure because " + "it may be a function newly added without ever being sampled. " + "With the flag enabled, compiler can put such profile unknown " + "functions into a special section, so runtime system can choose " + "to handle it in a different way than .text section, to save " + "RAM for example. ")); + static cl::opt<unsigned> FreqRatioToSkipMerge( "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), cl::desc("Skip merging empty blocks if (frequency of empty block) / " @@ -230,6 +239,15 @@ static cl::opt<bool> EnableICMP_EQToICMP_ST( "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false), cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion.")); +static cl::opt<bool> + VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false), + cl::desc("Enable BFI update verification for " + "CodeGenPrepare.")); + +static cl::opt<bool> OptimizePhiTypes( + "cgp-optimize-phi-types", cl::Hidden, cl::init(false), + cl::desc("Enable converting phi types in CodeGenPrepare")); + namespace { enum ExtType { @@ -327,6 +345,7 @@ class TypePromotionTransaction; // FIXME: When we can selectively preserve passes, preserve the domtree. 
AU.addRequired<ProfileSummaryInfoWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); } @@ -368,12 +387,14 @@ class TypePromotionTransaction; bool optimizeInst(Instruction *I, bool &ModifiedDT); bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); + bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *Load); bool optimizeShiftInst(BinaryOperator *BO); + bool optimizeFunnelShift(IntrinsicInst *Fsh); bool optimizeSelectInst(SelectInst *SI); bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI); bool optimizeSwitchInst(SwitchInst *SI); @@ -389,20 +410,25 @@ class TypePromotionTransaction; unsigned CreatedInstsCost = 0); bool mergeSExts(Function &F); bool splitLargeGEPOffsets(); + bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited, + SmallPtrSetImpl<Instruction *> &DeletedInstrs); + bool optimizePhiTypes(Function &F); bool performAddressTypePromotion( Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, bool HasPromoted, TypePromotionTransaction &TPT, SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); bool splitBranchCondition(Function &F, bool &ModifiedDT); - bool simplifyOffsetableRelocate(Instruction &I); + bool simplifyOffsetableRelocate(GCStatepointInst &I); bool tryToSinkFreeOperands(Instruction *I); - bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp, + bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, + Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID); bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT); bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT); bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT); + void verifyBFIUpdates(Function &F); }; } // end anonymous namespace @@ -428,12 +454,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { InsertedInsts.clear(); PromotedInsts.clear(); - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { - TM = &TPC->getTM<TargetMachine>(); - SubtargetInfo = TM->getSubtargetImpl(F); - TLI = SubtargetInfo->getTargetLowering(); - TRI = SubtargetInfo->getRegisterInfo(); - } + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + SubtargetInfo = TM->getSubtargetImpl(F); + TLI = SubtargetInfo->getTargetLowering(); + TRI = SubtargetInfo->getRegisterInfo(); TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -446,14 +470,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { F.setSectionPrefix(".hot"); else if (PSI->isFunctionColdInCallGraph(&F, *BFI)) F.setSectionPrefix(".unlikely"); + else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() && + PSI->isFunctionHotnessUnknown(F)) + F.setSectionPrefix(".unknown"); } /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. 
- if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI && - TLI->isSlowDivBypassed()) { + if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) { const DenseMap<unsigned int, unsigned int> &BypassWidths = - TLI->getBypassSlowDivWidths(); + TLI->getBypassSlowDivWidths(); BasicBlock* BB = &*F.begin(); while (BB != nullptr) { // bypassSlowDivision may create new BBs, but we don't want to reapply the @@ -495,6 +521,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { MadeChange |= mergeSExts(F); if (!LargeOffsetGEPMap.empty()) MadeChange |= splitLargeGEPOffsets(); + MadeChange |= optimizePhiTypes(F); + + if (MadeChange) + eliminateFallThrough(F); // Really free removed instructions during promotion. for (Instruction *I : RemovedInsts) @@ -550,11 +580,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { } if (!DisableGCOpts) { - SmallVector<Instruction *, 2> Statepoints; + SmallVector<GCStatepointInst *, 2> Statepoints; for (BasicBlock &BB : F) for (Instruction &I : BB) - if (isStatepoint(I)) - Statepoints.push_back(&I); + if (auto *SP = dyn_cast<GCStatepointInst>(&I)) + Statepoints.push_back(SP); for (auto &I : Statepoints) EverMadeChange |= simplifyOffsetableRelocate(*I); } @@ -563,9 +593,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // preparatory transforms. EverMadeChange |= placeDbgValues(F); +#ifndef NDEBUG + if (VerifyBFIUpdates) + verifyBFIUpdates(F); +#endif + return EverMadeChange; } +// Verify BFI has been updated correctly by recomputing BFI and comparing them. +void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) { + DominatorTree NewDT(F); + LoopInfo NewLI(NewDT); + BranchProbabilityInfo NewBPI(F, NewLI, TLInfo); + BlockFrequencyInfo NewBFI(F, NewBPI, NewLI); + NewBFI.verifyMatch(*BFI); +} + /// Merge basic blocks which are connected by a single edge, where one of the /// basic blocks has a single successor pointing to the other basic block, /// which has a single predecessor. @@ -749,7 +793,7 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, BlockFrequency PredFreq = BFI->getBlockFreq(Pred); BlockFrequency BBFreq = BFI->getBlockFreq(BB); - for (auto SameValueBB : SameIncomingValueBBs) + for (auto *SameValueBB : SameIncomingValueBBs) if (SameValueBB->getUniquePredecessor() == Pred && DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB)) BBFreq += BFI->getBlockFreq(SameValueBB); @@ -925,7 +969,7 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, SmallVectorImpl<Value *> &OffsetV) { for (unsigned i = 1; i < GEP->getNumOperands(); i++) { // Only accept small constant integer operands - auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); + auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (!Op || Op->getZExtValue() > 20) return false; } @@ -949,7 +993,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, // be skipped by optimization and we do not care about them. 
for (auto R = RelocatedBase->getParent()->getFirstInsertionPt(); &*R != RelocatedBase; ++R) - if (auto RI = dyn_cast<GCRelocateInst>(R)) + if (auto *RI = dyn_cast<GCRelocateInst>(R)) if (RI->getStatepoint() == RelocatedBase->getStatepoint()) if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) { RelocatedBase->moveBefore(RI); @@ -973,7 +1017,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, } Value *Base = ToReplace->getBasePtr(); - auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); + auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; @@ -1050,10 +1094,9 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, // %base' = gc.relocate(%tok, i32 4, i32 4) // %ptr' = gep %base' + 15 // %val = load %ptr' -bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { +bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) { bool MadeChange = false; SmallVector<GCRelocateInst *, 2> AllRelocateCalls; - for (auto *U : I.users()) if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U)) // Collect all the relocate calls associated with a statepoint @@ -1187,6 +1230,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, } bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, + Value *Arg0, Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID) { if (BO->getParent() != Cmp->getParent()) { @@ -1204,8 +1248,6 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, } // We allow matching the canonical IR (add X, C) back to (usubo X, -C). - Value *Arg0 = BO->getOperand(0); - Value *Arg1 = BO->getOperand(1); if (BO->getOpcode() == Instruction::Add && IID == Intrinsic::usub_with_overflow) { assert(isa<Constant>(Arg1) && "Unexpected input for usubo"); @@ -1215,7 +1257,9 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, // Insert at the first instruction of the pair. Instruction *InsertPt = nullptr; for (Instruction &Iter : *Cmp->getParent()) { - if (&Iter == BO || &Iter == Cmp) { + // If BO is an XOR, it is not guaranteed that it comes after both inputs to + // the overflow intrinsic are defined. + if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) { InsertPt = &Iter; break; } @@ -1224,12 +1268,16 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, IRBuilder<> Builder(InsertPt); Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1); - Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + if (BO->getOpcode() != Instruction::Xor) { + Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + BO->replaceAllUsesWith(Math); + } else + assert(BO->hasOneUse() && + "Patterns with XOr should use the BO only in the compare"); Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov"); - BO->replaceAllUsesWith(Math); Cmp->replaceAllUsesWith(OV); - BO->eraseFromParent(); Cmp->eraseFromParent(); + BO->eraseFromParent(); return true; } @@ -1269,12 +1317,17 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT) { Value *A, *B; BinaryOperator *Add; - if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) + if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) { if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add)) return false; + // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases. 
+ A = Add->getOperand(0); + B = Add->getOperand(1); + } if (!TLI->shouldFormOverflowOp(ISD::UADDO, - TLI->getValueType(*DL, Add->getType()))) + TLI->getValueType(*DL, Add->getType()), + Add->hasNUsesOrMore(2))) return false; // We don't want to move around uses of condition values this late, so we @@ -1283,7 +1336,8 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse()) return false; - if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp, + Intrinsic::uadd_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. @@ -1341,10 +1395,12 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, return false; if (!TLI->shouldFormOverflowOp(ISD::USUBO, - TLI->getValueType(*DL, Sub->getType()))) + TLI->getValueType(*DL, Sub->getType()), + Sub->hasNUsesOrMore(2))) return false; - if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1), + Cmp, Intrinsic::usub_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. @@ -1813,9 +1869,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, const TargetLowering *TLI, const DataLayout *DL, bool &ModifiedDT) { - if (!TLI || !DL) - return false; - // If a zero input is undefined, it doesn't make sense to despeculate that. if (match(CountZeros->getOperand(1), m_One())) return false; @@ -1877,7 +1930,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. - if (TLI && isa<InlineAsm>(CI->getCalledValue())) { + if (CI->isInlineAsm()) { if (TLI->ExpandInlineAsm(CI)) { // Avoid invalidating the iterator. CurInstIterator = BB->begin(); @@ -1894,7 +1947,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // Align the pointer arguments to this call if the target thinks it's a good // idea unsigned MinSize, PrefAlign; - if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { for (auto &Arg : CI->arg_operands()) { // We want to align both objects whose address is used directly and // objects whose address is used in casts and GEPs, though it only makes @@ -1912,7 +1965,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { AllocaInst *AI; if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign && DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) - AI->setAlignment(MaybeAlign(PrefAlign)); + AI->setAlignment(Align(PrefAlign)); // Global variables can only be aligned if they are defined in this // object (i.e. 
they are uniquely initialized in this object), and // over-aligning global variables that have an explicit section is @@ -1927,12 +1980,14 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // If this is a memcpy (or similar) then we may be able to improve the // alignment if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) { - unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL); - if (DestAlign > MI->getDestAlignment()) + Align DestAlign = getKnownAlignment(MI->getDest(), *DL); + MaybeAlign MIDestAlign = MI->getDestAlign(); + if (!MIDestAlign || DestAlign > *MIDestAlign) MI->setDestAlignment(DestAlign); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { - unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL); - if (SrcAlign > MTI->getSourceAlignment()) + MaybeAlign MTISrcAlign = MTI->getSourceAlign(); + Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL); + if (!MTISrcAlign || SrcAlign > *MTISrcAlign) MTI->setSourceAlignment(SrcAlign); } } @@ -1942,8 +1997,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // cold block. This interacts with our handling for loads and stores to // ensure that we can fold all uses of a potential addressing computation // into their uses. TODO: generalize this to work over profiling data - bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get()); - if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) + if (CI->hasFnAttr(Attribute::Cold) && + !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) for (auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; @@ -1955,10 +2010,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { if (II) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::assume: { + II->eraseFromParent(); + return true; + } + case Intrinsic::experimental_widenable_condition: { // Give up on future widening oppurtunties so that we can fold away dead // paths and merge blocks before going into block-local instruction - // selection. + // selection. if (II->use_empty()) { II->eraseFromParent(); return true; @@ -2008,21 +2068,43 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. return despeculateCountZeros(II, TLI, DL, ModifiedDT); + case Intrinsic::fshl: + case Intrinsic::fshr: + return optimizeFunnelShift(II); case Intrinsic::dbg_value: return fixupDbgValue(II); + case Intrinsic::vscale: { + // If datalayout has no special restrictions on vector data layout, + // replace `llvm.vscale` by an equivalent constant expression + // to benefit from cheap constant propagation. 
+ Type *ScalableVectorTy = + VectorType::get(Type::getInt8Ty(II->getContext()), 1, true); + if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) { + auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo()); + auto *One = ConstantInt::getSigned(II->getType(), 1); + auto *CGep = + ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One); + II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType())); + II->eraseFromParent(); + return true; + } + break; } - - if (TLI) { - SmallVector<Value*, 2> PtrOps; - Type *AccessTy; - if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) { - Value *PtrVal = PtrOps.pop_back_val(); - unsigned AS = PtrVal->getType()->getPointerAddressSpace(); - if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) - return true; - } + case Intrinsic::masked_gather: + return optimizeGatherScatterInst(II, II->getArgOperand(0)); + case Intrinsic::masked_scatter: + return optimizeGatherScatterInst(II, II->getArgOperand(1)); } + + SmallVector<Value *, 2> PtrOps; + Type *AccessTy; + if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) { + Value *PtrVal = PtrOps.pop_back_val(); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) + return true; + } } // From here on out we're working with named functions. @@ -2033,7 +2115,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // to fortified library functions (e.g. __memcpy_chk) that have the default // "don't know" as the objectsize. Anything else should be left alone. FortifiedLibCallSimplifier Simplifier(TLInfo, true); - if (Value *V = Simplifier.optimizeCall(CI)) { + IRBuilder<> Builder(CI); + if (Value *V = Simplifier.optimizeCall(CI, Builder)) { CI->replaceAllUsesWith(V); CI->eraseFromParent(); return true; @@ -2073,14 +2156,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { /// ret i32 %tmp2 /// @endcode bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) { - if (!TLI) - return false; - ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator()); if (!RetI) return false; PHINode *PN = nullptr; + ExtractValueInst *EVI = nullptr; BitCastInst *BCI = nullptr; Value *V = RetI->getReturnValue(); if (V) { @@ -2088,6 +2169,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT if (BCI) V = BCI->getOperand(0); + EVI = dyn_cast<ExtractValueInst>(V); + if (EVI) { + V = EVI->getOperand(0); + if (!std::all_of(EVI->idx_begin(), EVI->idx_end(), + [](unsigned idx) { return idx == 0; })) + return false; + } + PN = dyn_cast<PHINode>(V); if (!PN) return false; @@ -2101,7 +2190,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT if (PN) { BasicBlock::iterator BI = BB->begin(); // Skip over debug and the bitcast. - do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI); + do { + ++BI; + } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI); if (&*BI != RetI) return false; } else { @@ -2157,6 +2248,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT // Duplicate the return into TailCallBB. 
(void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); + assert(!VerifyBFIUpdates || + BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB)); + BFI->setBlockFreq( + BB, + (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency()); ModifiedDT = Changed = true; ++NumRetsDup; } @@ -2354,6 +2450,9 @@ namespace { /// This class provides transaction based operation on the IR. /// Every change made through this class is recorded in the internal state and /// can be undone (rollback) until commit is called. +/// CGP does not check if instructions could be speculatively executed when +/// moved. Preserving the original location would pessimize the debugging +/// experience, as well as negatively impact the quality of sample PGO. class TypePromotionTransaction { /// This represents the common interface of the individual transaction. /// Each class implements the logic for doing one specific modification on @@ -2516,6 +2615,7 @@ class TypePromotionTransaction { /// trunc Opnd to Ty. TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { IRBuilder<> Builder(Opnd); + Builder.SetCurrentDebugLocation(DebugLoc()); Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); } @@ -2568,6 +2668,7 @@ class TypePromotionTransaction { ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) : TypePromotionAction(InsertPt) { IRBuilder<> Builder(InsertPt); + Builder.SetCurrentDebugLocation(DebugLoc()); Val = Builder.CreateZExt(Opnd, Ty, "promoted"); LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); } @@ -2721,8 +2822,9 @@ public: TypePromotionTransaction(SetOfInstrs &RemovedInsts) : RemovedInsts(RemovedInsts) {} - /// Advocate every changes made in that transaction. - void commit(); + /// Advocate every changes made in that transaction. Return true if any change + /// happen. + bool commit(); /// Undo all the changes made after the given point. void rollback(ConstRestorationPt Point); @@ -2828,11 +2930,13 @@ TypePromotionTransaction::getRestorationPoint() const { return !Actions.empty() ? Actions.back().get() : nullptr; } -void TypePromotionTransaction::commit() { +bool TypePromotionTransaction::commit() { for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; ++It) (*It)->commit(); + bool Modified = !Actions.empty(); Actions.clear(); + return Modified; } void TypePromotionTransaction::rollback( @@ -3115,7 +3219,7 @@ public: SmallPtrSet<Value *, 32> Visited; WorkList.push_back(Val); while (!WorkList.empty()) { - auto P = WorkList.pop_back_val(); + auto *P = WorkList.pop_back_val(); if (!Visited.insert(P).second) continue; if (auto *PI = dyn_cast<Instruction>(P)) @@ -3164,13 +3268,13 @@ public: void destroyNewNodes(Type *CommonType) { // For safe erasing, replace the uses with dummy value first. - auto Dummy = UndefValue::get(CommonType); - for (auto I : AllPhiNodes) { + auto *Dummy = UndefValue::get(CommonType); + for (auto *I : AllPhiNodes) { I->replaceAllUsesWith(Dummy); I->eraseFromParent(); } AllPhiNodes.clear(); - for (auto I : AllSelectNodes) { + for (auto *I : AllSelectNodes) { I->replaceAllUsesWith(Dummy); I->eraseFromParent(); } @@ -3511,7 +3615,7 @@ private: // Must be a Phi node then. auto *PHI = cast<PHINode>(V); // Fill the Phi node with values from predecessors. 
- for (auto B : predecessors(PHI->getParent())) { + for (auto *B : predecessors(PHI->getParent())) { Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B); assert(Map.find(PV) != Map.end() && "No predecessor Value!"); PHI->addIncoming(ST.Get(Map[PV]), B); @@ -3625,10 +3729,11 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, // X*Scale + C*Scale to addr mode. ConstantInt *CI = nullptr; Value *AddLHS = nullptr; if (isa<Instruction>(ScaleReg) && // not a constant expr. - match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { + match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) && + CI->getValue().isSignedIntN(64)) { TestAddrMode.InBounds = false; TestAddrMode.ScaledReg = AddLHS; - TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; + TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale; // If this addressing mode is legal, commit it and remember that we folded // this instruction. @@ -3849,7 +3954,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // We can get through binary operator, if it is legal. In other words, the // binary operator must have a nuw or nsw flag. const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - if (BinOp && isa<OverflowingBinaryOperator>(BinOp) && + if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) && ((!IsSExt && BinOp->hasNoUnsignedWrap()) || (IsSExt && BinOp->hasNoSignedWrap()))) return true; @@ -4251,15 +4356,20 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); ConstantOffset += SL->getElementOffset(Idx); } else { - uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - const APInt &CVal = CI->getValue(); - if (CVal.getMinSignedBits() <= 64) { - ConstantOffset += CVal.getSExtValue() * TypeSize; - continue; + TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType()); + if (TS.isNonZero()) { + // The optimisations below currently only work for fixed offsets. + if (TS.isScalable()) + return false; + int64_t TypeSize = TS.getFixedSize(); + if (ConstantInt *CI = + dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { + const APInt &CVal = CI->getValue(); + if (CVal.getMinSignedBits() <= 64) { + ConstantOffset += CVal.getSExtValue() * TypeSize; + continue; + } } - } - if (TypeSize) { // Scales of zero don't do anything. // We only allow one variable index at the moment. if (VariableOperand != -1) return false; @@ -4422,11 +4532,13 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { - // Fold in immediates if legal for the target. - AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) - return true; - AddrMode.BaseOffs -= CI->getSExtValue(); + if (CI->getValue().isSignedIntN(64)) { + // Fold in immediates if legal for the target. + AddrMode.BaseOffs += CI->getSExtValue(); + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) + return true; + AddrMode.BaseOffs -= CI->getSExtValue(); + } } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { // If this is a global variable, try to fold it into the addressing mode. 
if (!AddrMode.BaseGV) { @@ -4502,8 +4614,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetRegisterInfo &TRI) { const Function *F = CI->getFunction(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, - ImmutableCallSite(CI)); + TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -4581,14 +4692,16 @@ static bool FindAllMemoryUses( } if (CallInst *CI = dyn_cast<CallInst>(UserI)) { - // If this is a cold call, we can sink the addressing calculation into - // the cold path. See optimizeCallInst - bool OptForSize = OptSize || + if (CI->hasFnAttr(Attribute::Cold)) { + // If this is a cold call, we can sink the addressing calculation into + // the cold path. See optimizeCallInst + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); - if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) - continue; + if (!OptForSize) + continue; + } - InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand()); if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. @@ -4854,7 +4967,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, TPT.rollback(LastKnownGood); return false; } - TPT.commit(); + bool Modified = TPT.commit(); // Get the combined AddrMode (or the only AddrMode, if we only had one). ExtAddrMode AddrMode = AddrModes.getAddrMode(); @@ -4868,7 +4981,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, })) { LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); - return false; + return Modified; } // Insert this computation right after this user. Since our caller is @@ -4891,7 +5004,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && - TM && SubtargetInfo->addrSinkUsingGEPs())) { + SubtargetInfo->addrSinkUsingGEPs())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode @@ -4909,7 +5022,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // We can't add more than one pointer together, nor can we scale a // pointer (both of which seem meaningless). 
if (ResultPtr || AddrMode.Scale != 1) - return false; + return Modified; ResultPtr = AddrMode.ScaledReg; AddrMode.Scale = 0; @@ -4926,12 +5039,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *ScaledRegTy = AddrMode.ScaledReg->getType(); if (cast<IntegerType>(IntPtrTy)->getBitWidth() > cast<IntegerType>(ScaledRegTy)->getBitWidth()) - return false; + return Modified; } if (AddrMode.BaseGV) { if (ResultPtr) - return false; + return Modified; ResultPtr = AddrMode.BaseGV; } @@ -4955,7 +5068,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { SunkAddr = Constant::getNullValue(Addr->getType()); } else if (!ResultPtr) { - return false; + return Modified; } else { Type *I8PtrTy = Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); @@ -5040,7 +5153,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || (AddrMode.BaseGV && DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) - return false; + return Modified; LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); @@ -5080,7 +5193,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Instruction *I = dyn_cast_or_null<Instruction>(Result); if (I && (Result != AddrMode.BaseReg)) I->eraseFromParent(); - return false; + return Modified; } if (AddrMode.Scale != 1) V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), @@ -5142,6 +5255,119 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return true; } +/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find +/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can +/// only handle a 2 operand GEP in the same basic block or a splat constant +/// vector. The 2 operands to the GEP must have a scalar pointer and a vector +/// index. +/// +/// If the existing GEP has a vector base pointer that is splat, we can look +/// through the splat to find the scalar pointer. If we can't find a scalar +/// pointer there's nothing we can do. +/// +/// If we have a GEP with more than 2 indices where the middle indices are all +/// zeroes, we can replace it with 2 GEPs where the second has 2 operands. +/// +/// If the final index isn't a vector or is a splat, we can emit a scalar GEP +/// followed by a GEP with an all zeroes vector index. This will enable +/// SelectionDAGBuilder to use a the scalar GEP as the uniform base and have a +/// zero index. +bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, + Value *Ptr) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP || !GEP->hasIndices()) + return false; + + // If the GEP and the gather/scatter aren't in the same BB, don't optimize. + // FIXME: We should support this by sinking the GEP. + if (MemoryInst->getParent() != GEP->getParent()) + return false; + + SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end()); + + bool RewriteGEP = false; + + if (Ops[0]->getType()->isVectorTy()) { + Ops[0] = const_cast<Value *>(getSplatValue(Ops[0])); + if (!Ops[0]) + return false; + RewriteGEP = true; + } + + unsigned FinalIndex = Ops.size() - 1; + + // Ensure all but the last index is 0. + // FIXME: This isn't strictly required. All that's required is that they are + // all scalars or splats. 
+ for (unsigned i = 1; i < FinalIndex; ++i) { + auto *C = dyn_cast<Constant>(Ops[i]); + if (!C) + return false; + if (isa<VectorType>(C->getType())) + C = C->getSplatValue(); + auto *CI = dyn_cast_or_null<ConstantInt>(C); + if (!CI || !CI->isZero()) + return false; + // Scalarize the index if needed. + Ops[i] = CI; + } + + // Try to scalarize the final index. + if (Ops[FinalIndex]->getType()->isVectorTy()) { + if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) { + auto *C = dyn_cast<ConstantInt>(V); + // Don't scalarize all zeros vector. + if (!C || !C->isZero()) { + Ops[FinalIndex] = V; + RewriteGEP = true; + } + } + } + + // If we made any changes or the we have extra operands, we need to generate + // new instructions. + if (!RewriteGEP && Ops.size() == 2) + return false; + + unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements(); + + IRBuilder<> Builder(MemoryInst); + + Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType()); + + Value *NewAddr; + + // If the final index isn't a vector, emit a scalar GEP containing all ops + // and a vector GEP with all zeroes final index. + if (!Ops[FinalIndex]->getType()->isVectorTy()) { + NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front()); + auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts); + NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy)); + } else { + Value *Base = Ops[0]; + Value *Index = Ops[FinalIndex]; + + // Create a scalar GEP if there are more than 2 operands. + if (Ops.size() != 2) { + // Replace the last index with 0. + Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy); + Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front()); + } + + // Now create the GEP with scalar pointer and vector index. + NewAddr = Builder.CreateGEP(Base, Index); + } + + MemoryInst->replaceUsesOfWith(Ptr, NewAddr); + + // If we have no uses, recursively delete the value and all dead instructions + // using it. + if (Ptr->use_empty()) + RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo); + + return true; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { @@ -5150,7 +5376,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { const TargetRegisterInfo *TRI = TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI->ParseConstraints(*DL, TRI, CS); + TLI->ParseConstraints(*DL, TRI, *CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -5231,7 +5457,7 @@ bool CodeGenPrepare::tryToPromoteExts( bool Promoted = false; // Iterate over all the extensions to try to promote them. - for (auto I : Exts) { + for (auto *I : Exts) { // Early check if we directly have ext(load). if (isa<LoadInst>(I->getOperand(0))) { ProfitablyMovedExts.push_back(I); @@ -5242,7 +5468,7 @@ bool CodeGenPrepare::tryToPromoteExts( // this check inside the for loop is to catch the case where an extension // is directly fed by a load because in such case the extension can be moved // up without any promotion on its operands. - if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) + if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion) return false; // Get the action to perform the promotion. 
@@ -5292,7 +5518,7 @@ bool CodeGenPrepare::tryToPromoteExts( SmallVector<Instruction *, 2> NewlyMovedExts; (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); bool NewPromoted = false; - for (auto ExtInst : NewlyMovedExts) { + for (auto *ExtInst : NewlyMovedExts) { Instruction *MovedExt = cast<Instruction>(ExtInst); Value *ExtOperand = MovedExt->getOperand(0); // If we have reached to a load, we need this extra profitability check @@ -5358,9 +5584,9 @@ bool CodeGenPrepare::mergeSExts(Function &F) { return Changed; } -// Spliting large data structures so that the GEPs accessing them can have +// Splitting large data structures so that the GEPs accessing them can have // smaller offsets so that they can be sunk to the same blocks as their users. -// For example, a large struct starting from %base is splitted into two parts +// For example, a large struct starting from %base is split into two parts // where the second part starts from %new_base. // // Before: @@ -5421,7 +5647,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { int64_t BaseOffset = LargeOffsetGEPs.begin()->second; Value *NewBaseGEP = nullptr; - auto LargeOffsetGEP = LargeOffsetGEPs.begin(); + auto *LargeOffsetGEP = LargeOffsetGEPs.begin(); while (LargeOffsetGEP != LargeOffsetGEPs.end()) { GetElementPtrInst *GEP = LargeOffsetGEP->first; int64_t Offset = LargeOffsetGEP->second; @@ -5435,7 +5661,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { GEP->getAddressSpace())) { // We need to create a new base if the offset to the current base is // too large to fit into the addressing mode. So, a very large struct - // may be splitted into several parts. + // may be split into several parts. BaseGEP = GEP; BaseOffset = Offset; NewBaseGEP = nullptr; @@ -5506,6 +5732,155 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { return Changed; } +bool CodeGenPrepare::optimizePhiType( + PHINode *I, SmallPtrSetImpl<PHINode *> &Visited, + SmallPtrSetImpl<Instruction *> &DeletedInstrs) { + // We are looking for a collection on interconnected phi nodes that together + // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts + // are of the same type. Convert the whole set of nodes to the type of the + // bitcast. 
+ Type *PhiTy = I->getType(); + Type *ConvertTy = nullptr; + if (Visited.count(I) || + (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy())) + return false; + + SmallVector<Instruction *, 4> Worklist; + Worklist.push_back(cast<Instruction>(I)); + SmallPtrSet<PHINode *, 4> PhiNodes; + PhiNodes.insert(I); + Visited.insert(I); + SmallPtrSet<Instruction *, 4> Defs; + SmallPtrSet<Instruction *, 4> Uses; + + while (!Worklist.empty()) { + Instruction *II = Worklist.pop_back_val(); + + if (auto *Phi = dyn_cast<PHINode>(II)) { + // Handle Defs, which might also be PHI's + for (Value *V : Phi->incoming_values()) { + if (auto *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi)) { + if (Visited.count(OpPhi)) + return false; + PhiNodes.insert(OpPhi); + Visited.insert(OpPhi); + Worklist.push_back(OpPhi); + } + } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) { + if (!Defs.count(OpLoad)) { + Defs.insert(OpLoad); + Worklist.push_back(OpLoad); + } + } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) { + if (!Defs.count(OpEx)) { + Defs.insert(OpEx); + Worklist.push_back(OpEx); + } + } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { + if (!ConvertTy) + ConvertTy = OpBC->getOperand(0)->getType(); + if (OpBC->getOperand(0)->getType() != ConvertTy) + return false; + if (!Defs.count(OpBC)) { + Defs.insert(OpBC); + Worklist.push_back(OpBC); + } + } else if (!isa<UndefValue>(V)) + return false; + } + } + + // Handle uses which might also be phi's + for (User *V : II->users()) { + if (auto *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi)) { + if (Visited.count(OpPhi)) + return false; + PhiNodes.insert(OpPhi); + Visited.insert(OpPhi); + Worklist.push_back(OpPhi); + } + } else if (auto *OpStore = dyn_cast<StoreInst>(V)) { + if (OpStore->getOperand(0) != II) + return false; + Uses.insert(OpStore); + } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { + if (!ConvertTy) + ConvertTy = OpBC->getType(); + if (OpBC->getType() != ConvertTy) + return false; + Uses.insert(OpBC); + } else + return false; + } + } + + if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy)) + return false; + + LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to " + << *ConvertTy << "\n"); + + // Create all the new phi nodes of the new type, and bitcast any loads to the + // correct type. + ValueToValueMap ValMap; + ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy); + for (Instruction *D : Defs) { + if (isa<BitCastInst>(D)) + ValMap[D] = D->getOperand(0); + else + ValMap[D] = + new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode()); + } + for (PHINode *Phi : PhiNodes) + ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(), + Phi->getName() + ".tc", Phi); + // Pipe together all the PhiNodes. + for (PHINode *Phi : PhiNodes) { + PHINode *NewPhi = cast<PHINode>(ValMap[Phi]); + for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++) + NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)], + Phi->getIncomingBlock(i)); + } + // And finally pipe up the stores and bitcasts + for (Instruction *U : Uses) { + if (isa<BitCastInst>(U)) { + DeletedInstrs.insert(U); + U->replaceAllUsesWith(ValMap[U->getOperand(0)]); + } else + U->setOperand(0, + new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U)); + } + + // Save the removed phis to be deleted later. 
+ for (PHINode *Phi : PhiNodes) + DeletedInstrs.insert(Phi); + return true; +} + +bool CodeGenPrepare::optimizePhiTypes(Function &F) { + if (!OptimizePhiTypes) + return false; + + bool Changed = false; + SmallPtrSet<PHINode *, 4> Visited; + SmallPtrSet<Instruction *, 4> DeletedInstrs; + + // Attempt to optimize all the phis in the functions to the correct type. + for (auto &BB : F) + for (auto &Phi : BB.phis()) + Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs); + + // Remove any old phi's that have been converted. + for (auto *I : DeletedInstrs) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } + + return Changed; +} + /// Return true, if an ext(load) can be formed from an extension in /// \p MovedExts. bool CodeGenPrepare::canFormExtLd( @@ -5567,11 +5942,6 @@ bool CodeGenPrepare::canFormExtLd( /// \p Inst[in/out] the extension may be modified during the process if some /// promotions apply. bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { - // ExtLoad formation and address type promotion infrastructure requires TLI to - // be effective. - if (!TLI) - return false; - bool AllowPromotionWithoutCommonHeader = false; /// See if it is an interesting sext operations for the address type /// promotion before trying to promote it, e.g., the ones with the right @@ -5596,16 +5966,8 @@ bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { assert(LI && ExtFedByLoad && "Expect a valid load and extension"); TPT.commit(); - // Move the extend into the same block as the load + // Move the extend into the same block as the load. ExtFedByLoad->moveAfter(LI); - // CGP does not check if the zext would be speculatively executed when moved - // to the same basic block as the load. Preserving its original location - // would pessimize the debugging experience, as well as negatively impact - // the quality of sample pgo. We don't want to use "line 0" as that has a - // size cost in the line-table section and logically the zext can be seen as - // part of the load. Therefore we conservatively reuse the same debug - // location for the load and the zext. - ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); ++NumExtsMoved; Inst = ExtFedByLoad; return true; @@ -5633,7 +5995,7 @@ bool CodeGenPrepare::performAddressTypePromotion( bool Promoted = false; SmallPtrSet<Instruction *, 1> UnhandledExts; bool AllSeenFirst = true; - for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); DenseMap<Value *, Instruction *>::iterator AlreadySeen = SeenChainsForSExt.find(HeadOfChain); @@ -5651,7 +6013,7 @@ bool CodeGenPrepare::performAddressTypePromotion( TPT.commit(); if (HasPromoted) Promoted = true; - for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); SeenChainsForSExt[HeadOfChain] = nullptr; ValToSExtendedUses[HeadOfChain].push_back(I); @@ -5662,7 +6024,7 @@ bool CodeGenPrepare::performAddressTypePromotion( // This is the first chain visited from the header, keep the current chain // as unhandled. Defer to promote this until we encounter another SExt // chain derived from the same header. 
- for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); SeenChainsForSExt[HeadOfChain] = Inst; } @@ -5670,7 +6032,7 @@ bool CodeGenPrepare::performAddressTypePromotion( } if (!AllSeenFirst && !UnhandledExts.empty()) - for (auto VisitedSExt : UnhandledExts) { + for (auto *VisitedSExt : UnhandledExts) { if (RemovedInsts.count(VisitedSExt)) continue; TypePromotionTransaction TPT(RemovedInsts); @@ -5681,7 +6043,7 @@ bool CodeGenPrepare::performAddressTypePromotion( TPT.commit(); if (HasPromoted) Promoted = true; - for (auto I : Chains) { + for (auto *I : Chains) { Value *HeadOfChain = I->getOperand(0); // Mark this as handled. SeenChainsForSExt[HeadOfChain] = nullptr; @@ -5701,7 +6063,7 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) { return false; // Only do this xform if truncating is free. - if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) + if (!TLI->isTruncateFree(I->getType(), Src->getType())) return false; // Only safe to perform the optimization if the source is also defined in @@ -5947,7 +6309,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { // If it's safe to speculatively execute, then it should not have side // effects; therefore, it's safe to sink and possibly *not* execute. return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && - TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; + TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >= + TargetTransformInfo::TCC_Expensive; } /// Returns true if a SelectInst should be turned into an explicit branch. @@ -6044,13 +6407,47 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { return true; } +bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { + Intrinsic::ID Opcode = Fsh->getIntrinsicID(); + assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) && + "Expected a funnel shift"); + + // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper + // than general vector shifts, and (3) the shift amount is select-of-splatted + // values, hoist the funnel shifts before the select: + // fsh Op0, Op1, (select Cond, TVal, FVal) --> + // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal) + // + // This is inverting a generic IR transform when we know that the cost of a + // general vector shift is more than the cost of 2 shift-by-scalars. + // We can't do this effectively in SDAG because we may not be able to + // determine if the select operands are splats from within a basic block. + Type *Ty = Fsh->getType(); + if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + return false; + Value *Cond, *TVal, *FVal; + if (!match(Fsh->getOperand(2), + m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) + return false; + if (!isSplatValue(TVal) || !isSplatValue(FVal)) + return false; + + IRBuilder<> Builder(Fsh); + Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1); + Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal }); + Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal }); + Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); + Fsh->replaceAllUsesWith(NewSel); + Fsh->eraseFromParent(); + return true; +} + /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // If branch conversion isn't desirable, exit early. 
- if (DisableSelectToBranch || - OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) || - !TLI) + if (DisableSelectToBranch || OptSize || + llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())) return false; // Find all consecutive select instructions that share the same condition. @@ -6103,7 +6500,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // Into: // start: // %cmp = cmp uge i32 %a, %b - // br i1 %cmp, label %select.true, label %select.false + // %cmp.frozen = freeze %cmp + // br i1 %cmp.frozen, label %select.true, label %select.false // select.true: // br label %select.end // select.false: @@ -6111,6 +6509,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // select.end: // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // + // %cmp should be frozen, otherwise it may introduce undefined behavior. // In addition, we may sink instructions that produce %c or %d from // the entry block into the destination(s) of the new branch. // If the true or false blocks do not contain a sunken instruction, that @@ -6122,6 +6521,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); @@ -6188,7 +6588,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { TT = TrueBlock; FT = FalseBlock; } - IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI); + IRBuilder<> IB(SI); + auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); + IB.CreateCondBr(CondFr, TT, FT, SI); SmallPtrSet<const Instruction *, 2> INS; INS.insert(ASI.begin(), ASI.end()); @@ -6215,79 +6617,54 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { return true; } -static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { - SmallVector<int, 16> Mask(SVI->getShuffleMask()); - int SplatElem = -1; - for (unsigned i = 0; i < Mask.size(); ++i) { - if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) - return false; - SplatElem = Mask[i]; - } - - return true; -} - -/// Some targets have expensive vector shifts if the lanes aren't all the same -/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases -/// it's often worth sinking a shufflevector splat down to its use so that -/// codegen can spot all lanes are identical. +/// Some targets only accept certain types for splat inputs. For example a VDUP +/// in MVE takes a GPR (integer) register, and the instruction that incorporate +/// a VDUP (such as a VADD qd, qm, rm) also require a gpr register. bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { - BasicBlock *DefBB = SVI->getParent(); - - // Only do this xform if variable vector shifts are particularly expensive. - if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType())) + if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_ZeroMask()))) return false; - - // We only expect better codegen by sinking a shuffle if we can recognise a - // constant splat. - if (!isBroadcastShuffle(SVI)) + Type *NewType = TLI->shouldConvertSplatType(SVI); + if (!NewType) return false; - // InsertedShuffles - Only insert a shuffle in each block once. 
- DenseMap<BasicBlock*, Instruction*> InsertedShuffles; - - bool MadeChange = false; - for (User *U : SVI->users()) { - Instruction *UI = cast<Instruction>(U); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = UI->getParent(); - if (UserBB == DefBB) continue; - - // For now only apply this when the splat is used by a shift instruction. - if (!UI->isShift()) continue; - - // Everything checks out, sink the shuffle if the user's block doesn't - // already have a copy. - Instruction *&InsertedShuffle = InsertedShuffles[UserBB]; + auto *SVIVecType = cast<FixedVectorType>(SVI->getType()); + assert(!NewType->isVectorTy() && "Expected a scalar type!"); + assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() && + "Expected a type of the same size!"); + auto *NewVecType = + FixedVectorType::get(NewType, SVIVecType->getNumElements()); + + // Create a bitcast (shuffle (insert (bitcast(..)))) + IRBuilder<> Builder(SVI->getContext()); + Builder.SetInsertPoint(SVI); + Value *BC1 = Builder.CreateBitCast( + cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType); + Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1, + (uint64_t)0); + Value *Shuffle = Builder.CreateShuffleVector( + Insert, UndefValue::get(NewVecType), SVI->getShuffleMask()); + Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType); + + SVI->replaceAllUsesWith(BC2); + RecursivelyDeleteTriviallyDeadInstructions(SVI); + + // Also hoist the bitcast up to its operand if it they are not in the same + // block. + if (auto *BCI = dyn_cast<Instruction>(BC1)) + if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0))) + if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) && + !Op->isTerminator() && !Op->isEHPad()) + BCI->moveAfter(Op); - if (!InsertedShuffle) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - assert(InsertPt != UserBB->end()); - InsertedShuffle = - new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), - SVI->getOperand(2), "", &*InsertPt); - InsertedShuffle->setDebugLoc(SVI->getDebugLoc()); - } - - UI->replaceUsesOfWith(SVI, InsertedShuffle); - MadeChange = true; - } - - // If we removed all uses, nuke the shuffle. - if (SVI->use_empty()) { - SVI->eraseFromParent(); - MadeChange = true; - } - - return MadeChange; + return true; } bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // If the operands of I can be folded into a target instruction together with // I, duplicate and sink them. SmallVector<Use *, 4> OpsToSink; - if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink)) + if (!TLI->shouldSinkOperands(I, OpsToSink)) return false; // OpsToSink can contain multiple uses in a use chain (e.g. @@ -6340,9 +6717,6 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { } bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { - if (!TLI || !DL) - return false; - Value *Cond = SI->getCondition(); Type *OldType = Cond->getType(); LLVMContext &Context = Cond->getContext(); @@ -6494,6 +6868,8 @@ class VectorPromoteHelper { uint64_t ScalarCost = TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); uint64_t VectorCost = StoreExtractCombineCost; + enum TargetTransformInfo::TargetCostKind CostKind = + TargetTransformInfo::TCK_RecipThroughput; for (const auto &Inst : InstsToBePromoted) { // Compute the cost. // By construction, all instructions being promoted are arithmetic ones. @@ -6509,8 +6885,9 @@ class VectorPromoteHelper { !IsArg0Constant ? 
TargetTransformInfo::OK_UniformConstantValue : TargetTransformInfo::OK_AnyValue; ScalarCost += TTI.getArithmeticInstrCost( - Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); + Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK); VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, + CostKind, Arg0OVK, Arg1OVK); } LLVM_DEBUG( @@ -6539,19 +6916,23 @@ class VectorPromoteHelper { UseSplat = true; } - unsigned End = getTransitionType()->getVectorNumElements(); + ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount(); if (UseSplat) - return ConstantVector::getSplat(End, Val); - - SmallVector<Constant *, 4> ConstVec; - UndefValue *UndefVal = UndefValue::get(Val->getType()); - for (unsigned Idx = 0; Idx != End; ++Idx) { - if (Idx == ExtractIdx) - ConstVec.push_back(Val); - else - ConstVec.push_back(UndefVal); - } - return ConstantVector::get(ConstVec); + return ConstantVector::getSplat(EC, Val); + + if (!EC.Scalable) { + SmallVector<Constant *, 4> ConstVec; + UndefValue *UndefVal = UndefValue::get(Val->getType()); + for (unsigned Idx = 0; Idx != EC.Min; ++Idx) { + if (Idx == ExtractIdx) + ConstVec.push_back(Val); + else + ConstVec.push_back(UndefVal); + } + return ConstantVector::get(ConstVec); + } else + llvm_unreachable( + "Generate scalable vector for non-splat is unimplemented"); } /// Check if promoting to a vector type an operand at \p OperandIdx @@ -6706,7 +7087,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { /// has this feature and this is profitable. bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) { unsigned CombineCost = std::numeric_limits<unsigned>::max(); - if (DisableStoreExtract || !TLI || + if (DisableStoreExtract || (!StressStoreExtract && !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), Inst->getOperand(1), CombineCost))) @@ -6793,6 +7174,14 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, const TargetLowering &TLI) { // Handle simple but common cases only. Type *StoreType = SI.getValueOperand()->getType(); + + // The code below assumes shifting a value by <number of bits>, + // whereas scalable vectors would have to be shifted by + // <2log(vscale) + number of bits> in order to store the + // low/high parts. Bailing out for now. + if (isa<ScalableVectorType>(StoreType)) + return false; + if (!DL.typeSizeEqualsStoreSize(StoreType) || DL.getTypeSizeInBits(StoreType) == 0) return false; @@ -6856,12 +7245,19 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, Value *Addr = Builder.CreateBitCast( SI.getOperand(1), SplitStoreType->getPointerTo(SI.getPointerAddressSpace())); - if ((IsLE && Upper) || (!IsLE && !Upper)) + Align Alignment = SI.getAlign(); + const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper); + if (IsOffsetStore) { Addr = Builder.CreateGEP( SplitStoreType, Addr, ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1)); - Builder.CreateAlignedStore( - V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment()); + + // When splitting the store in half, naturally one half will retain the + // alignment of the original wider store, regardless of whether it was + // over-aligned or not, while the other will require adjustment. 
+ Alignment = commonAlignment(Alignment, HalfValBitSize / 8); + } + Builder.CreateAlignedStore(V, Addr, Alignment); }; CreateSplitStore(LValue, false); @@ -6950,7 +7346,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, return false; ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1)); // Check that GEPI is a cheap one. - if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType()) + if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency) > TargetTransformInfo::TCC_Basic) return false; Value *GEPIOp = GEPI->getOperand(0); @@ -6999,7 +7396,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, cast<ConstantInt>(UGEPI->getOperand(1))->getType()) return false; ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); - if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType()) + if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency) > TargetTransformInfo::TCC_Basic) return false; UGEPIs.push_back(UGEPI); @@ -7010,7 +7408,9 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, for (GetElementPtrInst *UGEPI : UGEPIs) { ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue(); - unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType()); + unsigned ImmCost = + TTI->getIntImmCost(NewIdx, GEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency); if (ImmCost > TargetTransformInfo::TCC_Basic) return false; } @@ -7067,16 +7467,15 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { if (isa<Constant>(CI->getOperand(0))) return false; - if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL)) + if (OptimizeNoopCopyExpression(CI, *TLI, *DL)) return true; if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { /// Sink a zext or sext into its user blocks if the target type doesn't /// fit in one register - if (TLI && - TLI->getTypeAction(CI->getContext(), + if (TLI->getTypeAction(CI->getContext(), TLI->getValueType(*DL, CI->getType())) == - TargetLowering::TypeExpandInteger) { + TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { bool MadeChange = optimizeExt(I); @@ -7087,30 +7486,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { } if (auto *Cmp = dyn_cast<CmpInst>(I)) - if (TLI && optimizeCmp(Cmp, ModifiedDT)) + if (optimizeCmp(Cmp, ModifiedDT)) return true; if (LoadInst *LI = dyn_cast<LoadInst>(I)) { LI->setMetadata(LLVMContext::MD_invariant_group, nullptr); - if (TLI) { - bool Modified = optimizeLoadExt(LI); - unsigned AS = LI->getPointerAddressSpace(); - Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); - return Modified; - } - return false; + bool Modified = optimizeLoadExt(LI); + unsigned AS = LI->getPointerAddressSpace(); + Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); + return Modified; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (TLI && splitMergedValStore(*SI, *DL, *TLI)) + if (splitMergedValStore(*SI, *DL, *TLI)) return true; SI->setMetadata(LLVMContext::MD_invariant_group, nullptr); - if (TLI) { - unsigned AS = SI->getPointerAddressSpace(); - return optimizeMemoryInst(I, SI->getOperand(1), - SI->getOperand(0)->getType(), AS); - } - return false; + unsigned AS = SI->getPointerAddressSpace(); + return optimizeMemoryInst(I, SI->getOperand(1), + SI->getOperand(0)->getType(), AS); } if (AtomicRMWInst *RMW = 
dyn_cast<AtomicRMWInst>(I)) { @@ -7127,15 +7520,14 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); - if (BinOp && (BinOp->getOpcode() == Instruction::And) && - EnableAndCmpSinking && TLI) + if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking) return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); // TODO: Move this into the switch on opcode - it handles shifts already. if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); - if (TLI && CI && TLI->hasExtractBitsInsn()) + if (CI && TLI->hasExtractBitsInsn()) if (OptimizeExtractBits(BinOp, CI, *TLI, *DL)) return true; } @@ -7158,6 +7550,35 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { return false; } + if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) { + // freeze(icmp a, const)) -> icmp (freeze a), const + // This helps generate efficient conditional jumps. + Instruction *CmpI = nullptr; + if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0))) + CmpI = II; + else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0))) + CmpI = F->getFastMathFlags().none() ? F : nullptr; + + if (CmpI && CmpI->hasOneUse()) { + auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1); + bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) || + isa<ConstantPointerNull>(Op0); + bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) || + isa<ConstantPointerNull>(Op1); + if (Const0 || Const1) { + if (!Const0 || !Const1) { + auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI); + F->takeName(FI); + CmpI->setOperand(Const0 ? 1 : 0, F); + } + FI->replaceAllUsesWith(CmpI); + FI->eraseFromParent(); + return true; + } + } + return false; + } + if (tryToSinkFreeOperands(I)) return true; @@ -7214,7 +7635,7 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { } bool MadeBitReverse = true; - while (TLI && MadeBitReverse) { + while (MadeBitReverse) { MadeBitReverse = false; for (auto &I : reverse(BB)) { if (makeBitReverse(I, *DL, *TLI)) { @@ -7326,7 +7747,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. /// bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { - if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive()) + if (!TM->Options.EnableFastISel || TLI->isJumpExpensive()) return false; bool MadeChange = false; @@ -7367,7 +7788,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump()); // Create a new BB. - auto TmpBB = + auto *TmpBB = BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split", BB.getParent(), BB.getNextNode()); |
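For illustration, the freeze(icmp) fold added to optimizeInst above rewrites IR along the following lines when the compare has a single use and one constant operand; this is only a minimal sketch, and the value names and labels are made up for the example:

    ; before: the freeze sits on the compare result
    %cmp = icmp eq i32 %x, 42
    %cmp.fr = freeze i1 %cmp
    br i1 %cmp.fr, label %then, label %else

    ; after: the freeze is pushed onto the non-constant operand, so the
    ; branch consumes the icmp directly and can be selected as a single
    ; compare-and-jump sequence
    %x.fr = freeze i32 %x
    %cmp = icmp eq i32 %x.fr, 42
    br i1 %cmp, label %then, label %else

In both forms the branch condition is a well-defined (non-poison) value even if %x is poison or undef, which is what allows the pass to move the freeze onto the operand; doing so is what the code comment means by "This helps generate efficient conditional jumps."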