Diffstat (limited to 'lib/Transforms/Vectorize')
18 files changed, 4618 insertions, 2916 deletions
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 7622ed6d194f..27a4d241b320 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,9 +1,13 @@ add_llvm_library(LLVMVectorize LoadStoreVectorizer.cpp + LoopVectorizationLegality.cpp LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp VPlan.cpp + VPlanHCFGBuilder.cpp + VPlanHCFGTransforms.cpp + VPlanVerifier.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index dc83b6d4d292..5f3d127202ad 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -6,6 +6,38 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// This pass merges loads/stores to/from sequential memory addresses into vector +// loads/stores. Although there's nothing GPU-specific in here, this pass is +// motivated by the microarchitectural quirks of nVidia and AMD GPUs. +// +// (For simplicity below we talk about loads only, but everything also applies +// to stores.) +// +// This pass is intended to be run late in the pipeline, after other +// vectorization opportunities have been exploited. So the assumption here is +// that immediately following our new vector load we'll need to extract out the +// individual elements of the load, so we can operate on them individually. +// +// On CPUs this transformation is usually not beneficial, because extracting the +// elements of a vector register is expensive on most architectures. It's +// usually better just to load each element individually into its own scalar +// register. +// +// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a +// "vector load" loads directly into a series of scalar registers. In effect, +// extracting the elements of the vector is free. It's therefore always +// beneficial to vectorize a sequence of loads on these architectures. +// +// Vectorizing (perhaps a better name might be "coalescing") loads can have +// large performance impacts on GPU kernels, and opportunities for vectorizing +// are common in GPU code. This pass tries very hard to find such +// opportunities; its runtime is quadratic in the number of loads in a BB. +// +// Some CPU architectures, such as ARM, have instructions that load into +// multiple scalar registers, similar to a GPU vectorized load. In theory ARM +// could use this pass (with some modifications), but currently it implements +// its own pass to do something similar to what we do here. 
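To make that header comment concrete before diving into the code, here is a hedged sketch of how a GPU target might schedule this pass; the helper name addLateLoadStoreVectorization and the IR shapes in the comment are illustrative assumptions, while createLoadStoreVectorizerPass() is the pass's existing entry point declared in llvm/Transforms/Vectorize.h.

    // Sketch only: wiring the pass late into a legacy pass-manager pipeline,
    // after other vectorization opportunities have been exploited.
    //
    // Before:   %a = load i32, i32* %p
    //           %b = load i32, i32* %p1        ; %p1 is %p + 1 element
    // After:    %v = load <2 x i32>, <2 x i32>* %vp
    //           %a = extractelement <2 x i32> %v, i32 0
    //           %b = extractelement <2 x i32> %v, i32 1
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Vectorize.h"

    void addLateLoadStoreVectorization(llvm::legacy::PassManagerBase &PM) {
      PM.add(llvm::createLoadStoreVectorizerPass());
    }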
#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -21,6 +53,7 @@ #include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -45,7 +78,6 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> @@ -65,8 +97,16 @@ static const unsigned StackAdjustedAlignment = 4; namespace { +/// ChainID is an arbitrary token that is allowed to be different only for the +/// accesses that are guaranteed to be considered non-consecutive by +/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions +/// together and reducing the number of instructions the main search operates on +/// at a time, i.e. this is to reduce compile time and nothing else as the main +/// search has O(n^2) time complexity. The underlying type of ChainID should not +/// be relied upon. +using ChainID = const Value *; using InstrList = SmallVector<Instruction *, 8>; -using InstrListMap = MapVector<Value *, InstrList>; +using InstrListMap = MapVector<ChainID, InstrList>; class Vectorizer { Function &F; @@ -86,10 +126,6 @@ public: bool run(); private: - Value *getPointerOperand(Value *I) const; - - GetElementPtrInst *getSourceGEP(Value *Src) const; - unsigned getPointerAddressSpace(Value *I); unsigned getAlignment(LoadInst *LI) const { @@ -108,7 +144,15 @@ private: return DL.getABITypeAlignment(SI->getValueOperand()->getType()); } + static const unsigned MaxDepth = 3; + bool isConsecutiveAccess(Value *A, Value *B); + bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + unsigned Depth = 0) const; + bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, + unsigned Depth) const; + bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + unsigned Depth) const; /// After vectorization, reorder the instructions that I depends on /// (the instructions defining its operands), to ensure they dominate I. @@ -239,14 +283,6 @@ bool Vectorizer::run() { return Changed; } -Value *Vectorizer::getPointerOperand(Value *I) const { - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->getPointerOperand(); - return nullptr; -} - unsigned Vectorizer::getPointerAddressSpace(Value *I) { if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); @@ -255,23 +291,10 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) { return -1; } -GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const { - // First strip pointer bitcasts. Make sure pointee size is the same with - // and without casts. - // TODO: a stride set by the add instruction below can match the difference - // in pointee type size here. Currently it will not be vectorized. 
- Value *SrcPtr = getPointerOperand(Src); - Value *SrcBase = SrcPtr->stripPointerCasts(); - if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) == - DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType())) - SrcPtr = SrcBase; - return dyn_cast<GetElementPtrInst>(SrcPtr); -} - // FIXME: Merge with llvm::isConsecutiveAccess bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); + Value *PtrA = getLoadStorePointerOperand(A); + Value *PtrB = getLoadStorePointerOperand(B); unsigned ASA = getPointerAddressSpace(A); unsigned ASB = getPointerAddressSpace(B); @@ -280,18 +303,27 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { return false; // Make sure that A and B are different pointers of the same size type. - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *PtrATy = PtrA->getType()->getPointerElementType(); Type *PtrBTy = PtrB->getType()->getPointerElementType(); if (PtrA == PtrB || + PtrATy->isVectorTy() != PtrBTy->isVectorTy() || DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || DL.getTypeStoreSize(PtrATy->getScalarType()) != DL.getTypeStoreSize(PtrBTy->getScalarType())) return false; + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); - APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + return areConsecutivePointers(PtrA, PtrB, Size); +} + +bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, + const APInt &PtrDelta, + unsigned Depth) const { + unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); + APInt OffsetA(PtrBitWidth, 0); + APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); @@ -300,11 +332,11 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { // Check if they are based on the same pointer. That makes the offsets // sufficient. if (PtrA == PtrB) - return OffsetDelta == Size; + return OffsetDelta == PtrDelta; // Compute the necessary base pointer delta to have the necessary final delta - // equal to the size. - APInt BaseDelta = Size - OffsetDelta; + // equal to the pointer delta requested. + APInt BaseDelta = PtrDelta - OffsetDelta; // Compute the distance with SCEV between the base pointers. const SCEV *PtrSCEVA = SE.getSCEV(PtrA); @@ -314,71 +346,127 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { if (X == PtrSCEVB) return true; + // The above check will not catch the cases where one of the pointers is + // factorized but the other one is not, such as (C + (S * (A + B))) vs + // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions + // and getting the simplified difference. + const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); + if (C == Dist) + return true; + // Sometimes even this doesn't work, because SCEV can't always see through // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking // things the hard way. + return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); +} + +bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, + APInt PtrDelta, + unsigned Depth) const { + auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); + auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); + if (!GEPA || !GEPB) + return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); // Look through GEPs after checking they're the same except for the last // index.
- GetElementPtrInst *GEPA = getSourceGEP(A); - GetElementPtrInst *GEPB = getSourceGEP(B); - if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands()) + if (GEPA->getNumOperands() != GEPB->getNumOperands() || + GEPA->getPointerOperand() != GEPB->getPointerOperand()) return false; - unsigned FinalIndex = GEPA->getNumOperands() - 1; - for (unsigned i = 0; i < FinalIndex; i++) - if (GEPA->getOperand(i) != GEPB->getOperand(i)) + gep_type_iterator GTIA = gep_type_begin(GEPA); + gep_type_iterator GTIB = gep_type_begin(GEPB); + for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { + if (GTIA.getOperand() != GTIB.getOperand()) return false; + ++GTIA; + ++GTIB; + } - Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex)); - Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex)); + Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); + Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) return false; + if (PtrDelta.isNegative()) { + if (PtrDelta.isMinSignedValue()) + return false; + PtrDelta.negate(); + std::swap(OpA, OpB); + } + uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); + if (PtrDelta.urem(Stride) != 0) + return false; + unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); + APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); + // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) return false; bool Signed = isa<SExtInst>(OpA); - OpA = dyn_cast<Instruction>(OpA->getOperand(0)); + // At this point A could be a function parameter, i.e. not an instruction + Value *ValA = OpA->getOperand(0); OpB = dyn_cast<Instruction>(OpB->getOperand(0)); - if (!OpA || !OpB || OpA->getType() != OpB->getType()) + if (!OpB || ValA->getType() != OpB->getType()) return false; - // Now we need to prove that adding 1 to OpA won't overflow. + // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; - // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA, - // we're okay. + // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to + // ValA, we're okay. if (OpB->getOpcode() == Instruction::Add && isa<ConstantInt>(OpB->getOperand(1)) && - cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) { + IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { if (Signed) Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); else Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); } - unsigned BitWidth = OpA->getType()->getScalarSizeInBits(); + unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Second attempt: - // If any bits are known to be zero other than the sign bit in OpA, we can - // add 1 to it while guaranteeing no overflow of any sort. + // If all set bits of IdxDiff or any higher order bit other than the sign bit + // are known to be zero in ValA, we can add Diff to it while guaranteeing no + // overflow of any sort. 
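A hedged aside on the "second attempt" reasoning above: the standalone helper below is not part of the patch, and it states a slightly stricter sufficient condition than the KnownBits check that follows, but it shows why known-zero bits rule out overflow — when every set bit of IdxDiff lands on a bit known to be zero in ValA, the addition generates no carries at all.

    // Not from the patch: a carry-free-addition check over hypothetical
    // known-bits facts. Both APInts are assumed to have the same bit width.
    #include "llvm/ADT/APInt.h"

    // If every set bit of IdxDiff is a known-zero bit of ValA, then
    // ValA + IdxDiff == ValA | IdxDiff: no bit position carries, so the sum
    // cannot wrap in the unsigned sense. For the signed case the pass below
    // additionally excludes the sign bit (see the clearBit call).
    bool addIsCarryFree(const llvm::APInt &KnownZeroOfValA,
                        const llvm::APInt &IdxDiff) {
      return (~KnownZeroOfValA & IdxDiff) == 0;
    }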
if (!Safe) { + OpA = dyn_cast<Instruction>(ValA); + if (!OpA) + return false; KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.countMaxTrailingOnes() < (BitWidth - 1)) - Safe = true; + APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); + if (Signed) + BitsAllowedToBeSet.clearBit(BitWidth - 1); + if (BitsAllowedToBeSet.ult(IdxDiff)) + return false; } - if (!Safe) + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); + const SCEV *OffsetSCEVB = SE.getSCEV(OpB); + const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); + const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); + return X == OffsetSCEVB; +} + +bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, + const APInt &PtrDelta, + unsigned Depth) const { + if (Depth++ == MaxDepth) return false; - const SCEV *OffsetSCEVA = SE.getSCEV(OpA); - const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *One = SE.getConstant(APInt(BitWidth, 1)); - const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One); - return X2 == OffsetSCEVB; + if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { + if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { + return SelectA->getCondition() == SelectB->getCondition() && + areConsecutivePointers(SelectA->getTrueValue(), + SelectB->getTrueValue(), PtrDelta, Depth) && + areConsecutivePointers(SelectA->getFalseValue(), + SelectB->getFalseValue(), PtrDelta, Depth); + } + } + return false; } void Vectorizer::reorder(Instruction *I) { @@ -448,7 +536,7 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { SmallVector<Instruction *, 16> Instrs; for (Instruction *I : Chain) { - Value *PtrOperand = getPointerOperand(I); + Value *PtrOperand = getLoadStorePointerOperand(I); assert(PtrOperand && "Instruction must have a pointer operand."); Instrs.push_back(I); if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) @@ -484,7 +572,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { SmallVector<Instruction *, 16> ChainInstrs; bool IsLoadChain = isa<LoadInst>(Chain[0]); - DEBUG({ + LLVM_DEBUG({ for (Instruction *I : Chain) { if (IsLoadChain) assert(isa<LoadInst>(I) && @@ -506,11 +594,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { Intrinsic::sideeffect) { // Ignore llvm.sideeffect calls. } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { - DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); + LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I + << '\n'); break; } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { - DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I - << '\n'); + LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I + << '\n'); break; } } @@ -536,32 +625,40 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) break; - if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr)) + auto *MemLoad = dyn_cast<LoadInst>(MemInstr); + auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); + if (MemLoad && ChainLoad) continue; + // We can ignore the alias if we have a load/store pair and the load + // is known to be invariant. The load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool { + return LI->getMetadata(LLVMContext::MD_invariant_load); + }; + // We can ignore the alias as long as the load comes before the store, // because that means we won't be moving the load past the store to // vectorize it (the vectorized load is inserted at the location of the // first load in the chain). - if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) && - OBB.dominates(ChainInstr, MemInstr)) + if (isa<StoreInst>(MemInstr) && ChainLoad && + (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) continue; // Same case, but in reverse. - if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) && - OBB.dominates(MemInstr, ChainInstr)) + if (MemLoad && isa<StoreInst>(ChainInstr) && + (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) continue; if (!AA.isNoAlias(MemoryLocation::get(MemInstr), MemoryLocation::get(ChainInstr))) { - DEBUG({ + LLVM_DEBUG({ dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' - << " " << *getPointerOperand(MemInstr) << '\n' + << " " << *getLoadStorePointerOperand(MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' - << " " << *getPointerOperand(ChainInstr) << '\n'; + << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; }); // Save this aliasing memory instruction as a barrier, but allow other // instructions that precede the barrier to be vectorized with this one. @@ -594,6 +691,20 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { return Chain.slice(0, ChainIdx); } +static ChainID getChainID(const Value *Ptr, const DataLayout &DL) { + const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); + if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { + // The selects themselves are distinct instructions even if they share the + // same condition and evaluate to consecutive pointers for true and false + // values of the condition. Therefore using the selects themselves for + // grouping instructions would put consecutive accesses into different lists + // and they won't even be checked for being consecutive, and won't be + // vectorized. + return Sel->getCondition(); + } + return ObjPtr; +} + std::pair<InstrListMap, InstrListMap> Vectorizer::collectInstructions(BasicBlock *BB) { InstrListMap LoadRefs; @@ -632,8 +743,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) { unsigned AS = Ptr->getType()->getPointerAddressSpace(); unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); + unsigned VF = VecRegSize / TySize; + VectorType *VecTy = dyn_cast<VectorType>(Ty); + // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2) + if (TySize > VecRegSize / 2 || + (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; // Make sure all the users of a vector are constant-index extracts. @@ -644,8 +759,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save the load locations.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL); - LoadRefs[ObjPtr].push_back(LI); + const ChainID ID = getChainID(Ptr, DL); + LoadRefs[ID].push_back(LI); } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isSimple()) continue; @@ -675,8 +790,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) { unsigned AS = Ptr->getType()->getPointerAddressSpace(); unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); + unsigned VF = VecRegSize / TySize; + VectorType *VecTy = dyn_cast<VectorType>(Ty); + // No point in looking at these if they're too big to vectorize. - if (TySize > VecRegSize / 2) + if (TySize > VecRegSize / 2 || + (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { @@ -686,8 +805,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save store location. - Value *ObjPtr = GetUnderlyingObject(Ptr, DL); - StoreRefs[ObjPtr].push_back(SI); + const ChainID ID = getChainID(Ptr, DL); + StoreRefs[ID].push_back(SI); } } @@ -697,12 +816,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) { bool Vectorizer::vectorizeChains(InstrListMap &Map) { bool Changed = false; - for (const std::pair<Value *, InstrList> &Chain : Map) { + for (const std::pair<ChainID, InstrList> &Chain : Map) { unsigned Size = Chain.second.size(); if (Size < 2) continue; - DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); + LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); // Process the stores in chunks of 64. for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { @@ -716,7 +835,8 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) { } bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { - DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"); + LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() + << " instructions.\n"); SmallVector<int, 16> Heads, Tails; int ConsecutiveChain[64]; @@ -852,14 +972,14 @@ bool Vectorizer::vectorizeStoreChain( // vector factor, break it into two pieces. unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); + LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." + " Creating two separate arrays.\n"); return vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed) | vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); } - DEBUG({ + LLVM_DEBUG({ dbgs() << "LSV: Stores to vectorize:\n"; for (Instruction *I : Chain) dbgs() << " " << *I << "\n"; @@ -1000,8 +1120,8 @@ bool Vectorizer::vectorizeLoadChain( // vector factor, break it into two pieces. unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { - DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." - " Creating two separate arrays.\n"); + LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." 
+ " Creating two separate arrays.\n"); return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); } @@ -1024,7 +1144,7 @@ bool Vectorizer::vectorizeLoadChain( Alignment = NewAlign; } - DEBUG({ + LLVM_DEBUG({ dbgs() << "LSV: Loads to vectorize:\n"; for (Instruction *I : Chain) I->dump(); @@ -1107,7 +1227,7 @@ bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), SzInBytes * 8, AddressSpace, Alignment, &Fast); - DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows - << " and fast? " << Fast << "\n";); + LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows + << " and fast? " << Fast << "\n";); return !Allows || !Fast; } diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp new file mode 100644 index 000000000000..697bc1b448d7 --- /dev/null +++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -0,0 +1,1072 @@ +//===- LoopVectorizationLegality.cpp --------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides loop vectorization legality analysis. Original code +// resided in LoopVectorize.cpp for a long time. +// +// At this point, it is implemented as a utility class, not as an analysis +// pass. It should be easy to create an analysis pass around it if there +// is a need (but D45420 needs to happen first). +// +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/IntrinsicInst.h" + +using namespace llvm; + +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +static cl::opt<bool> + EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, + cl::desc("Enable if-conversion during vectorization.")); + +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + +static cl::opt<unsigned> VectorizeSCEVCheckThreshold( + "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( + "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed with a " + "vectorize(enable) pragma")); + +/// Maximum vectorization interleave count. +static const unsigned MaxInterleaveFactor = 16; + +namespace llvm { + +OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, + StringRef RemarkName, + Loop *TheLoop, + Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. 
+ if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +bool LoopVectorizeHints::Hint::validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + case HK_ISVECTORIZED: + return (Val == 0 || Val == 1); + } + return false; +} + +LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + OptimizationRemarkEmitter &ORE) + : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; + + if (IsVectorized.Value != 1) + // If the vectorization width and interleaving count are both 1 then + // consider the loop to have been already vectorized because there's + // nothing more that we can do. + IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; + LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); +} + +bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L, + bool AlwaysVectorize) const { + if (getForce() == LoopVectorizeHints::FK_Disabled) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + emitRemarkWithHints(); + return false; + } + + if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + emitRemarkWithHints(); + return false; + } + + if (getIsVectorized() == 1) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); + // FIXME: Add interleave.disable metadata. This will allow + // vectorize.disable to be used without disabling the pass and errors + // to differentiate between disabled vectorization and a width of 1. 
+ ORE.emit([&]() { + return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), + "AllDisabled", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: vectorization and interleaving are " + "explicitly disabled, or the loop has already been " + "vectorized"; + }); + return false; + } + + return true; +} + +void LoopVectorizeHints::emitRemarkWithHints() const { + using namespace ore; + + ORE.emit([&]() { + if (Force.Value == LoopVectorizeHints::FK_Disabled) + return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "loop not vectorized: vectorization is explicitly disabled"; + else { + OptimizationRemarkMissed R(LV_NAME, "MissedDetails", + TheLoop->getStartLoc(), TheLoop->getHeader()); + R << "loop not vectorized"; + if (Force.Value == LoopVectorizeHints::FK_Enabled) { + R << " (Force=" << NV("Force", true); + if (Width.Value != 0) + R << ", Vector Width=" << NV("VectorWidth", Width.Value); + if (Interleave.Value != 0) + R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); + R << ")"; + } + return R; + } + }); +} + +const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { + if (getWidth() == 1) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Disabled) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + return LV_NAME; + return OptimizationRemarkAnalysis::AlwaysPrint; +} + +void LoopVectorizeHints::getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector<Metadata *, 4> Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. 
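For reference while reading this parser, loop hints typically take the following shape in the IR. This is an illustrative sketch with the metadata numbering invented, but the hint names match Prefix() plus the Hint names handled below.

    // Sketch of the loop metadata being parsed (IR shown in comments):
    //
    //   br i1 %exitcond, label %exit, label %body, !llvm.loop !0
    //
    //   !0 = distinct !{!0, !1, !2}                  ; operand 0 is the loop ID itself
    //   !1 = !{!"llvm.loop.vectorize.width", i32 4}  ; MDNode: MDString + one argument
    //   !2 = !{!"llvm.loop.interleave.count", i32 2}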
+ StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } +} + +void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } + } +} + +MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name, + unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node, + ArrayRef<Hint> HintTypes) { + MDString *Name = dyn_cast<MDString>(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; +} + +void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) { + if (HintTypes.empty()) + return; + + // Reserve the first element to LoopID (see below). + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); +} + +bool LoopVectorizationRequirements::doesNotMeet( + Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *PassName = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + ORE.emit([&]() { + return OptimizationRemarkAnalysisFPCommute( + PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(), + UnsafeAlgebraInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"; + }); + Failed = true; + } + + // Test if runtime memcheck thresholds are exceeded. 
+ bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE.emit([&]() { + return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; + } + + return Failed; +} + +// Return true if the inner loop \p Lp is uniform with regard to the outer loop +// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes +// executing the inner loop will execute the same iterations). This check is +// very constrained for now but it will be relaxed in the future. \p Lp is +// considered uniform if it meets all the following conditions: +// 1) it has a canonical IV (starting from 0 and with stride 1), +// 2) its latch terminator is a conditional branch and, +// 3) its latch condition is a compare instruction whose operands are the +// canonical IV and an OuterLp invariant. +// This check doesn't take into account the uniformity of other conditions not +// related to the loop latch because they don't affect the loop uniformity. +// +// NOTE: We decided to keep all these checks and its associated documentation +// together so that we can easily have a picture of the current supported loop +// nests. However, some of the current checks don't depend on \p OuterLp and +// would be redundantly executed for each \p Lp if we invoked this function for +// different candidate outer loops. This is not the case for now because we +// don't currently have the infrastructure to evaluate multiple candidate outer +// loops and \p OuterLp will be a fixed parameter while we only support explicit +// outer loop vectorization. It's also very likely that these checks go away +// before introducing the aforementioned infrastructure. However, if this is not +// the case, we should move the \p OuterLp independent checks to a separate +// function that is only executed once for each \p Lp. +static bool isUniformLoop(Loop *Lp, Loop *OuterLp) { + assert(Lp->getLoopLatch() && "Expected loop with a single latch."); + + // If Lp is the outer loop, it's uniform by definition. + if (Lp == OuterLp) + return true; + assert(OuterLp->contains(Lp) && "OuterLp must contain Lp."); + + // 1. + PHINode *IV = Lp->getCanonicalInductionVariable(); + if (!IV) { + LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n"); + return false; + } + + // 2. + BasicBlock *Latch = Lp->getLoopLatch(); + auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!LatchBr || LatchBr->isUnconditional()) { + LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n"); + return false; + } + + // 3. 
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition()); + if (!LatchCmp) { + LLVM_DEBUG( + dbgs() << "LV: Loop latch condition is not a compare instruction.\n"); + return false; + } + + Value *CondOp0 = LatchCmp->getOperand(0); + Value *CondOp1 = LatchCmp->getOperand(1); + Value *IVUpdate = IV->getIncomingValueForBlock(Latch); + if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) && + !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) { + LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n"); + return false; + } + + return true; +} + +// Return true if \p Lp and all its nested loops are uniform with regard to \p +// OuterLp. +static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { + if (!isUniformLoop(Lp, OuterLp)) + return false; + + // Check if nested loops are uniform. + for (Loop *SubLp : *Lp) + if (!isUniformLoopNest(SubLp, OuterLp)) + return false; + + return true; +} + +/// Check whether it is safe to if-convert this phi node. +/// +/// Phi nodes with constant expressions that can trap are not safe to if +/// convert. +static bool canIfConvertPHINodes(BasicBlock *BB) { + for (PHINode &Phi : BB->phis()) { + for (Value *V : Phi.incoming_values()) + if (auto *C = dyn_cast<Constant>(V)) + if (C->canTrap()) + return false; + } + return true; +} + +static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} + +static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +/// Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSetImpl<Value *> &AllowedExit) { + // Reduction and Induction instructions are allowed to have exit users. All + // other instructions must not have external users. + if (!AllowedExit.count(Inst)) + // Check that all of the users of the loop are inside the BB. + for (User *U : Inst->users()) { + Instruction *UI = cast<Instruction>(U); + // This user may be a reduction exit value. + if (!TheLoop->contains(UI)) { + LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); + return true; + } + } + return false; +} + +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + const ValueToValueMap &Strides = + getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); + + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + if (Stride == 1 || Stride == -1) + return Stride; + return 0; +} + +bool LoopVectorizationLegality::isUniform(Value *V) { + return LAI->isUniform(V); +} + +bool LoopVectorizationLegality::canVectorizeOuterLoop() { + assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + + for (BasicBlock *BB : TheLoop->blocks()) { + // Check whether the BB terminator is a BranchInst. 
Any other terminator is + // not supported yet. + auto *Br = dyn_cast<BranchInst>(BB->getTerminator()); + if (!Br) { + LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Check whether the BranchInst is a supported one. Only unconditional + // branches, conditional branches with an outer loop invariant condition or + // backedges are supported. + if (Br && Br->isConditional() && + !TheLoop->isLoopInvariant(Br->getCondition()) && + !LI->isLoopHeader(Br->getSuccessor(0)) && + !LI->isLoopHeader(Br->getSuccessor(1))) { + LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + } + + // Check whether inner loops are uniform. At this point, we only support + // simple outer loop scenarios with uniform nested loops. + if (!isUniformLoopNest(TheLoop /*loop nest*/, + TheLoop /*context outer loop*/)) { + LLVM_DEBUG( + dbgs() + << "LV: Not vectorizing: Outer loop contains divergent loops.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +void LoopVectorizationLegality::addInductionPhi( + PHINode *Phi, const InductionDescriptor &ID, + SmallPtrSetImpl<Value *> &AllowedExit) { + Inductions[Phi] = ID; + + // In case this induction also comes with casts that we know we can ignore + // in the vectorized loop body, record them here. All casts could be recorded + // here for ignoring, but it suffices to record only the first (as it is the + // only one that may be used outside the cast sequence). + const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); + if (!Casts.empty()) + InductionCastsToIgnore.insert(*Casts.begin()); + + Type *PhiTy = Phi->getType(); + const DataLayout &DL = Phi->getModule()->getDataLayout(); + + // Get the widest type. + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } + + // Int inductions are special because we only allow one IV. + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() && + isa<Constant>(ID.getStartValue()) && + cast<Constant>(ID.getStartValue())->isNullValue()) { + + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. + if (!PrimaryInduction || PhiTy == WidestIndTy) + PrimaryInduction = Phi; + } + + // Both the PHI node itself, and the "post-increment" value feeding + // back into the PHI node may have external users. + // We can allow those uses, except if the SCEVs we have for them rely + // on predicates that only hold within the loop, since allowing the exit + // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) { + AllowedExit.insert(Phi); + AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); + } + + LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n"); +} + +bool LoopVectorizationLegality::canVectorizeInstrs() { + BasicBlock *Header = TheLoop->getHeader(); + + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; + + // For each block in the loop. + for (BasicBlock *BB : TheLoop->blocks()) { + // Scan the instructions in the block and look for hazards. + for (Instruction &I : *BB) { + if (auto *Phi = dyn_cast<PHINode>(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "loop control flow is not understood by vectorizer"); + LLVM_DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n"); + return false; + } + + // If this PHINode is not in the header block, then we know that we + // can convert it to a select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. + if (BB != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) + continue; + ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) + << "value could not be identified as " + "an induction or reduction variable"); + return false; + } + + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "control flow not understood by vectorizer"); + LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } + + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, + DT)) { + if (RedDes.hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; + continue; + } + + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { + addInductionPhi(Phi, ID, AllowedExit); + if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); + continue; + } + + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, + SinkAfter, DT)) { + FirstOrderRecurrences.insert(Phi); + continue; + } + + // As a last resort, coerce the PHI to an AddRec expression + // and re-try classifying it as an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { + addInductionPhi(Phi, ID, AllowedExit); + continue; + } + + ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) + << "value that could not be identified as " + "reduction is used outside the loop"); + LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); + return false; + } // end of PHI handling + + // We handle calls that: + // * Are debug info intrinsics. + // * Have a mapping to an IR intrinsic. + // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I); + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && + !isa<DbgInfoIntrinsic>(CI) && + !(CI->getCalledFunction() && TLI && + TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { + ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) + << "call instruction cannot be vectorized"); + LLVM_DEBUG( + dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); + return false; + } + + // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the + // second argument is the same (i.e. loop invariant) + if (CI && hasVectorInstrinsicScalarOpd( + getVectorIntrinsicIDForCall(CI, TLI), 1)) { + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) + << "intrinsic instruction cannot be vectorized"); + LLVM_DEBUG(dbgs() + << "LV: Found unvectorizable intrinsic " << *CI << "\n"); + return false; + } + } + + // Check that the instruction return type is vectorizable. + // Also, we can't vectorize extractelement instructions. + if ((!VectorType::isValidElementType(I.getType()) && + !I.getType()->isVoidTy()) || + isa<ExtractElementInst>(I)) { + ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) + << "instruction return type cannot be vectorized"); + LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); + return false; + } + + // Check that the stored type is vectorizable. + if (auto *ST = dyn_cast<StoreInst>(&I)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) { + ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) + << "store instruction cannot be vectorized"); + return false; + } + + // FP instructions can allow unsafe algebra, thus vectorizable by + // non-IEEE-754 compliant SIMD units. + // This applies to floating-point math operations and calls, not memory + // operations, shuffles, or casts, as they don't change precision or + // semantics. + } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && + !I.isFast()) { + LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); + Hints->setPotentiallyUnsafe(); + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { + ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) + << "value cannot be used outside the loop"); + return false; + } + } // next instr. + } + + if (!PrimaryInduction) { + LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + if (Inductions.empty()) { + ORE->emit(createMissedAnalysis("NoInductionVariable") + << "loop induction variable could not be identified"); + return false; + } + } + + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. 
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; + + return true; +} + +bool LoopVectorizationLegality::canVectorizeMemory() { + LAI = &(*GetLAA)(*TheLoop); + const OptimizationRemarkAnalysis *LAR = LAI->getReport(); + if (LAR) { + ORE->emit([&]() { + return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), + "loop not vectorized: ", *LAR); + }); + } + if (!LAI->canVectorizeMemory()) + return false; + + if (LAI->hasStoreToLoopInvariantAddress()) { + ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") + << "write to a loop invariant address could not be vectorized"); + LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->getPSE().getUnionPredicate()); + + return true; +} + +bool LoopVectorizationLegality::isInductionPhi(const Value *V) { + Value *In0 = const_cast<Value *>(V); + PHINode *PN = dyn_cast_or_null<PHINode>(In0); + if (!PN) + return false; + + return Inductions.count(PN); +} + +bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { + auto *Inst = dyn_cast<Instruction>(V); + return (Inst && InductionCastsToIgnore.count(Inst)); +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + return isInductionPhi(V) || isCastedInductionVariable(V); +} + +bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { + return FirstOrderRecurrences.count(Phi); +} + +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); +} + +bool LoopVectorizationLegality::blockCanBePredicated( + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + + for (Instruction &I : *BB) { + // Check that we don't have a constant expression that can trap as operand. + for (Value *Operand : I.operands()) { + if (auto *C = dyn_cast<Constant>(Operand)) + if (C->canTrap()) + return false; + } + // We might be able to hoist the load. + if (I.mayReadFromMemory()) { + auto *LI = dyn_cast<LoadInst>(&I); + if (!LI) + return false; + if (!SafePtrs.count(LI->getPointerOperand())) { + // !llvm.mem.parallel_loop_access implies if-conversion safety. + // Otherwise, record that the load needs (real or emulated) masking + // and let the cost model decide. + if (!IsAnnotatedParallel) + MaskedOp.insert(LI); + continue; + } + } + + if (I.mayWriteToMemory()) { + auto *SI = dyn_cast<StoreInst>(&I); + if (!SI) + return false; + // Predicated store requires some form of masking: + // 1) masked store HW instruction, + // 2) emulation via load-blend-store (only if safe and legal to do so, + // be aware on the race conditions), or + // 3) element-by-element predicate check and scalar store. + MaskedOp.insert(SI); + continue; + } + if (I.mayThrow()) + return false; + } + + return true; +} + +bool LoopVectorizationLegality::canVectorizeWithIfConvert() { + if (!EnableIfConversion) { + ORE->emit(createMissedAnalysis("IfConversionDisabled") + << "if-conversion is disabled"); + return false; + } + + assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); + + // A list of pointers that we can safely read and write to. + SmallPtrSet<Value *, 8> SafePointes; + + // Collect safe addresses. 
+ for (BasicBlock *BB : TheLoop->blocks()) { + if (blockNeedsPredication(BB)) + continue; + + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); + } + + // Collect the blocks that need predication. + BasicBlock *Header = TheLoop->getHeader(); + for (BasicBlock *BB : TheLoop->blocks()) { + // We don't support switch statements inside loops. + if (!isa<BranchInst>(BB->getTerminator())) { + ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) + << "loop contains a switch statement"); + return false; + } + + // We must be able to predicate all blocks that need to be predicated. + if (blockNeedsPredication(BB)) { + if (!blockCanBePredicated(BB, SafePointes)) { + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); + return false; + } + } else if (BB != Header && !canIfConvertPHINodes(BB)) { + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); + return false; + } + } + + // We can if-convert this loop. + return true; +} + +// Helper function for canVectorizeLoopNestCFG. +bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, + bool UseVPlanNativePath) { + assert((UseVPlanNativePath || Lp->empty()) && + "VPlan-native path is not enabled."); + + // TODO: ORE should be improved to show more accurate information when an + // outer loop can't be vectorized because a nested loop is not understood or + // legal. Something like: "outer_loop_location: loop not vectorized: + // (inner_loop_location) loop control flow is not understood by vectorizer". + + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!Lp->getLoopPreheader()) { + LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We must have a single backedge. + if (Lp->getNumBackEdges() != 1) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We must have a single exiting block. + if (!Lp->getExitingBlock()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We only handle bottom-tested loops, i.e. loops in which the condition is + // checked at the end of each iteration. With that we can assume that all + // instructions in the loop are executed the same number of times.
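As a small illustration (not from the patch), a bottom-tested loop in C++ source form — the exit test lives in the latch, so every instruction in the body runs the same number of times:

    // Illustration only: bottom-tested loop; latch block == exiting block.
    void bottomTested(int *A, int N) {
      int I = 0;
      do {
        A[I] = 0; // the body executes exactly once per iteration
        ++I;
      } while (I < N); // the condition is checked at the end of each iteration
    }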
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +bool LoopVectorizationLegality::canVectorizeLoopNestCFG( + Loop *Lp, bool UseVPlanNativePath) { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Recursively check whether the loop control flow of nested loops is + // understood. + for (Loop *SubLp : *Lp) + if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + // Check whether the loop-related control flow in the loop nest is expected by + // vectorizer. + if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We need to have a loop header. + LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() + << '\n'); + + // Specific checks for outer loops. We skip the remaining legal checks at this + // point because they don't support outer loops. + if (!TheLoop->empty()) { + assert(UseVPlanNativePath && "VPlan-native path is not enabled."); + + if (!canVectorizeOuterLoop()) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n"); + // TODO: Implement DoExtraAnalysis when subsequent legal checks support + // outer loops. + return false; + } + + LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n"); + return Result; + } + + assert(TheLoop->empty() && "Inner loop expected."); + // Check if we can if-convert non-single-bb loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); + if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { + LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Check if we can vectorize the instructions and CFG in this loop. + if (!canVectorizeInstrs()) { + LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeMemory()) { + LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? 
+                            ? " (with a runtime bound check)"
+                            : "")
+                    << "!\n");
+
+  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+    ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+              << "Too many SCEV assumptions need to be made and checked "
+              << "at runtime");
+    LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+    if (DoExtraAnalysis)
+      Result = false;
+    else
+      return false;
+  }
+
+  // Okay! We've done all the tests. If any have failed, return false. Otherwise
+  // we can vectorize, and at this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return true with
+  // no restrictions.
+  return Result;
+}
+
+} // namespace llvm
diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
new file mode 100644
index 000000000000..2aa219064299
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -0,0 +1,282 @@
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides the LoopVectorizationPlanner class, which drives the
+/// vectorization process after the loop has passed the Legality checks.
+/// The planner builds and optimizes the Vectorization Plans which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+private:
+  VPBasicBlock *BB = nullptr;
+  VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+  VPInstruction *createInstruction(unsigned Opcode,
+                                   ArrayRef<VPValue *> Operands) {
+    VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+    if (BB)
+      BB->insert(Instr, InsertPt);
+    return Instr;
+  }
+
+  VPInstruction *createInstruction(unsigned Opcode,
+                                   std::initializer_list<VPValue *> Operands) {
+    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+  }
+
+public:
+  VPBuilder() {}
+
+  /// Clear the insertion point: created instructions will not be inserted into
+  /// a block.
+  void clearInsertionPoint() {
+    BB = nullptr;
+    InsertPt = VPBasicBlock::iterator();
+  }
+
+  VPBasicBlock *getInsertBlock() const { return BB; }
+  VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+  /// InsertPoint - A saved insertion point.
+ class VPInsertPoint { + VPBasicBlock *Block = nullptr; + VPBasicBlock::iterator Point; + + public: + /// Creates a new insertion point which doesn't point to anything. + VPInsertPoint() = default; + + /// Creates a new insertion point at the given location. + VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint) + : Block(InsertBlock), Point(InsertPoint) {} + + /// Returns true if this insert point is set. + bool isSet() const { return Block != nullptr; } + + VPBasicBlock *getBlock() const { return Block; } + VPBasicBlock::iterator getPoint() const { return Point; } + }; + + /// Sets the current insert point to a previously-saved location. + void restoreIP(VPInsertPoint IP) { + if (IP.isSet()) + setInsertPoint(IP.getBlock(), IP.getPoint()); + else + clearInsertionPoint(); + } + + /// This specifies that created VPInstructions should be appended to the end + /// of the specified block. + void setInsertPoint(VPBasicBlock *TheBB) { + assert(TheBB && "Attempting to set a null insert point"); + BB = TheBB; + InsertPt = BB->end(); + } + + /// This specifies that created instructions should be inserted at the + /// specified point. + void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) { + BB = TheBB; + InsertPt = IP; + } + + /// Insert and return the specified instruction. + VPInstruction *insert(VPInstruction *I) const { + BB->insert(I, InsertPt); + return I; + } + + /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as + /// its underlying Instruction. + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + Instruction *Inst = nullptr) { + VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + NewVPInst->setUnderlyingValue(Inst); + return NewVPInst; + } + VPValue *createNaryOp(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + Instruction *Inst = nullptr) { + return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + } + + VPValue *createNot(VPValue *Operand) { + return createInstruction(VPInstruction::Not, {Operand}); + } + + VPValue *createAnd(VPValue *LHS, VPValue *RHS) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + } + + VPValue *createOr(VPValue *LHS, VPValue *RHS) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + } + + //===--------------------------------------------------------------------===// + // RAII helpers. + //===--------------------------------------------------------------------===// + + /// RAII object that stores the current insertion point and restores it when + /// the object is destroyed. + class InsertPointGuard { + VPBuilder &Builder; + VPBasicBlock *Block; + VPBasicBlock::iterator Point; + + public: + InsertPointGuard(VPBuilder &B) + : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {} + + InsertPointGuard(const InsertPointGuard &) = delete; + InsertPointGuard &operator=(const InsertPointGuard &) = delete; + + ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); } + }; +}; + +/// TODO: The following VectorizationFactor was pulled out of +/// LoopVectorizationCostModel class. LV also deals with +/// VectorizerParams::VectorizationFactor and VectorizationCostTy. +/// We need to streamline them. + +/// Information about vectorization costs +struct VectorizationFactor { + // Vector width with best cost + unsigned Width; + // Cost of the loop with that width + unsigned Cost; +}; + +/// Planner drives the vectorization process after having passed +/// Legality checks. 
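Before the planner class itself, a quick hypothetical illustration of the VPBuilder API defined above; VPBB, OtherVPBB, Cmp, EdgeMask and BlockMask are invented names, not part of this patch:

  VPBuilder Builder;
  Builder.setInsertPoint(VPBB);                 // append to the end of VPBB
  VPValue *NotCond = Builder.createNot(Cmp);    // emits a VPInstruction::Not
  VPValue *Mask = Builder.createOr(NotCond, EdgeMask);
  {
    VPBuilder::InsertPointGuard Guard(Builder); // saves BB and InsertPt
    Builder.setInsertPoint(OtherVPBB);
    Builder.createAnd(Mask, BlockMask);         // inserted into OtherVPBB
  }                                             // insertion point restored here

Like its IRBuilder namesake, InsertPointGuard makes it safe to generate code in another block and then fall back to the original insertion point.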
+class LoopVectorizationPlanner {
+  /// The loop that we evaluate.
+  Loop *OrigLoop;
+
+  /// Loop Info analysis.
+  LoopInfo *LI;
+
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
+
+  /// The legality analysis.
+  LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+  LoopVectorizationCostModel &CM;
+
+  using VPlanPtr = std::unique_ptr<VPlan>;
+
+  SmallVector<VPlanPtr, 4> VPlans;
+
+  /// This class is used to enable the VPlan to invoke a method of ILV. This is
+  /// needed until the method is refactored out of ILV and becomes reusable.
+  struct VPCallbackILV : public VPCallback {
+    InnerLoopVectorizer &ILV;
+
+    VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+    Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+  };
+
+  /// A builder used to construct the current plan.
+  VPBuilder Builder;
+
+  unsigned BestVF = 0;
+  unsigned BestUF = 0;
+
+public:
+  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+                           const TargetTransformInfo *TTI,
+                           LoopVectorizationLegality *Legal,
+                           LoopVectorizationCostModel &CM)
+      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+
+  /// Plan how to best vectorize, return the best VF and its cost.
+  VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+
+  /// Use the VPlan-native path to plan how to best vectorize, return the best
+  /// VF and its cost.
+  VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
+  /// Finalize the best decision and dispose of all other VPlans.
+  void setBestPlan(unsigned VF, unsigned UF);
+
+  /// Generate the IR code for the body of the vectorized loop according to the
+  /// best selected VPlan.
+  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+  void printPlans(raw_ostream &O) {
+    for (const auto &Plan : VPlans)
+      O << *Plan;
+  }
+
+  /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+  /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+  /// returned value holds for the entire \p Range.
+  static bool
+  getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+                           VFRange &Range);
+
+protected:
+  /// Collect the instructions from the original loop that would be trivially
+  /// dead in the vectorized loop if generated.
+  void collectTriviallyDeadInstructions(
+      SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+  /// according to the information gathered by Legal when it checked if it is
+  /// legal to vectorize the loop.
+  void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+  /// Build a VPlan according to the information gathered by Legal. \return a
+  /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+  /// exclusive, possibly decreasing \p Range.End.
+  VPlanPtr buildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gathered by
+  /// Legal. This method is only used for the legacy inner loop vectorizer.
+  VPlanPtr
+  buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+                          SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+  /// according to the information gathered by Legal when it checked if it is
+  /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
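For orientation, this is roughly how LoopVectorize.cpp drives the planner; a simplified sketch, not verbatim from the patch (IC stands for the interleave count picked by the cost model, LVL for the legality analysis, and the InnerLoopVectorizer constructor arguments are elided):

  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
  VectorizationFactor VF = LVP.plan(OptForSize, UserVF); // build + cost VPlans
  LVP.setBestPlan(VF.Width, IC);       // discard all but the winning plan
  InnerLoopVectorizer LB(/* loop, analyses, VF.Width, IC, ... */);
  LVP.executePlan(LB, DT);             // emit IR for the chosen VPlan

The private buildVPlan* helpers, including buildVPlansWithVPRecipes declared next, are what plan() uses to populate the VPlans vector.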
+ void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 52f32cda2609..3c693f5d5ee0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -26,6 +26,14 @@ // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. // +// There is a development effort going on to migrate loop vectorizer to the +// VPlan infrastructure and to introduce outer loop vectorization support (see +// docs/Proposal/VectorizationPlan.rst and +// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this +// purpose, we temporarily introduced the VPlan-native vectorization path: an +// alternative vectorization path that is natively implemented on top of the +// VPlan infrastructure. See EnableVPlanNativePath for enabling. +// //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -47,8 +55,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" -#include "VPlan.h" -#include "VPlanBuilder.h" +#include "LoopVectorizationPlanner.h" +#include "VPRecipeBuilder.h" +#include "VPlanHCFGBuilder.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -57,11 +66,9 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -70,6 +77,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -124,6 +132,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -145,10 +154,6 @@ using namespace llvm; STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -static cl::opt<bool> - EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, - cl::desc("Enable if-conversion during vectorization.")); - /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. static cl::opt<unsigned> TinyTripCountVectorThreshold( @@ -184,9 +189,6 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs( "force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers.")); -/// Maximum vectorization interleave count. 
-static const unsigned MaxInterleaveFactor = 16;
-
 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's max interleave factor for "
@@ -209,7 +211,7 @@ static cl::opt<unsigned> SmallLoopCost(
         "The cost of a loop that is considered 'small' by the interleaver."));
 
 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
-    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
     cl::desc("Enable the use of the block frequency analysis to access PGO "
              "heuristics minimizing code growth in cold regions and being more "
              "aggressive in hot regions."));
@@ -238,71 +240,21 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
-    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum allowed number of runtime memory checks with a "
-             "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
-    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
-    cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
-    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum number of SCEV checks allowed with a "
-             "vectorize(enable) pragma"));
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis
-createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
-                     Instruction *I = nullptr) {
-  Value *CodeRegion = TheLoop->getHeader();
-  DebugLoc DL = TheLoop->getStartLoc();
-
-  if (I) {
-    CodeRegion = I->getParent();
-    // If there is no debug location attached to the instruction, revert back to
-    // using the loop's.
-    if (I->getDebugLoc())
-      DL = I->getDebugLoc();
-  }
-
-  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
-  R << "loop not vectorized: ";
-  return R;
-}
-
-namespace {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class LoopVectorizationRequirements;
-
-} // end anonymous namespace
-
-/// Returns true if the given loop body has a cycle, excluding the loop
-/// itself.
-static bool hasCyclesInLoopBody(const Loop &L) {
-  if (!L.empty())
-    return true;
-
-  for (const auto &SCC :
-       make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
-                  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
-    if (SCC.size() > 1) {
-      DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
-      DEBUG(L.dump());
-      return true;
-    }
-  }
-  return false;
-}
+static cl::opt<bool> EnableVPlanNativePath(
+    "enable-vplan-native-path", cl::init(false), cl::Hidden,
+    cl::desc("Enable VPlan-native vectorization path with "
+             "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path.
-vplan-verify-hcfg can also be used to enable the +// verification of the H-CFGs built. +static cl::opt<bool> VPlanBuildStressTest( + "vplan-build-stress-test", cl::init(false), cl::Hidden, + cl::desc( + "Build VPlan for every supported loop nest in the function and bail " + "out right after the build (stress test the VPlan H-CFG construction " + "in the VPlan-native vectorization path).")); /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return @@ -317,16 +269,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) { // in the project. They can be effectively organized in a common Load/Store // utilities unit. -/// A helper function that returns the pointer operand of a load or store -/// instruction. -static Value *getPointerOperand(Value *I) { - if (auto *LI = dyn_cast<LoadInst>(I)) - return LI->getPointerOperand(); - if (auto *SI = dyn_cast<StoreInst>(I)) - return SI->getPointerOperand(); - return nullptr; -} - /// A helper function that returns the type of loaded or stored value. static Type *getMemInstValueType(Value *I) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -373,7 +315,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { /// A helper function that returns the reciprocal of the block probability of /// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for for every X iterations of the loop header. +/// will execute once for every X iterations of the loop header. /// /// TODO: We should use actual block probability here, if available. Currently, /// we always assume predicated blocks have a 50% chance of executing. @@ -502,7 +444,7 @@ public: void vectorizeMemoryInstruction(Instruction *Instr, VectorParts *BlockInMask = nullptr); - /// \brief Set the debug location in the builder using the debug location in + /// Set the debug location in the builder using the debug location in /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); @@ -538,7 +480,7 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); - /// \brief The Loop exit block may have single value PHI nodes with some + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for /// each predecessor of its parent basic block. See PR14725. @@ -573,9 +515,9 @@ protected: /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and /// \p EntryVal is the value from the original loop that maps to the steps. - /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it - /// can be a truncate instruction). - void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, + /// Note that \p EntryVal doesn't have to be an induction variable - it + /// can also be a truncate instruction. + void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID); /// Create a vector induction phi node based on an existing scalar one. \p @@ -602,10 +544,20 @@ protected: /// vector loop for both the Phi and the cast. /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 
-  void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
-                                              Value *VectorLoopValue,
-                                              unsigned Part,
-                                              unsigned Lane = UINT_MAX);
+  ///
+  /// \p EntryVal is the value from the original loop that maps to the vector
+  /// phi node and is used to distinguish which IV is currently being
+  /// processed - the original one (if \p EntryVal is a phi corresponding to
+  /// the original IV) or the "newly-created" one based on the proof mentioned
+  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
+  /// In the latter case \p EntryVal is a TruncInst and we must not record
+  /// anything for that IV, but it's error-prone to expect callers of this
+  /// routine to care about that, hence this explicit parameter.
+  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+                                             const Instruction *EntryVal,
+                                             Value *VectorLoopValue,
+                                             unsigned Part,
+                                             unsigned Lane = UINT_MAX);
 
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
@@ -646,7 +598,7 @@ protected:
   /// loop.
   void addMetadata(Instruction *To, Instruction *From);
 
-  /// \brief Similar to the previous function but it adds the metadata to a
+  /// Similar to the previous function but it adds the metadata to a
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
@@ -679,7 +631,7 @@ protected:
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
-  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+  /// LoopVersioning. It's only set up (non-null) if memchecks were
   /// used.
   ///
   /// This is currently only used to add no-alias metadata based on the
@@ -777,7 +729,7 @@ private:
 
 } // end namespace llvm
 
-/// \brief Look for a meaningful debug location on the instruction or it's
+/// Look for a meaningful debug location on the instruction or its
 /// operands.
 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
@@ -849,7 +801,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
 
 namespace llvm {
 
-/// \brief The group of interleaved loads/stores sharing the same stride and
+/// The group of interleaved loads/stores sharing the same stride and
 /// close to each other.
 ///
 /// Each member in this group has an index starting from 0, and the largest
@@ -893,7 +845,7 @@ public:
   unsigned getAlignment() const { return Align; }
   unsigned getNumMembers() const { return Members.size(); }
 
-  /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// Try to insert a new member \p Instr with index \p Index and
   /// alignment \p NewAlign. The index is related to the leader and it could be
   /// negative if it is the new leader.
  ///
@@ -927,7 +879,7 @@ public:
     return true;
   }
 
-  /// \brief Get the member with the given index \p Index
+  /// Get the member with the given index \p Index
   ///
   /// \returns nullptr if contains no such member.
   Instruction *getMember(unsigned Index) const {
@@ -938,7 +890,7 @@ public:
     return Members.find(Key)->second;
   }
 
-  /// \brief Get the index for the given member. Unlike the key in the member
+  /// Get the index for the given member. Unlike the key in the member
   /// map, the index starts from 0.
   unsigned getIndex(Instruction *Instr) const {
     for (auto I : Members)
@@ -989,7 +941,7 @@ private:
 
 namespace {
 
-/// \brief Drive the analysis of interleaved memory accesses in the loop.
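For reference, the classic shape that produces an interleave group (an illustrative example, not taken from this patch): two loads with the same stride but different offsets form a single group of factor 2, with the leader at index 0:

  // Both accesses have stride 2; A[2*i] gets index 0 (the leader) and
  // A[2*i + 1] gets index 1 in the same InterleaveGroup.
  for (int i = 0; i < n; ++i) {
    Sum += A[2 * i];      // stride 2, offset 0
    Sum += A[2 * i + 1];  // stride 2, offset 1
  }

Once grouped, the vectorizer can emit one wide load covering both members plus shuffles, instead of two strided, gather-like accesses.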
/// /// Use this class to analyze interleaved accesses only when we can vectorize /// a loop. Otherwise it's meaningless to do analysis as the vectorization @@ -1000,11 +952,12 @@ namespace { class InterleavedAccessInfo { public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, - DominatorTree *DT, LoopInfo *LI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {} + DominatorTree *DT, LoopInfo *LI, + const LoopAccessInfo *LAI) + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { - SmallSet<InterleaveGroup *, 4> DelSet; + SmallPtrSet<InterleaveGroup *, 4> DelSet; // Avoid releasing a pointer twice. for (auto &I : InterleaveGroupMap) DelSet.insert(I.second); @@ -1012,16 +965,16 @@ public: delete Ptr; } - /// \brief Analyze the interleaved accesses and collect them in interleave + /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(const ValueToValueMap &Strides); + void analyzeInterleaving(); - /// \brief Check if \p Instr belongs to any interleave group. + /// Check if \p Instr belongs to any interleave group. bool isInterleaved(Instruction *Instr) const { return InterleaveGroupMap.count(Instr); } - /// \brief Get the interleave group that \p Instr belongs to. + /// Get the interleave group that \p Instr belongs to. /// /// \returns nullptr if doesn't have such group. InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { @@ -1030,13 +983,10 @@ public: return nullptr; } - /// \brief Returns true if an interleaved group that may access memory + /// Returns true if an interleaved group that may access memory /// out-of-bounds requires a scalar epilogue iteration for correctness. bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; } - /// \brief Initialize the LoopAccessInfo used for dependence checking. - void setLAI(const LoopAccessInfo *Info) { LAI = Info; } - private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. /// Simplifies SCEV expressions in the context of existing SCEV assumptions. @@ -1047,7 +997,7 @@ private: Loop *TheLoop; DominatorTree *DT; LoopInfo *LI; - const LoopAccessInfo *LAI = nullptr; + const LoopAccessInfo *LAI; /// True if the loop may contain non-reversed interleaved groups with /// out-of-bounds accesses. We ensure we don't speculatively access memory @@ -1061,7 +1011,7 @@ private: /// access to a set of dependent sink accesses. DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences; - /// \brief The descriptor for a strided memory access. + /// The descriptor for a strided memory access. struct StrideDescriptor { StrideDescriptor() = default; StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, @@ -1081,10 +1031,10 @@ private: unsigned Align = 0; }; - /// \brief A type for holding instructions and their stride descriptors. + /// A type for holding instructions and their stride descriptors. using StrideEntry = std::pair<Instruction *, StrideDescriptor>; - /// \brief Create a new interleave group with the given instruction \p Instr, + /// Create a new interleave group with the given instruction \p Instr, /// stride \p Stride and alignment \p Align. /// /// \returns the newly created interleave group. @@ -1096,7 +1046,7 @@ private: return InterleaveGroupMap[Instr]; } - /// \brief Release the group and remove all the relationships. + /// Release the group and remove all the relationships. 
void releaseGroup(InterleaveGroup *Group) { for (unsigned i = 0; i < Group->getFactor(); i++) if (Instruction *Member = Group->getMember(i)) @@ -1105,28 +1055,28 @@ private: delete Group; } - /// \brief Collect all the accesses with a constant stride in program order. + /// Collect all the accesses with a constant stride in program order. void collectConstStrideAccesses( MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, const ValueToValueMap &Strides); - /// \brief Returns true if \p Stride is allowed in an interleaved group. + /// Returns true if \p Stride is allowed in an interleaved group. static bool isStrided(int Stride) { unsigned Factor = std::abs(Stride); return Factor >= 2 && Factor <= MaxInterleaveGroupFactor; } - /// \brief Returns true if \p BB is a predicated block. + /// Returns true if \p BB is a predicated block. bool isPredicated(BasicBlock *BB) const { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } - /// \brief Returns true if LoopAccessInfo can be used for dependence queries. + /// Returns true if LoopAccessInfo can be used for dependence queries. bool areDependencesValid() const { return LAI && LAI->getDepChecker().getDependences(); } - /// \brief Returns true if memory accesses \p A and \p B can be reordered, if + /// Returns true if memory accesses \p A and \p B can be reordered, if /// necessary, when constructing interleaved groups. /// /// \p A must precede \p B in program order. We return false if reordering is @@ -1174,7 +1124,7 @@ private: return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink); } - /// \brief Collect the dependences from LoopAccessInfo. + /// Collect the dependences from LoopAccessInfo. /// /// We process the dependences once during the interleaved access analysis to /// enable constant-time dependence queries. @@ -1187,315 +1137,6 @@ private: } }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char *Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char *Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) {} - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - case HK_ISVECTORIZED: - return (Val==0 || Val==1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - - /// Vectorization interleave factor. - Hint Interleave; - - /// Vectorization forced - Hint Force; - - /// Already Vectorized - Hint IsVectorized; - - /// Return the loop metadata prefix. - static StringRef Prefix() { return "llvm.loop."; } - - /// True if there is any unsafe math in the loop. 
- bool PotentiallyUnsafe = false; - -public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving, - OptimizationRemarkEmitter &ORE) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - if (IsVectorized.Value != 1) - // If the vectorization width and interleaving count are both 1 then - // consider the loop to have been already vectorized because there's - // nothing more that we can do. - IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - IsVectorized.Value = 1; - Hint Hints[] = {IsVectorized}; - writeHintsToMetadata(Hints); - } - - bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { - if (getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitRemarkWithHints(); - return false; - } - - if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitRemarkWithHints(); - return false; - } - - if (getIsVectorized() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - // FIXME: Add interleave.disable metadata. This will allow - // vectorize.disable to be used without disabling the pass and errors - // to differentiate between disabled vectorization and a width of 1. - ORE.emit([&]() { - return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), - "AllDisabled", L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: vectorization and interleaving are " - "explicitly disabled, or the loop has already been " - "vectorized"; - }); - return false; - } - - return true; - } - - /// Dumps all the hint information. 
- void emitRemarkWithHints() const { - using namespace ore; - - ORE.emit([&]() { - if (Force.Value == LoopVectorizeHints::FK_Disabled) - return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "loop not vectorized: vectorization is explicitly disabled"; - else { - OptimizationRemarkMissed R(LV_NAME, "MissedDetails", - TheLoop->getStartLoc(), - TheLoop->getHeader()); - R << "loop not vectorized"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=" << NV("Force", true); - if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); - if (Interleave.Value != 0) - R << ", Interleave Count=" - << NV("InterleaveCount", Interleave.Value); - R << ")"; - } - return R; - } - }); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - unsigned getIsVectorized() const { return IsVectorized.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - - /// \brief If hints are provided that force vectorization, use the AlwaysPrint - /// pass name to force the frontend to print the diagnostic. - const char *vectorizeAnalysisPassName() const { - if (getWidth() == 1) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Disabled) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) - return LV_NAME; - return OptimizationRemarkAnalysis::AlwaysPrint; - } - - bool allowReordering() const { - // When enabling loop hints are provided we allow the vectorizer to change - // the order of operations that is given by the scalar loop. This is not - // enabled by default because can be unsafe or inefficient. For example, - // reordering floating-point operations will change the way round-off - // error accumulates in the loop. - return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; - } - - bool isPotentiallyUnsafe() const { - // Avoid FP vectorization if the target is unsure about proper support. - // This may be related to the SIMD unit in the target not handling - // IEEE 754 FP ops properly, or bad single-to-double promotions. - // Otherwise, a sequence of vectorized loops, even without reduction, - // could lead to different end results on the destination vectors. - return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe; - } - - void setPotentiallyUnsafe() { PotentiallyUnsafe = true; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector<Metadata *, 4> Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. 
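The hint-parsing code being relocated by this patch consumes metadata that originates from source-level pragmas; for reference (an illustrative snippet, not from the patch):

  // Clang lowers these into "llvm.loop.vectorize.enable",
  // "llvm.loop.vectorize.width" and "llvm.loop.interleave.count" entries of
  // the loop's !llvm.loop metadata, which is what the parser below matches
  // against Prefix() ("llvm.loop.").
  #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + c[i];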
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast<MDString>(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast<MDString>(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) - return; - unsigned Val = C->getZExtValue(); - - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } - } - } - - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); - } - - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { - MDString *Name = dyn_cast<MDString>(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.empty()) - return; - - // Reserve the first element to LoopID (see below). - SmallVector<Metadata *, 4> MDs(1); - // If the loop already has metadata, then ignore the existing operands. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; - } // end anonymous namespace static void emitMissedWarning(Function *F, Loop *L, @@ -1519,324 +1160,7 @@ static void emitMissedWarning(Function *F, Loop *L, } } -namespace { - -/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and -/// to what vectorization factor. -/// This class does not look at the profitability of vectorization, only the -/// legality. 
This class has two main kinds of checks: -/// * Memory checks - The code in canVectorizeMemory checks if vectorization -/// will change the order of memory accesses in a way that will change the -/// correctness of the program. -/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory -/// checks for a number of different conditions, such as the availability of a -/// single induction variable, that all types are supported and vectorize-able, -/// etc. This code reflects the capabilities of InnerLoopVectorizer. -/// This class is also used by InnerLoopVectorizer for identifying -/// induction variable and the different reduction variables. -class LoopVectorizationLegality { -public: - LoopVectorizationLegality( - Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, - const TargetTransformInfo *TTI, - std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, - OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, - LoopVectorizeHints *H) - : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), - ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {} - - /// ReductionList contains the reduction descriptors for all - /// of the reductions that were found in the loop. - using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>; - - /// InductionList saves induction variables and maps them to the - /// induction descriptor. - using InductionList = MapVector<PHINode *, InductionDescriptor>; - - /// RecurrenceSet contains the phi nodes that are recurrences other than - /// inductions and reductions. - using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; - - /// Returns true if it is legal to vectorize this loop. - /// This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. - bool canVectorize(); - - /// Returns the primary induction variable. - PHINode *getPrimaryInduction() { return PrimaryInduction; } - - /// Returns the reduction variables found in the loop. - ReductionList *getReductionVars() { return &Reductions; } - - /// Returns the induction variables found in the loop. - InductionList *getInductionVars() { return &Inductions; } - - /// Return the first-order recurrences found in the loop. - RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } - - /// Return the set of instructions to sink to handle first-order recurrences. - DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } - - /// Returns the widest induction type. - Type *getWidestInductionType() { return WidestIndTy; } - - /// Returns True if V is a Phi node of an induction variable in this loop. - bool isInductionPhi(const Value *V); - - /// Returns True if V is a cast that is part of an induction def-use chain, - /// and had been proven to be redundant under a runtime guard (in other - /// words, the cast has the same SCEV expression as the induction phi). - bool isCastedInductionVariable(const Value *V); - - /// Returns True if V can be considered as an induction variable in this - /// loop. V can be the induction phi, or some redundant cast in the def-use - /// chain of the inducion phi. - bool isInductionVariable(const Value *V); - - /// Returns True if PN is a reduction variable in this loop. - bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } - - /// Returns True if Phi is a first-order recurrence in this loop. 
- bool isFirstOrderRecurrence(const PHINode *Phi); - - /// Return true if the block BB needs to be predicated in order for the loop - /// to be vectorized. - bool blockNeedsPredication(BasicBlock *BB); - - /// Check if this pointer is consecutive when vectorizing. This happens - /// when the last index of the GEP is the induction variable, or that the - /// pointer itself is an induction variable. - /// This check allows us to vectorize A[idx] into a wide load/store. - /// Returns: - /// 0 - Stride is unknown or non-consecutive. - /// 1 - Address is consecutive. - /// -1 - Address is consecutive, and decreasing. - /// NOTE: This method must only be used before modifying the original scalar - /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). - int isConsecutivePtr(Value *Ptr); - - /// Returns true if the value V is uniform within the loop. - bool isUniform(Value *V); - - /// Returns the information that we collected about runtime memory check. - const RuntimePointerChecking *getRuntimePointerChecking() const { - return LAI->getRuntimePointerChecking(); - } - - const LoopAccessInfo *getLAI() const { return LAI; } - - /// \brief Check if \p Instr belongs to any interleaved access group. - bool isAccessInterleaved(Instruction *Instr) { - return InterleaveInfo.isInterleaved(Instr); - } - - /// \brief Get the interleaved access group that \p Instr belongs to. - const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { - return InterleaveInfo.getInterleaveGroup(Instr); - } - - /// \brief Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps. - bool requiresScalarEpilogue() const { - return InterleaveInfo.requiresScalarEpilogue(); - } - - unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } - - uint64_t getMaxSafeRegisterWidth() const { - return LAI->getDepChecker().getMaxSafeRegisterWidth(); - } - - bool hasStride(Value *V) { return LAI->hasStride(V); } - - /// Returns true if the target machine supports masked store operation - /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); - } - - /// Returns true if the target machine supports masked load operation - /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); - } - - /// Returns true if the target machine supports masked scatter operation - /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType) { - return TTI->isLegalMaskedScatter(DataType); - } - - /// Returns true if the target machine supports masked gather operation - /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI->isLegalMaskedGather(DataType); - } - - /// Returns true if the target machine can represent \p V as a masked gather - /// or scatter operation. - bool isLegalGatherOrScatter(Value *V) { - auto *LI = dyn_cast<LoadInst>(V); - auto *SI = dyn_cast<StoreInst>(V); - if (!LI && !SI) - return false; - auto *Ptr = getPointerOperand(V); - auto *Ty = cast<PointerType>(Ptr->getType())->getElementType(); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); - } - - /// Returns true if vector representation of the instruction \p I - /// requires mask. 
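An illustrative contrast between the two families of legality queries above (plain C fragments, not from the patch):

  // Consecutive but conditional accesses can use a masked load/store:
  // the address a + i advances by one element per iteration.
  if (c[i]) s += a[i];        // isLegalMaskedLoad on a consecutive pointer
  // Arbitrary per-lane addresses need a gather (or scatter for stores):
  if (c[i]) s += a[idx[i]];   // isLegalMaskedGather on the element type

The isMaskRequired accessor declared just below records that per-instruction decision.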
- bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } - - unsigned getNumStores() const { return LAI->getNumStores(); } - unsigned getNumLoads() const { return LAI->getNumLoads(); } - unsigned getNumPredStores() const { return NumPredStores; } - - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - bool isScalarWithPredication(Instruction *I); - - /// Returns true if \p I is a memory instruction with consecutive memory - /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); - - // Returns true if the NoNaN attribute is set on the function. - bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } - -private: - /// Check if a single basic block loop is vectorizable. - /// At this point we know that this is a loop with a constant trip count - /// and we only need to check individual instructions. - bool canVectorizeInstrs(); - - /// When we vectorize loops we may change the order in which - /// we read and write from memory. This method checks if it is - /// legal to vectorize the code, considering only memory constrains. - /// Returns true if the loop is vectorizable - bool canVectorizeMemory(); - - /// Return true if we can vectorize this loop using the IF-conversion - /// transformation. - bool canVectorizeWithIfConvert(); - - /// Return true if all of the instructions in the block can be speculatively - /// executed. \p SafePtrs is a list of addresses that are known to be legal - /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - - /// Updates the vectorization state by adding \p Phi to the inductions list. - /// This can set \p Phi as the main induction of the loop if \p Phi is a - /// better choice for the main induction than the existing one. - void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl<Value *> &AllowedExit); - - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. If \p I is passed it is - /// an instruction that prevents vectorization. Otherwise the loop is used - /// for the location of the remark. \return the remark object that can be - /// streamed to. - OptimizationRemarkAnalysis - createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop, I); - } - - /// \brief If an access has a symbolic strides, this maps the pointer value to - /// the stride symbol. - const ValueToValueMap *getSymbolicStrides() { - // FIXME: Currently, the set of symbolic strides is sometimes queried before - // it's collected. This happens from canVectorizeWithIfConvert, when the - // pointer is checked to reference consecutive elements suitable for a - // masked access. - return LAI ? &LAI->getSymbolicStrides() : nullptr; - } - - unsigned NumPredStores = 0; - - /// The loop that we evaluate. - Loop *TheLoop; - - /// A wrapper around ScalarEvolution used to add runtime SCEV checks. - /// Applies dynamic knowledge to simplify SCEV expressions in the context - /// of existing SCEV assumptions. The analysis will also add a minimal set - /// of new predicates if this is required to enable vectorization and - /// unrolling. - PredicatedScalarEvolution &PSE; - - /// Target Library Info. 
- TargetLibraryInfo *TLI; - - /// Target Transform Info - const TargetTransformInfo *TTI; - - /// Dominator Tree. - DominatorTree *DT; - - // LoopAccess analysis. - std::function<const LoopAccessInfo &(Loop &)> *GetLAA; - - // And the loop-accesses info corresponding to this loop. This pointer is - // null until canVectorizeMemory sets it up. - const LoopAccessInfo *LAI = nullptr; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter *ORE; - - /// The interleave access information contains groups of interleaved accesses - /// with the same stride and close to each other. - InterleavedAccessInfo InterleaveInfo; - - // --- vectorization state --- // - - /// Holds the primary induction variable. This is the counter of the - /// loop. - PHINode *PrimaryInduction = nullptr; - - /// Holds the reduction variables. - ReductionList Reductions; - - /// Holds all of the induction variables that we found in the loop. - /// Notice that inductions don't need to start at zero and that induction - /// variables can be pointers. - InductionList Inductions; - - /// Holds all the casts that participate in the update chain of the induction - /// variables, and that have been proven to be redundant (possibly under a - /// runtime guard). These casts can be ignored when creating the vectorized - /// loop body. - SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; - - /// Holds the phi nodes that are first-order recurrences. - RecurrenceSet FirstOrderRecurrences; - - /// Holds instructions that need to sink past other instructions to handle - /// first-order recurrences. - DenseMap<Instruction *, Instruction *> SinkAfter; - - /// Holds the widest induction type encountered. - Type *WidestIndTy = nullptr; - - /// Allowed outside users. This holds the induction and reduction - /// vars which can be accessed from outside the loop. - SmallPtrSet<Value *, 4> AllowedExit; - - /// Can we assume the absence of NaNs. - bool HasFunNoNaNAttr = false; - - /// Vectorization requirements that will go through late-evaluation. - LoopVectorizationRequirements *Requirements; - - /// Used to emit an analysis of any legality issues. - LoopVectorizeHints *Hints; - - /// While vectorizing these instructions we have to generate a - /// call to the appropriate masked intrinsic - SmallPtrSet<const Instruction *, 8> MaskedOp; -}; +namespace llvm { /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1853,23 +1177,15 @@ public: const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, - const LoopVectorizeHints *Hints) + const LoopVectorizeHints *Hints, + InterleavedAccessInfo &IAI) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization should be avoided up front. Optional<unsigned> computeMaxVF(bool OptForSize); - /// Information about vectorization costs - struct VectorizationFactor { - // Vector width with best cost - unsigned Width; - - // Cost of the loop with that width - unsigned Cost; - }; - /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is @@ -1903,7 +1219,7 @@ public: /// avoid redundant calculations. 
void setCostBasedWideningDecision(unsigned VF); - /// \brief A struct that represents some properties of the register usage + /// A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. @@ -1911,9 +1227,6 @@ public: /// Holds the maximum number of concurrent live intervals in the loop. unsigned MaxLocalUsers; - - /// Holds the number of instructions in the loop. - unsigned NumInstructions; }; /// \return Returns information about the register usages of the loop for the @@ -2063,7 +1376,69 @@ public: collectLoopScalars(VF); } + /// Returns true if the target machine supports masked store operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedStore(Type *DataType, Value *Ptr) { + return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + } + + /// Returns true if the target machine supports masked load operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { + return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + } + + /// Returns true if the target machine supports masked scatter operation + /// for the given \p DataType. + bool isLegalMaskedScatter(Type *DataType) { + return TTI.isLegalMaskedScatter(DataType); + } + + /// Returns true if the target machine supports masked gather operation + /// for the given \p DataType. + bool isLegalMaskedGather(Type *DataType) { + return TTI.isLegalMaskedGather(DataType); + } + + /// Returns true if the target machine can represent \p V as a masked gather + /// or scatter operation. + bool isLegalGatherOrScatter(Value *V) { + bool LI = isa<LoadInst>(V); + bool SI = isa<StoreInst>(V); + if (!LI && !SI) + return false; + auto *Ty = getMemInstValueType(V); + return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + } + + /// Returns true if \p I is an instruction that will be scalarized with + /// predication. Such instructions include conditional stores and + /// instructions that may divide by zero. + bool isScalarWithPredication(Instruction *I); + + /// Returns true if \p I is a memory instruction with consecutive memory + /// access that can be widened. + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + + /// Check if \p Instr belongs to any interleaved access group. + bool isAccessInterleaved(Instruction *Instr) { + return InterleaveInfo.isInterleaved(Instr); + } + + /// Get the interleaved access group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return InterleaveInfo.getInterleaveGroup(Instr); + } + + /// Returns true if an interleaved group requires a scalar iteration + /// to handle accesses with gaps. + bool requiresScalarEpilogue() const { + return InterleaveInfo.requiresScalarEpilogue(); + } + private: + unsigned NumPredStores = 0; + /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); @@ -2115,12 +1490,16 @@ private: /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); + /// Returns true if an artificially high cost for emulated masked memrefs + /// should be used. 
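To make isScalarWithPredication concrete, the canonical case is a conditional division (an illustrative snippet, not from the patch):

  // The lanes where c[i] == 0 must not execute the division at all, so the
  // vectorizer keeps the udiv scalar and wraps it in a per-lane branch
  // instead of emitting a vector division.
  for (int i = 0; i < n; ++i)
    if (c[i] != 0)
      r[i] = a[i] / c[i];

When the predicated instruction is a memory access that must be emulated with such branches, the artificially high cost mentioned above (useEmulatedMaskMemRefHack, declared next) steers the cost model away from vectorizing it.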
+ bool useEmulatedMaskMemRefHack(Instruction *I); + /// Create an analysis remark that explains why vectorization failed /// /// \p RemarkName is the identifier for the remark. \return the remark object /// that can be streamed to. OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), RemarkName, TheLoop); } @@ -2222,6 +1601,10 @@ public: /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// The interleave access information contains groups of interleaved accesses + /// with the same stride and close to each other. + InterleavedAccessInfo &InterleaveInfo; + /// Values to ignore in the cost model. SmallPtrSet<const Value *, 16> ValuesToIgnore; @@ -2229,271 +1612,78 @@ public: SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -} // end anonymous namespace - -namespace llvm { - -/// InnerLoopVectorizer vectorizes loops which contain only one basic -/// LoopVectorizationPlanner - drives the vectorization process after having -/// passed Legality checks. -/// The planner builds and optimizes the Vectorization Plans which record the -/// decisions how to vectorize the given loop. In particular, represent the -/// control-flow of the vectorized version, the replication of instructions that -/// are to be scalarized, and interleave access groups. -class LoopVectorizationPlanner { - /// The loop that we evaluate. - Loop *OrigLoop; - - /// Loop Info analysis. - LoopInfo *LI; - - /// Target Library Info. - const TargetLibraryInfo *TLI; - - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The legality analysis. - LoopVectorizationLegality *Legal; - - /// The profitablity analysis. - LoopVectorizationCostModel &CM; - - using VPlanPtr = std::unique_ptr<VPlan>; - - SmallVector<VPlanPtr, 4> VPlans; - - /// This class is used to enable the VPlan to invoke a method of ILV. This is - /// needed until the method is refactored out of ILV and becomes reusable. - struct VPCallbackILV : public VPCallback { - InnerLoopVectorizer &ILV; - - VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} - - Value *getOrCreateVectorValues(Value *V, unsigned Part) override { - return ILV.getOrCreateVectorValue(V, Part); - } - }; - - /// A builder used to construct the current plan. - VPBuilder Builder; - - /// When we if-convert we need to create edge masks. We have to cache values - /// so that we don't end up with exponential recursion/IR. Note that - /// if-conversion currently takes place during VPlan-construction, so these - /// caches are only used at that stage. - using EdgeMaskCacheTy = - DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>; - using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>; - EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy BlockMaskCache; - - unsigned BestVF = 0; - unsigned BestUF = 0; - -public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, - LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} - - /// Plan how to best vectorize, return the best VF and its cost. - LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, - unsigned UserVF); - - /// Finalize the best decision and dispose of all other VPlans. 
- void setBestPlan(unsigned VF, unsigned UF); - - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected VPlan. - void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - - void printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) - O << *Plan; - } - -protected: - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - - /// A range of powers-of-2 vectorization factors with fixed start and - /// adjustable end. The range includes start and excludes end, e.g.,: - /// [1, 9) = {1, 2, 4, 8} - struct VFRange { - // A power of 2. - const unsigned Start; - - // Need not be a power of 2. If End <= Start range is empty. - unsigned End; - }; - - /// Test a \p Predicate on a \p Range of VF's. Return the value of applying - /// \p Predicate on Range.Start, possibly decreasing Range.End such that the - /// returned value holds for the entire \p Range. - bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, - VFRange &Range); - - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); - -private: - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan); - - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); - - // Check if \I is a memory instruction to be widened for \p Range.Start and - // potentially masked. Such instructions are handled by a recipe that takes an - // additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan); - - /// Check if an induction recipe should be constructed for \I within the given - /// VF \p Range. If so build and return it. If not, return null. \p Range.End - /// may be decreased to ensure same decision from \p Range.Start to - /// \p Range.End. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, - VFRange &Range); - - /// Handle non-loop phi nodes. Currently all such phi nodes are turned into - /// a sequence of select instructions as the vectorizer currently performs - /// full if-conversion. - VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); - - /// Check if \p I can be widened within the given VF \p Range. 
If \p I can be - /// widened for \p Range.Start, check if the last recipe of \p VPBB can be - /// extended to include \p I or else build a new VPWidenRecipe for it and - /// append it to \p VPBB. Return true if \p I can be widened for Range.Start, - /// false otherwise. Range.End may be decreased to ensure same decision from - /// \p Range.Start to \p Range.End. - bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range); - - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it - /// is predicated. \return \p VPBB augmented with this new recipe if \p I is - /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new - /// Region. Update the packing decision of predicated instructions if they - /// feed \p I. Range.End may be decreased to ensure same recipe behavior from - /// \p Range.Start to \p Range.End. - VPBasicBlock *handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, - VPlanPtr &Plan); - - /// Create a replicating region for instruction \p I that requires - /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. - VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, - VPlanPtr &Plan); - - /// Build a VPlan according to the information gathered by Legal. \return a - /// VPlan for vectorization factors \p Range.Start and up to \p Range.End - /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range, - const SmallPtrSetImpl<Value *> &NeedDef); -}; - } // end namespace llvm -namespace { - -/// \brief This holds vectorization requirements that must be verified late in -/// the process. The requirements are set by legalize and costmodel. Once -/// vectorization has been determined to be possible and profitable the -/// requirements can be verified by looking for metadata or compiler options. -/// For example, some loops require FP commutativity which is only allowed if -/// vectorization is explicitly specified or if the fast-math compiler option -/// has been provided. -/// Late evaluation of these requirements allows helpful diagnostics to be -/// composed that tells the user what need to be done to vectorize the loop. For -/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late -/// evaluation should be used only when diagnostics can generated that can be -/// followed by a non-expert user. -class LoopVectorizationRequirements { -public: - LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} - - void addUnsafeAlgebraInst(Instruction *I) { - // First unsafe algebra instruction. - if (!UnsafeAlgebraInst) - UnsafeAlgebraInst = I; - } - - void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - - bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *PassName = Hints.vectorizeAnalysisPassName(); - bool Failed = false; - if (UnsafeAlgebraInst && !Hints.allowReordering()) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisFPCommute( - PassName, "CantReorderFPOps", - UnsafeAlgebraInst->getDebugLoc(), - UnsafeAlgebraInst->getParent()) - << "loop not vectorized: cannot prove it is safe to reorder " - "floating-point operations"; - }); - Failed = true; - } - - // Test if runtime memcheck thresholds are exceeded. 
- bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", - L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Failed = true; - } +// Return true if \p OuterLp is an outer loop annotated with hints for explicit +// vectorization. The loop needs to be annotated with #pragma omp simd +// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the +// vector length information is not provided, vectorization is not considered +// explicit. Interleave hints are not allowed either. These limitations will be +// relaxed in the future. +// Please note that we are currently forced to abuse the pragma 'clang loop +// vectorize' semantics. This pragma provides *auto-vectorization hints* +// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' +// provides *explicit vectorization hints* (LV can bypass legal checks and +// assume that vectorization is legal). However, both hints are implemented +// using the same metadata (llvm.loop.vectorize, processed by +// LoopVectorizeHints). This will be fixed in the future when the native IR +// representation for pragma 'omp simd' is introduced. +static bool isExplicitVecOuterLoop(Loop *OuterLp, + OptimizationRemarkEmitter *ORE) { + assert(!OuterLp->empty() && "This is not an outer loop"); + LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); + + // Only outer loops with an explicit vectorization hint are supported. + // Unannotated outer loops are ignored. + if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) + return false; - return Failed; + Function *Fn = OuterLp->getHeader()->getParent(); + if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) { + LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); + return false; } -private: - unsigned NumRuntimePointerChecks = 0; - Instruction *UnsafeAlgebraInst = nullptr; + if (!Hints.getWidth()) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; + if (Hints.getInterleave() > 1) { + // TODO: Interleave support is future work. + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " + "outer loops.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } -} // end anonymous namespace + return true; +} -static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { - if (L.empty()) { - if (!hasCyclesInLoopBody(L)) +static void collectSupportedLoops(Loop &L, LoopInfo *LI, + OptimizationRemarkEmitter *ORE, + SmallVectorImpl<Loop *> &V) { + // Collect inner loops and outer loops without irreducible control flow. For + // now, only collect outer loops that have explicit vectorization hints. If we + // are stress testing the VPlan H-CFG construction, we collect the outermost + // loop of every loop nest.
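// For illustration only (hypothetical source, not part of this patch): the
// explicit outer-loop path described above is aimed at annotations like the
// following, where the width hint is present and no interleave count is
// requested, so isExplicitVecOuterLoop() would accept the 'i' loop.
//
//   void scale(float *A, int N, int M) {
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//     for (int i = 0; i < N; ++i)     // annotated outer loop
//       for (int j = 0; j < M; ++j)   // inner loop
//         A[i * M + j] *= 2.0f;
//   }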
+ if (L.empty() || VPlanBuildStressTest || + (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { + LoopBlocksRPO RPOT(&L); + RPOT.perform(LI); + if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { V.push_back(&L); - return; + // TODO: Collect inner loops inside marked outer loops in case + // vectorization fails for the outer loop. Do not invoke + // 'containsIrreducibleCFG' again for inner loops when the outer loop is + // already known to be reducible. We can use an inherited attribute for + // that. + return; + } } for (Loop *InnerL : L) - addAcyclicInnerLoop(*InnerL, V); + collectSupportedLoops(*InnerL, LI, ORE, V); } namespace { @@ -2562,14 +1752,16 @@ struct LoopVectorize : public FunctionPass { //===----------------------------------------------------------------------===// Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // We need to place the broadcast of invariant variables outside the loop. + // We need to place the broadcast of invariant variables outside the loop, + // but only if it's proven safe to do so. Otherwise, the broadcast will be + // placed inside the vector loop body. Instruction *Instr = dyn_cast<Instruction>(V); - bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); - bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; - + bool SafeToHoist = OrigLoop->isLoopInvariant(V) && + (!Instr || + DT->dominates(Instr->getParent(), LoopVectorPreHeader)); // Place the code for broadcasting invariant variables in the new preheader. IRBuilder<>::InsertPointGuard Guard(Builder); - if (Invariant) + if (SafeToHoist) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. @@ -2580,6 +1772,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); Value *Start = II.getStartValue(); // Construct the initial value of the vector IV in the vector loop preheader @@ -2627,14 +1821,18 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); - recordVectorLoopValueForInductionCast(II, LastInduction, Part); + if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); + recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); + LastInduction = cast<Instruction>(addFastMathFlag( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } // Move the last step to the end of the latch block.
This ensures consistent @@ -2665,8 +1863,20 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { } void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, - unsigned Lane) { + const InductionDescriptor &ID, const Instruction *EntryVal, + Value *VectorLoopVal, unsigned Part, unsigned Lane) { + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // This induction variable is not the phi from the original loop but the + // newly-created IV, based on the proof that the casted Phi is equal to the + // uncasted Phi in the vectorized loop (possibly under a runtime guard). It + // reuses the same InductionDescriptor as the original IV, but we don't have + // to do any recording in this case: that is done when the original IV is + // processed. + if (isa<TruncInst>(EntryVal)) + return; + const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); if (Casts.empty()) return; @@ -2754,15 +1964,16 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // If we haven't yet vectorized the induction variable, splat the scalar // induction variable, and build the necessary step vectors. + // TODO: Don't do it unless the vectorized IV is really required. if (!VectorizedIV) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); - recordVectorLoopValueForInductionCast(ID, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); + recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); } } @@ -2833,7 +2044,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Value *EntryVal, + Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); @@ -2868,25 +2079,11 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); - recordVectorLoopValueForInductionCast(ID, Add, Part, Lane); + recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); } } } -int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - const ValueToValueMap &Strides = getSymbolicStrides() ?
*getSymbolicStrides() : - ValueToValueMap(); - - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); - if (Stride == 1 || Stride == -1) - return Stride; - return 0; -} - -bool LoopVectorizationLegality::isUniform(Value *V) { - return LAI->isUniform(V); -} - Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); @@ -3046,7 +2243,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { - const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); // Skip if current instruction is not the insert position. @@ -3054,7 +2251,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getPointerOperand(Instr); + Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); @@ -3076,6 +2273,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; Part++) { Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); @@ -3091,6 +2292,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + if (InBounds) + cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); // Cast to the vector pointer type. NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); @@ -3196,7 +2399,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getPointerOperand(Instr); + Value *Ptr = getLoadStorePointerOperand(Instr); unsigned Alignment = getMemInstAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. @@ -3227,10 +2430,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (isMaskRequired) Mask = *BlockInMask; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>( + getLoadStorePointerOperand(Instr)->stripPointerCasts())) + InBounds = gep->isInBounds(); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF))); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF))); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + Mask[Part] = reverseVector(Mask[Part]); + } else { + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF))); + PartPtr->setIsInBounds(InBounds); + } + + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + // Handle Stores: if (SI) { - assert(!Legal->isUniform(SI->getPointerOperand()) && - "We do not allow storing to uniform addresses"); setDebugLocFromInst(Builder, SI); for (unsigned Part = 0; Part < UF; ++Part) { @@ -3242,30 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); } else { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - if (Reverse) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal = reverseVector(StoredVal); // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). - - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = - Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); } - - Value *VecPtr = - Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - + auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask[Part]); @@ -3289,21 +2503,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); - } - - Value *VecPtr = - Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], UndefValue::get(DataTy), @@ -3457,7 +2657,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. 
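// A worked example of the rounding performed below, with illustrative
// numbers only: take VF = 4 and UF = 2, so Step = VF * UF = 8.
//   N = 17                    -> R = 17 % 8 = 1, vector trip count = 16,
//                                leaving 1 iteration for the scalar loop.
//   N = 16, scalar epilogue   -> R = 0 is bumped up to Step = 8, vector trip
//   required                     count = 8, and the epilogue runs 8 times.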
- if (VF > 1 && Legal->requiresScalarEpilogue()) { + if (VF > 1 && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3508,8 +2708,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); @@ -3714,6 +2914,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. @@ -3871,7 +3073,7 @@ struct CSEDenseMapInfo { } // end anonymous namespace -///\brief Perform cse of induction variable instructions. +///Perform cse of induction variable instructions. static void cse(BasicBlock *BB) { // Perform simple cse. SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; @@ -3893,7 +3095,7 @@ static void cse(BasicBlock *BB) { } } -/// \brief Estimate the overhead of scalarizing an instruction. This is a +/// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, const TargetTransformInfo &TTI) { @@ -4074,7 +3276,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); - } else if (isa<LoadInst>(I)) { + } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { @@ -4089,7 +3291,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { - llvm_unreachable("Unhandled instruction type!"); + // If we don't know what to do, be conservative and don't do anything. + continue; } // Lastly, extend the result. @@ -4164,15 +3367,12 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - for (Instruction &I : *OrigLoop->getHeader()) { - PHINode *Phi = dyn_cast<PHINode>(&I); - if (!Phi) - break; + for (PHINode &Phi : OrigLoop->getHeader()->phis()) { // Handle first-order recurrences and reductions that need to be fixed. 
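// For reference, a first-order recurrence is a header phi carrying a value
// produced in the previous iteration, e.g. (hypothetical source):
//
//   float prev = init;
//   for (int i = 0; i < n; ++i) {
//     b[i] = prev + a[i];   // 'prev' holds a[i-1] from the last iteration
//     prev = a[i];
//   }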
- if (Legal->isFirstOrderRecurrence(Phi)) - fixFirstOrderRecurrence(Phi); - else if (Legal->isReductionVariable(Phi)) - fixReduction(Phi); + if (Legal->isFirstOrderRecurrence(&Phi)) + fixFirstOrderRecurrence(&Phi); + else if (Legal->isReductionVariable(&Phi)) + fixReduction(&Phi); } } @@ -4335,15 +3535,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Finally, fix users of the recurrence outside the loop. The users will need // either the last value of the scalar recurrence or the last value of the // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find the phi node for the original scalar + // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - for (auto &I : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast<PHINode>(&I); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getIncomingValue(0) == Phi) { - LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - break; + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getIncomingValue(0) == Phi) { + LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } } } @@ -4499,21 +3695,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); - LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) - break; - + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { // All PHINodes need to have a single entry edge, or two if // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); // We found a reduction value exit-PHI. Update it with the // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) + LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); } // end of the LCSSA phi scan. // Fix the scalar loop reduction variable with the incoming reduction sum @@ -4528,14 +3718,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } void InnerLoopVectorizer::fixLCSSAPHIs() { - for (Instruction &LEI : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getNumIncomingValues() == 1) { - assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && "Incoming value isn't loop invariant"); - LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock); } } } @@ -4955,7 +4142,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { default: // This instruction is not vectorized by simple widening. - DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); llvm_unreachable("Unhandled instruction!"); } // end of switch. 
} @@ -4973,467 +4160,7 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); -} - -/// \brief Check whether it is safe to if-convert this phi node. -/// -/// Phi nodes with constant expressions that can trap are not safe to if -/// convert. -static bool canIfConvertPHINodes(BasicBlock *BB) { - for (Instruction &I : *BB) { - auto *Phi = dyn_cast<PHINode>(&I); - if (!Phi) - return true; - for (Value *V : Phi->incoming_values()) - if (auto *C = dyn_cast<Constant>(V)) - if (C->canTrap()) - return false; - } - return true; -} - -bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) { - ORE->emit(createMissedAnalysis("IfConversionDisabled") - << "if-conversion is disabled"); - return false; - } - - assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - - // A list of pointers that we can safely read and write to. - SmallPtrSet<Value *, 8> SafePointes; - - // Collect safe addresses. - for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) - continue; - - for (Instruction &I : *BB) - if (auto *Ptr = getPointerOperand(&I)) - SafePointes.insert(Ptr); - } - - // Collect the blocks that need predication. - BasicBlock *Header = TheLoop->getHeader(); - for (BasicBlock *BB : TheLoop->blocks()) { - // We don't support switch statements inside loops. - if (!isa<BranchInst>(BB->getTerminator())) { - ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) - << "loop contains a switch statement"); - return false; - } - - // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } else if (BB != Header && !canIfConvertPHINodes(BB)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } - - // We can if-convert this loop. - return true; -} - -bool LoopVectorizationLegality::canVectorize() { - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - // We must have a loop in canonical form. Loops with indirectbr in them cannot - // be canonicalized. - if (!TheLoop->getLoopPreheader()) { - DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // FIXME: The code is currently dead, since the loop gets sent to - // LoopVectorizationLegality is already an innermost loop. - // - // We can only vectorize innermost loops. - if (!TheLoop->empty()) { - ORE->emit(createMissedAnalysis("NotInnermostLoop") - << "loop is not the innermost loop"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single backedge. 
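// As a reminder of what the if-conversion checked above buys (hypothetical
// source): a predicated block such as
//
//   for (int i = 0; i < n; ++i)
//     if (c[i])
//       a[i] += 1;
//
// is flattened into a single-block loop body in which the guarded store is
// executed under a mask (or its value is chosen with a select), which is the
// shape the widening code can handle.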
- if (TheLoop->getNumBackEdges() != 1) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We only handle bottom-tested loops, i.e. loop in which the condition is - // checked at the end of each iteration. With that we can assume that all - // instructions in the loop are executed the same number of times. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We need to have a loop header. - DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() - << '\n'); - - // Check if we can if-convert non-single-bb loops. - unsigned NumBlocks = TheLoop->getNumBlocks(); - if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { - DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Check if we can vectorize the instructions and CFG in this loop. - if (!canVectorizeInstrs()) { - DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Go over each instruction and look at memory deps. - if (!canVectorizeMemory()) { - DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - DEBUG(dbgs() << "LV: We can vectorize this loop" - << (LAI->getRuntimePointerChecking()->Need - ? " (with a runtime bound check)" - : "") - << "!\n"); - - bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); - - // If an override option has been passed in for interleaved accesses, use it. - if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) - UseInterleaved = EnableInterleavedMemAccesses; - - // Analyze interleaved memory accesses. - if (UseInterleaved) - InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; - if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) - SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - - if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { - ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") - << "Too many SCEV assumptions need to be made and checked " - << "at runtime"); - DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Okay! We've done all the tests. If any have failed, return false. Otherwise - // we can vectorize, and at this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return true with - // no restrictions. - return Result; -} - -static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { - if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty); - - // It is possible that char's or short's overflow when we ask for the loop's - // trip count, work around this by changing the type size. 
- if (Ty->getScalarSizeInBits() < 32) - return Type::getInt32Ty(Ty->getContext()); - - return Ty; -} - -static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { - Ty0 = convertPointerToIntegerType(DL, Ty0); - Ty1 = convertPointerToIntegerType(DL, Ty1); - if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) - return Ty0; - return Ty1; -} - -/// \brief Check that the instruction has outside loop users and is not an -/// identified reduction variable. -static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSetImpl<Value *> &AllowedExit) { - // Reduction and Induction instructions are allowed to have exit users. All - // other instructions must not have external users. - if (!AllowedExit.count(Inst)) - // Check that all of the users of the loop are inside the BB. - for (User *U : Inst->users()) { - Instruction *UI = cast<Instruction>(U); - // This user may be a reduction exit value. - if (!TheLoop->contains(UI)) { - DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); - return true; - } - } - return false; -} - -void LoopVectorizationLegality::addInductionPhi( - PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl<Value *> &AllowedExit) { - Inductions[Phi] = ID; - - // In case this induction also comes with casts that we know we can ignore - // in the vectorized loop body, record them here. All casts could be recorded - // here for ignoring, but suffices to record only the first (as it is the - // only one that may bw used outside the cast sequence). - const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); - if (!Casts.empty()) - InductionCastsToIgnore.insert(*Casts.begin()); - - Type *PhiTy = Phi->getType(); - const DataLayout &DL = Phi->getModule()->getDataLayout(); - - // Get the widest type. - if (!PhiTy->isFloatingPointTy()) { - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); - } - - // Int inductions are special because we only allow one IV. - if (ID.getKind() == InductionDescriptor::IK_IntInduction && - ID.getConstIntStepValue() && - ID.getConstIntStepValue()->isOne() && - isa<Constant>(ID.getStartValue()) && - cast<Constant>(ID.getStartValue())->isNullValue()) { - - // Use the phi node with the widest type as induction. Use the last - // one if there are multiple (no good reason for doing this other - // than it is expedient). We've checked that it begins at zero and - // steps by one, so this is a canonical induction variable. - if (!PrimaryInduction || PhiTy == WidestIndTy) - PrimaryInduction = Phi; - } - - // Both the PHI node itself, and the "post-increment" value feeding - // back into the PHI node may have external users. - // We can allow those uses, except if the SCEVs we have for them rely - // on predicates that only hold within the loop, since allowing the exit - // currently means re-using this SCEV outside the loop. - if (PSE.getUnionPredicate().isAlwaysTrue()) { - AllowedExit.insert(Phi); - AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); - } - - DEBUG(dbgs() << "LV: Found an induction variable.\n"); -} - -bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); - - // Look for the attribute signaling the absence of NaNs. - Function &F = *Header->getParent(); - HasFunNoNaNAttr = - F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - - // For each block in the loop. 
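// The PrimaryInduction rule above is easiest to see on an example
// (hypothetical source): only an IV that starts at zero and steps by one is
// canonical.
//
//   for (int i = 0; i < n; ++i)      // canonical: PrimaryInduction candidate
//   for (int k = 7; k < n; k += 2)   // still an induction, but never primary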
- for (BasicBlock *BB : TheLoop->blocks()) { - // Scan the instructions in the block and look for hazards. - for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast<PHINode>(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "loop control flow is not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); - return false; - } - - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Check that this instruction has no outside users or is an - // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) - continue; - ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) - << "value could not be identified as " - "an induction or reduction variable"); - return false; - } - - // We only allow if-converted PHIs with exactly two incoming values. - if (Phi->getNumIncomingValues() != 2) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "control flow not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { - if (RedDes.hasUnsafeAlgebra()) - Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } - - InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { - addInductionPhi(Phi, ID, AllowedExit); - if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) - Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); - continue; - } - - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, - SinkAfter, DT)) { - FirstOrderRecurrences.insert(Phi); - continue; - } - - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } - - ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) - << "value that could not be identified as " - "reduction is used outside the loop"); - DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); - return false; - } // end of PHI handling - - // We handle calls that: - // * Are debug info intrinsics. - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast<CallInst>(&I); - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !isa<DbgInfoIntrinsic>(CI) && - !(CI->getCalledFunction() && TLI && - TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) - << "call instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); - return false; - } - - // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the - // second argument is the same (i.e. 
loop invariant) - if (CI && hasVectorInstrinsicScalarOpd( - getVectorIntrinsicIDForCall(CI, TLI), 1)) { - auto *SE = PSE.getSE(); - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { - ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) - << "intrinsic instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); - return false; - } - } - - // Check that the instruction return type is vectorizable. - // Also, we can't vectorize extractelement instructions. - if ((!VectorType::isValidElementType(I.getType()) && - !I.getType()->isVoidTy()) || - isa<ExtractElementInst>(I)) { - ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) - << "instruction return type cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); - return false; - } - - // Check that the stored type is vectorizable. - if (auto *ST = dyn_cast<StoreInst>(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) - << "store instruction cannot be vectorized"); - return false; - } - - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. - // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); - } - - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) - << "value cannot be used outside the loop"); - return false; - } - } // next instr. - } - - if (!PrimaryInduction) { - DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - if (Inductions.empty()) { - ORE->emit(createMissedAnalysis("NoInductionVariable") - << "loop induction variable could not be identified"); - return false; - } - } - - // Now we know the widest induction type, check if our found induction - // is the same size. If it's not, unset it here and InnerLoopVectorizer - // will create another. 
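// E.g. (illustrative): a loop with an i32 counter and an i64 pointer
// induction gets WidestIndTy = i64 (pointers are converted via getIntPtrType
// above), so an i32 PrimaryInduction is dropped here and InnerLoopVectorizer
// creates a fresh i64 induction instead.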
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) - PrimaryInduction = nullptr; - - return true; + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { @@ -5461,7 +4188,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { if (auto *Store = dyn_cast<StoreInst>(MemAccess)) if (Ptr == Store->getValueOperand()) return WideningDecision == CM_Scalarize; - assert(Ptr == getPointerOperand(MemAccess) && + assert(Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"); return WideningDecision != CM_GatherScatter; }; @@ -5527,7 +4254,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { } for (auto *I : ScalarPtrs) if (!PossibleNonScalarPtrs.count(I)) { - DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } @@ -5544,8 +4271,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { continue; Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate + << "\n"); } // Insert the forced scalars. @@ -5572,7 +4300,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { isScalarUse(J, Src)); })) { Worklist.insert(Src); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); } } @@ -5612,21 +4340,30 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // The induction variable and its update instruction will remain scalar. Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate + << "\n"); } Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { - if (!blockNeedsPredication(I->getParent())) +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) { + if (!Legal->blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { default: break; - case Instruction::Store: - return !isMaskRequired(I); + case Instruction::Load: + case Instruction::Store: { + if (!Legal->isMaskRequired(I)) + return false; + auto *Ptr = getLoadStorePointerOperand(I); + auto *Ty = getMemInstValueType(I); + return isa<LoadInst>(I) ? + !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -5636,17 +4373,17 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { return false; } -bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. 
LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); assert((LI || SI) && "Invalid memory instruction"); - auto *Ptr = getPointerOperand(I); + auto *Ptr = getLoadStorePointerOperand(I); // In order to be widened, the pointer should be consecutive, first of all. - if (!isConsecutivePtr(Ptr)) + if (!Legal->isConsecutivePtr(Ptr)) return false; // If the instruction is a store located in a predicated block, it will be @@ -5697,7 +4434,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { Worklist.insert(Cmp); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } // Holds consecutive and consecutive-like pointers. Consecutive-like pointers @@ -5729,7 +4466,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (!Ptr) continue; @@ -5737,7 +4474,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // pointer operand. auto UsersAreMemAccesses = llvm::all_of(Ptr->users(), [&](User *U) -> bool { - return getPointerOperand(U) == Ptr; + return getLoadStorePointerOperand(U) == Ptr; }); // Ensure the memory instruction will not be scalarized or used by @@ -5758,7 +4495,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) if (!PossibleNonUniformPtrs.count(V)) { - DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); Worklist.insert(V); } @@ -5777,10 +4514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); return !TheLoop->contains(J) || Worklist.count(J) || - (OI == getPointerOperand(J) && isUniformDecision(J, VF)); + (OI == getLoadStorePointerOperand(J) && + isUniformDecision(J, VF)); })) { Worklist.insert(OI); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); } } } @@ -5788,7 +4526,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && isUniformDecision(I, VF); + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // For an instruction to be added into Worklist above, all its users inside @@ -5825,123 +4563,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // The induction variable and its update instruction will remain uniform. 
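// To make "uniform" concrete (an illustrative IR shape, not emitted by this
// patch verbatim): for a consecutive access only the lane-0 address is ever
// materialized, so the address computation stays scalar while the access
// itself is widened.
//
//   %p  = getelementptr inbounds float, float* %a, i64 %index ; uniform
//   %pc = bitcast float* %p to <4 x float>*
//   %v  = load <4 x float>, <4 x float>* %pc                  ; one wide load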
Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate + << "\n"); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationLegality::canVectorizeMemory() { - LAI = &(*GetLAA)(*TheLoop); - InterleaveInfo.setLAI(LAI); - const OptimizationRemarkAnalysis *LAR = LAI->getReport(); - if (LAR) { - ORE->emit([&]() { - return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), - "loop not vectorized: ", *LAR); - }); - } - if (!LAI->canVectorizeMemory()) - return false; - - if (LAI->hasStoreToLoopInvariantAddress()) { - ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - PSE.addPredicate(LAI->getPSE().getUnionPredicate()); - - return true; -} - -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { - Value *In0 = const_cast<Value *>(V); - PHINode *PN = dyn_cast_or_null<PHINode>(In0); - if (!PN) - return false; - - return Inductions.count(PN); -} - -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { - auto *Inst = dyn_cast<Instruction>(V); - return (Inst && InductionCastsToIgnore.count(Inst)); -} - -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { - return isInductionPhi(V) || isCastedInductionVariable(V); -} - -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { - return FirstOrderRecurrences.count(Phi); -} - -bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); -} - -bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - - for (Instruction &I : *BB) { - // Check that we don't have a constant expression that can trap as operand. - for (Value *Operand : I.operands()) { - if (auto *C = dyn_cast<Constant>(Operand)) - if (C->canTrap()) - return false; - } - // We might be able to hoist the load. - if (I.mayReadFromMemory()) { - auto *LI = dyn_cast<LoadInst>(&I); - if (!LI) - return false; - if (!SafePtrs.count(LI->getPointerOperand())) { - if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) || - isLegalMaskedGather(LI->getType())) { - MaskedOp.insert(LI); - continue; - } - // !llvm.mem.parallel_loop_access implies if-conversion safety. - if (IsAnnotatedParallel) - continue; - return false; - } - } - - if (I.mayWriteToMemory()) { - auto *SI = dyn_cast<StoreInst>(&I); - // We only support predication of stores in basic blocks with one - // predecessor. - if (!SI) - return false; - - // Build a masked store if it is legal for the target. 
- if (isLegalMaskedStore(SI->getValueOperand()->getType(), - SI->getPointerOperand()) || - isLegalMaskedScatter(SI->getValueOperand()->getType())) { - MaskedOp.insert(SI); - continue; - } - - bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); - bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); - - if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || - !isSinglePredecessor) - return false; - } - if (I.mayThrow()) - return false; - } - - return true; -} - void InterleavedAccessInfo::collectConstStrideAccesses( MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, const ValueToValueMap &Strides) { @@ -5962,7 +4591,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses( if (!LI && !SI) continue; - Value *Ptr = getPointerOperand(&I); + Value *Ptr = getLoadStorePointerOperand(&I); // We don't check wrapping here because we don't know yet if Ptr will be // part of a full group or a group with gaps. Checking wrapping for all // pointers (even those that end up in groups with no gaps) will be overly @@ -6022,9 +4651,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving( - const ValueToValueMap &Strides) { - DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); +void InterleavedAccessInfo::analyzeInterleaving() { + LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); + const ValueToValueMap &Strides = LAI->getSymbolicStrides(); // Holds all accesses with a constant stride. MapVector<Instruction *, StrideDescriptor> AccessStrideInfo; @@ -6065,7 +4694,8 @@ void InterleavedAccessInfo::analyzeInterleaving( if (isStrided(DesB.Stride)) { Group = getInterleaveGroup(B); if (!Group) { - DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n'); + LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B + << '\n'); Group = createInterleaveGroup(B, DesB.Stride, DesB.Align); } if (B->mayWriteToMemory()) @@ -6124,7 +4754,12 @@ void InterleavedAccessInfo::analyzeInterleaving( // Ignore A if it's already in a group or isn't the same kind of memory // operation as B. - if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory()) + // Note that mayReadFromMemory() isn't mutually exclusive with mayWriteToMemory() + // in the case of atomic loads. We shouldn't see those here; canVectorizeMemory() + // should have returned false - except when we only asked for optimization + // remarks. + if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory()) + || (A->mayWriteToMemory() != B->mayWriteToMemory())) continue; // Check rules 1 and 2. Ignore A if its stride or size is different from @@ -6163,8 +4798,9 @@ void InterleavedAccessInfo::analyzeInterleaving( // Try to insert A into B's group. if (Group->insertMember(A, IndexA, DesA.Align)) { - DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' - << " into the interleave group with" << *B << '\n'); + LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' + << " into the interleave group with" << *B + << '\n'); InterleaveGroupMap[A] = Group; // Set the first load in program order as the insert position. @@ -6177,8 +4813,9 @@ void InterleavedAccessInfo::analyzeInterleaving( // Remove interleaved store groups with gaps.
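A store group with a missing member cannot be emitted as one wide store without writing lanes the scalar loop never wrote, which is why the code below releases such groups. A minimal model of that bookkeeping (assumed simplified types, not the LLVM InterleaveGroup):

#include <cassert>
#include <iostream>
#include <map>

struct InterleaveGroup {
  unsigned Factor = 0;                      // interleave factor (stride)
  std::map<unsigned, const char *> Members; // index within group -> access
  bool insertMember(unsigned Idx, const char *I) {
    assert(Idx < Factor && "index out of range");
    return Members.emplace(Idx, I).second;
  }
  bool hasGaps() const { return Members.size() != Factor; }
};

int main() {
  InterleaveGroup G;
  G.Factor = 3;
  G.insertMember(0, "store A[3*i]");
  G.insertMember(2, "store A[3*i+2]"); // index 1 is never written: a gap
  std::cout << (G.hasGaps() ? "release store group\n" : "keep group\n");
}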
for (InterleaveGroup *Group : StoreGroups) if (Group->getNumMembers() != Group->getFactor()) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due " - "to gaps.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved store group due " + "to gaps.\n"); releaseGroup(Group); } // Remove interleaved groups with gaps (currently only loads) whose memory @@ -6207,21 +4844,23 @@ void InterleavedAccessInfo::analyzeInterleaving( // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reversed access -- see Case 3). - Value *FirstMemberPtr = getPointerOperand(Group->getMember(0)); + Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0)); if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "first group member potentially pointer-wrapping.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "first group member potentially pointer-wrapping.\n"); releaseGroup(Group); continue; } Instruction *LastMember = Group->getMember(Group->getFactor() - 1); if (LastMember) { - Value *LastMemberPtr = getPointerOperand(LastMember); + Value *LastMemberPtr = getLoadStorePointerOperand(LastMember); if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "last group member potentially pointer-wrapping.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "last group member potentially pointer-wrapping.\n"); releaseGroup(Group); } } else { @@ -6231,29 +4870,25 @@ void InterleavedAccessInfo::analyzeInterleaving( // to look for a member at index factor - 1, since every group must have // a member at index zero. if (Group->isReverse()) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "a reverse access with gaps.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "a reverse access with gaps.\n"); releaseGroup(Group); continue; } - DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); + LLVM_DEBUG( + dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); RequiresScalarEpilogue = true; } } } Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); - return None; - } - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may be useful to do since it's still likely to be dynamically // uniform if the target can skip. - DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + LLVM_DEBUG( + dbgs() << "LV: Not inserting runtime ptr check for divergent target"); ORE->emit( createMissedAnalysis("CantVersionLoopWithDivergentTarget") @@ -6271,20 +4906,22 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz"); - DEBUG(dbgs() - << "LV: Aborting. 
Runtime ptr check is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return None; } // If we optimize the program for size, avoid creating the tail loop. - DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); // If we don't know the precise trip count, don't try to vectorize. if (TC < 2) { ORE->emit( createMissedAnalysis("UnknownLoopCountComplexCFG") << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return None; } @@ -6302,7 +4939,8 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " "when compiling with -Os/-Oz"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return None; } @@ -6327,29 +4965,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " - << WidestType << " bits.\n"); - DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister - << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " + << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { - DEBUG(dbgs() << "LV: The target has no vector registers.\n"); + LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; return MaxVectorSize; } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in // choosing a higher viable VF as done in the loop below. - DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " + << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; return MaxVectorSize; } unsigned MaxVF = MaxVectorSize; - if (MaximizeBandwidth && !OptForSize) { + if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || + (MaximizeBandwidth && !OptForSize)) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). 
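Condensing the arithmetic of computeFeasibleMaxVF from the hunk above into a standalone sketch (simplified inputs, not the TTI interfaces): the starting point is WidestRegister / WidestType, clamped to a power-of-two constant trip count when that is smaller.

#include <cstdio>

static bool isPowerOf2(unsigned X) { return X && !(X & (X - 1)); }

static unsigned feasibleMaxVF(unsigned WidestRegisterBits,
                              unsigned WidestTypeBits,
                              unsigned ConstTripCount) {
  unsigned MaxVectorSize = WidestRegisterBits / WidestTypeBits;
  if (MaxVectorSize == 0)
    return 1; // the target has no vector registers
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPowerOf2(ConstTripCount))
    return ConstTripCount; // no point choosing a VF above the trip count
  return MaxVectorSize;
}

int main() {
  std::printf("%u\n", feasibleMaxVF(256, 32, 0)); // 8 lanes of i32 fit
  std::printf("%u\n", feasibleMaxVF(256, 32, 4)); // clamped to trip count 4
}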
SmallVector<unsigned, 8> VFs; @@ -6369,24 +5008,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, break; } } + if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { + if (MaxVF < MinVF) { + LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF + << ") with target's minimum: " << MinVF << '\n'); + MaxVF = MinVF; + } + } } return MaxVF; } -LoopVectorizationCostModel::VectorizationFactor +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { float Cost = expectedCost(1).first; -#ifndef NDEBUG const float ScalarCost = Cost; -#endif /* NDEBUG */ unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - // Ignore scalar width, because the user explicitly wants vectorization. if (ForceVectorization && MaxVF > 1) { - Width = 2; - Cost = expectedCost(Width).first / (float)Width; + // Ignore scalar width, because the user explicitly wants vectorization. + // Initialize cost to max so that VF = 2 is, at least, chosen during cost + // evaluation. + Cost = std::numeric_limits<float>::max(); } for (unsigned i = 2; i <= MaxVF; i *= 2) { @@ -6395,10 +5040,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { // the vector elements. VectorizationCostTy C = expectedCost(i); float VectorCost = C.first / (float)i; - DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (int)VectorCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { - DEBUG( + LLVM_DEBUG( dbgs() << "LV: Not considering vector loop of width " << i << " because it will not generate any vector instructions.\n"); continue; @@ -6409,10 +5054,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } } - DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); + if (!EnableCondStoresVectorization && NumPredStores) { + ORE->emit(createMissedAnalysis("ConditionalStore") + << "store that is conditionally executed prevents vectorization"); + LLVM_DEBUG( + dbgs() << "LV: No vectorization. There are conditional stores.\n"); + Width = 1; + Cost = ScalarCost; + } + + LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); + LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; return Factor; } @@ -6460,7 +5114,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // optimization to non-pointer types. 
// if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I)) + !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) continue; MinWidth = std::min(MinWidth, @@ -6504,8 +5158,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers\n"); if (VF == 1) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) @@ -6519,7 +5173,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); - R.NumInstructions = std::max(R.NumInstructions, 1U); // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -6564,7 +5217,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. if (VF > 1 && !Legal->getReductionVars()->empty()) { - DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -6575,7 +5228,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. - DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -6603,11 +5256,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (EnableLoadStoreRuntimeInterleave && std::max(StoresIC, LoadsIC) > SmallIC) { - DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n"); + LLVM_DEBUG( + dbgs() << "LV: Interleaving to saturate store or load ports.\n"); return std::max(StoresIC, LoadsIC); } - DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); return SmallIC; } @@ -6615,11 +5269,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // this point) that could benefit from interleaving. bool HasReductions = !Legal->getReductionVars()->empty(); if (TTI.enableAggressiveInterleaving(HasReductions)) { - DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } - DEBUG(dbgs() << "LV: Not Interleaving.\n"); + LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); return 1; } @@ -6646,7 +5300,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { DFS.perform(LI); RegisterUsage RU; - RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -6658,14 +5311,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // Marks the end of each interval. 
IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. - SmallSet<Instruction *, 8> Ends; + SmallPtrSet<Instruction *, 8> Ends; // Saves the list of values that are used in the loop but are // defined outside the loop, such as arguments and constants. SmallPtrSet<Value *, 8> LoopInvariants; unsigned Index = 0; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - RU.NumInstructions += BB->size(); for (Instruction &I : *BB) { IdxToInstr[Index++] = &I; @@ -6698,7 +5350,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (auto &Interval : EndPoint) TransposeEnds[Interval.second].push_back(Interval.first); - SmallSet<Instruction *, 8> OpenIntervals; + SmallPtrSet<Instruction *, 8> OpenIntervals; // Get the size of the widest register. unsigned MaxSafeDepDist = -1U; @@ -6711,7 +5363,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { SmallVector<RegisterUsage, 8> RUs(VFs.size()); SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); - DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { @@ -6756,8 +5408,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " - << OpenIntervals.size() << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); @@ -6772,10 +5424,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { Invariant += GetRegUsage(Inst->getType(), VFs[i]); } - DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant + << '\n'); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -6785,6 +5437,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return RUs; } +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ + // TODO: Cost model for emulated masked load/store is completely + // broken. This hack guides the cost model to use an artificially + // high enough value to practically disable vectorization with such + // operations, except where previously deployed legality hack allowed + // using very low cost values. This is to avoid regressions coming simply + // from moving "masked load/store" check from legality to cost model. + // Masked Load/Gather emulation was previously never allowed. + // Limited number of Masked Store/Scatter emulation was allowed. 
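Restating the TODO above as a freestanding predicate (assumed parameter names; the actual member function continues in the hunk below): an emulated masked load always receives the pessimized cost, an emulated masked store only once the predicated-store budget is exceeded.

#include <cstdio>

enum class MemOp { Load, Store };

// True when an emulated (scalarized, predicated) masked access should get an
// artificially high cost: loads always, stores only past a small budget.
static bool useEmulatedMaskMemRefHack(MemOp Op, unsigned NumPredStores,
                                      unsigned NumberOfStoresToPredicate) {
  if (Op == MemOp::Load)
    return true; // masked load/gather emulation was never allowed
  return NumPredStores > NumberOfStoresToPredicate; // limited store budget
}

int main() {
  std::printf("%d\n", useEmulatedMaskMemRefHack(MemOp::Store, 1, 8)); // 0
  std::printf("%d\n", useEmulatedMaskMemRefHack(MemOp::Store, 9, 8)); // 1
}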
+ assert(isScalarWithPredication(I) && + "Expecting a scalar emulated instruction"); + return isa<LoadInst>(I) || + (isa<StoreInst>(I) && + NumPredStores > NumberOfStoresToPredicate); +} + void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already @@ -6805,11 +5473,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { if (!Legal->blockNeedsPredication(BB)) continue; for (Instruction &I : *BB) - if (Legal->isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I)) { ScalarCostsTy ScalarCosts; - if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + // Do not apply discount logic if hacked cost is needed + // for emulated masked memrefs. + if (!useEmulatedMaskMemRefHack(&I) && + computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); - // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization.insert(BB); } @@ -6844,7 +5514,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (Legal->isScalarWithPredication(I)) + if (isScalarWithPredication(I)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6898,7 +5568,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); @@ -6940,11 +5610,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy BlockCost; // For each instruction in the old loop. - for (Instruction &I : *BB) { - // Skip dbg intrinsics. - if (isa<DbgInfoIntrinsic>(I)) - continue; - + for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) @@ -6958,8 +5624,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { BlockCost.first += C.first; BlockCost.second |= C.second; - DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF " - << VF << " For instruction: " << I << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first + << " for VF " << VF << " For instruction: " << I + << '\n'); } // If we are vectorizing a predicated block, it will have been @@ -6978,7 +5645,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Gets Address Access SCEV after verifying that the access pattern +/// Gets Address Access SCEV after verifying that the access pattern /// is loop invariant except the induction variable dependence. 
/// /// This SCEV can be sent to the Target in order to estimate the address @@ -7020,7 +5687,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned Alignment = getMemInstAlignment(I); unsigned AS = getMemInstAddressSpace(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); // Figure out whether the access is strided and get the stride value @@ -7041,9 +5708,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated store, it may not be executed for each vector // lane. Scale the cost by the probability of executing the predicated // block. - if (Legal->isScalarWithPredication(I)) + if (isScalarWithPredication(I)) { Cost /= getReciprocalPredBlockProb(); + if (useEmulatedMaskMemRefHack(I)) + // Artificially setting to a high enough value to practically disable + // vectorization with such operations. + Cost = 3000000; + } + return Cost; } @@ -7052,7 +5725,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); unsigned Alignment = getMemInstAlignment(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getMemInstAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -7088,7 +5761,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); unsigned Alignment = getMemInstAlignment(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, @@ -7101,7 +5774,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Type *VectorTy = ToVectorTy(ValTy, VF); unsigned AS = getMemInstAddressSpace(I); - auto Group = Legal->getInterleavedAccessGroup(I); + auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); @@ -7168,13 +5841,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (VF == 1) return; + NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the old loop. for (Instruction &I : *BB) { - Value *Ptr = getPointerOperand(&I); + Value *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; + if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + NumPredStores++; if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) { // Scalar load + broadcast unsigned Cost = getUniformMemOpCost(&I, VF); @@ -7183,9 +5859,10 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } // We assume that widening is the best solution when possible. 
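Aside on getMemInstScalarizationCost in the hunk above: a predicated block is assumed to execute with probability 1/2 per lane (getReciprocalPredBlockProb() returns 2), and the emulated-mask hack then overrides the discounted cost with a huge constant. A standalone sketch with assumed inputs, not the LLVM interfaces:

#include <cstdio>

static unsigned getReciprocalPredBlockProb() { return 2; } // P(block) = 1/2

static unsigned memInstScalarizationCost(unsigned PerLaneCost, unsigned VF,
                                         bool ScalarWithPredication,
                                         bool EmulatedMaskHack) {
  unsigned Cost = PerLaneCost * VF;
  if (ScalarWithPredication) {
    Cost /= getReciprocalPredBlockProb(); // scale by execution probability
    if (EmulatedMaskHack)
      Cost = 3000000; // effectively forbids vectorizing this access
  }
  return Cost;
}

int main() {
  std::printf("%u\n", memInstScalarizationCost(10, 4, true, false)); // 20
  std::printf("%u\n", memInstScalarizationCost(10, 4, true, true));
}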
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) { + if (memoryInstructionCanBeWidened(&I, VF)) { unsigned Cost = getConsecutiveMemOpCost(&I, VF); - int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I)); + int ConsecutiveStride = + Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = @@ -7197,8 +5874,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Choose between Interleaving, Gather/Scatter or Scalarization. unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); unsigned NumAccesses = 1; - if (Legal->isAccessInterleaved(&I)) { - auto Group = Legal->getInterleavedAccessGroup(&I); + if (isAccessInterleaved(&I)) { + auto Group = getInterleavedAccessGroup(&I); assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. @@ -7210,7 +5887,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } unsigned GatherScatterCost = - Legal->isLegalGatherOrScatter(&I) + isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses : std::numeric_limits<unsigned>::max(); @@ -7235,7 +5912,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // If the instruction belongs to an interleave group, the whole group // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. - if (auto Group = Legal->getInterleavedAccessGroup(&I)) + if (auto Group = getInterleavedAccessGroup(&I)) setWideningDecision(Group, VF, Decision, Cost); else setWideningDecision(&I, VF, Decision, Cost); @@ -7255,7 +5932,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { Instruction *PtrDef = - dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && getWideningDecision(&I, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); @@ -7285,7 +5962,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Scalarize a widened load of address. setWideningDecision(I, VF, CM_Scalarize, (VF * getMemoryInstructionCost(I, 1))); - else if (auto Group = Legal->getInterleavedAccessGroup(I)) { + else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) @@ -7371,7 +6048,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && Legal->isScalarWithPredication(I)) { + if (VF > 1 && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7569,7 +6246,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { // Check if the pointer operand of a load or store instruction is // consecutive.
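When widening is not possible, the decision logic above picks the cheapest of interleaving, gather/scatter and scalarization. Reduced to a standalone selection function (assumed names; the real code prefers widening first when legal and assigns the cost to a whole interleave group):

#include <algorithm>
#include <cstdio>
#include <limits>

enum Decision { Widen, Interleave, GatherScatter, Scalarize };

static Decision pickWideningDecision(unsigned InterleaveCost,
                                     unsigned GatherScatterCost,
                                     unsigned ScalarizationCost) {
  unsigned Best =
      std::min({InterleaveCost, GatherScatterCost, ScalarizationCost});
  if (Best == std::numeric_limits<unsigned>::max())
    return Scalarize; // nothing was legal; fall back to scalarization
  return Best == InterleaveCost       ? Interleave
         : Best == GatherScatterCost  ? GatherScatter
                                      : Scalarize;
}

int main() {
  std::printf("%d\n", pickWideningDecision(8, 12, 20)); // Interleave
}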
- if (auto *Ptr = getPointerOperand(Inst)) + if (auto *Ptr = getLoadStorePointerOperand(Inst)) return Legal->isConsecutivePtr(Ptr); return false; } @@ -7594,23 +6271,59 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } -LoopVectorizationCostModel::VectorizationFactor +VectorizationFactor +LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, + unsigned UserVF) { + // Width 1 means no vectorization, cost 0 means uncomputed cost. + const VectorizationFactor NoVectorization = {1U, 0U}; + + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + if (!OrigLoop->empty()) { + // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing. + // This won't be necessary when UserVF is not required in the VPlan-native + // path. + if (VPlanBuildStressTest && !UserVF) + UserVF = 4; + + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + assert(UserVF && "Expected UserVF for outer loop vectorization."); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + buildVPlans(UserVF, UserVF); + + // For VPlan build stress testing, we bail out after VPlan construction. + if (VPlanBuildStressTest) + return NoVectorization; + + return {UserVF, 0}; + } + + LLVM_DEBUG( + dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " + "VPlan-native path.\n"); + return NoVectorization; +} + +VectorizationFactor LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { - // Width 1 means no vectorize, cost 0 means uncomputed cost. - const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U, - 0U}; + assert(OrigLoop->empty() && "Inner loop expected."); + // Width 1 means no vectorization, cost 0 means uncomputed cost. + const VectorizationFactor NoVectorization = {1U, 0U}; Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. return NoVectorization; if (UserVF) { - DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); - buildVPlans(UserVF, UserVF); - DEBUG(printPlans(dbgs())); + buildVPlansWithVPRecipes(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); return {UserVF, 0}; } @@ -7627,8 +6340,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { CM.collectInstsToScalarize(VF); } - buildVPlans(1, MaxVF); - DEBUG(printPlans(dbgs())); + buildVPlansWithVPRecipes(1, MaxVF); + LLVM_DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7637,7 +6350,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { } void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { - DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); + LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF + << '\n'); BestVF = VF; BestUF = UF; @@ -7787,30 +6501,15 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). 
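The VFRange doc comment above describes how one VPlan covers a sub-range of VFs: a per-VF decision is evaluated at Range.Start and the range's End is clamped at the first VF where the answer flips. A freestanding sketch of that helper (std::function stands in for LLVM's function_ref):

#include <cstdio>
#include <functional>

struct VFRange { unsigned Start, End; }; // half-open [Start, End)

static bool getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  bool PredicateAtRangeStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != PredicateAtRangeStart) {
      Range.End = VF; // the decision flips here; shorten the range
      break;
    }
  return PredicateAtRangeStart;
}

int main() {
  VFRange R = {2, 16};
  bool B = getDecisionAndClampRange([](unsigned VF) { return VF < 8; }, R);
  std::printf("decision=%d range=[%u,%u)\n", B, R.Start, R.End); // [2,8)
}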
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - - // Collect conditions feeding internal conditional branches; they need to be - // represented in VPlan for it to model masking. - SmallPtrSet<Value *, 1> NeedDef; - - auto *Latch = OrigLoop->getLoopLatch(); - for (BasicBlock *BB : OrigLoop->blocks()) { - if (BB == Latch) - continue; - BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); - if (Branch && Branch->isConditional()) - NeedDef.insert(Branch->getCondition()); - } - for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back(buildVPlan(SubRange, NeedDef)); + VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } -VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, - BasicBlock *Dst, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, + VPlanPtr &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. @@ -7840,8 +6539,7 @@ VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, return EdgeMaskCache[Edge] = EdgeMask; } -VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. @@ -7874,10 +6572,9 @@ VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe * -LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); +VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, + VFRange &Range) { + const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -7889,7 +6586,7 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, LoopVectorizationCostModel::CM_Interleave); }; }; - if (!getDecisionAndClampRange(isIGMember(I), Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) return nullptr; // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) @@ -7902,8 +6599,8 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, } VPWidenMemoryInstructionRecipe * -LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) return nullptr; @@ -7922,7 +6619,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!getDecisionAndClampRange(willWiden, Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return nullptr; VPValue *Mask = nullptr; @@ -7933,8 +6630,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, - VFRange &Range) { +VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { if (PHINode *Phi = dyn_cast<PHINode>(I)) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. 
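Aside on the two mask builders moved into VPRecipeBuilder in this hunk: an edge mask is the source block's mask ANDed with the branch condition (negated on the false edge), and a block-in mask is the OR of its incoming edge masks. Modeled per lane on plain booleans, not the VPlan representation:

#include <cstdio>

static bool edgeMask(bool SrcBlockMask, bool BranchCond, bool TrueEdge) {
  return SrcBlockMask && (TrueEdge ? BranchCond : !BranchCond);
}

static bool blockInMask(bool EdgeMask0, bool EdgeMask1) {
  return EdgeMask0 || EdgeMask1; // any predecessor edge reaches the block
}

int main() {
  // A lane where the guarding branch is taken: only the true successor runs,
  // and the join block is reached either way.
  bool Hdr = true, Cond = true;
  bool ThenMask = edgeMask(Hdr, Cond, /*TrueEdge=*/true);
  bool ElseMask = edgeMask(Hdr, Cond, /*TrueEdge=*/false);
  std::printf("then=%d else=%d join=%d\n", ThenMask, ElseMask,
              blockInMask(ThenMask, ElseMask));
}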
@@ -7959,15 +6655,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; - if (isa<TruncInst>(I) && - getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) + if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( + isOptimizableIVTruncate(I), Range)) return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), cast<TruncInst>(I)); return nullptr; } -VPBlendRecipe * -LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { +VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { PHINode *Phi = dyn_cast<PHINode>(I); if (!Phi || Phi->getParent() == OrigLoop->getHeader()) return nullptr; @@ -7991,9 +6686,9 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { return new VPBlendRecipe(Phi, Masks); } -bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, - VFRange &Range) { - if (Legal->isScalarWithPredication(I)) +bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, + VFRange &Range) { + if (CM.isScalarWithPredication(I)) return false; auto IsVectorizableOpcode = [](unsigned Opcode) { @@ -8077,7 +6772,7 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, return true; }; - if (!getDecisionAndClampRange(willWiden, Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; // Success: widen this instruction. We optimize the common case where @@ -8092,15 +6787,15 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, return true; } -VPBasicBlock *LoopVectorizationPlanner::handleReplication( +VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { - bool IsUniform = getDecisionAndClampRange( + bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); - bool IsPredicated = Legal->isScalarWithPredication(I); + bool IsPredicated = CM.isScalarWithPredication(I); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); // Find if I uses a predicated instruction. If so, it will use its scalar @@ -8113,24 +6808,25 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication( // Finalize the recipe for Instr, first if it is not predicated. if (!IsPredicated) { - DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); VPBB->appendRecipe(Recipe); return VPBB; } - DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); assert(VPBB->getSuccessors().empty() && "VPBB has successors when handling predicated replication."); // Record predicated instructions for above packing optimizations. 
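Aside, before the recording and region construction continue below: at the scalar level, the replicate region built by createReplicateRegion amounts to a per-lane if-then guarded by the block-in mask, with a merge afterwards. A sketch over plain arrays (VF = 4, assumed values):

#include <cstdio>

int main() {
  int A[4] = {10, 20, 30, 40}, Out[4] = {0, 0, 0, 0};
  bool Mask[4] = {true, false, true, false}; // BlockInMask per lane
  for (int Lane = 0; Lane < 4; ++Lane) {
    int Result = 0;           // PHI at the region's exit block
    if (Mask[Lane])           // entry block: branch on the lane's mask
      Result = 100 / A[Lane]; // pred block: potentially trapping op
    Out[Lane] = Result;       // exit block: merge the predicated value
  }
  for (int V : Out)
    std::printf("%d ", V);
  std::printf("\n");
}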
PredInst2Recipe[I] = Recipe; - VPBlockBase *Region = - VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan)); - return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); + VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); + VPBlockUtils::insertBlockAfter(Region, VPBB); + auto *RegSucc = new VPBasicBlock(); + VPBlockUtils::insertBlockAfter(RegSucc, Region); + return RegSucc; } -VPRegionBlock * -LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe, - VPlanPtr &Plan) { +VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, + VPRecipeBase *PredRecipe, + VPlanPtr &Plan) { // Instructions marked for predication are replicated and placed under an // if-then construct to prevent side-effects. @@ -8150,19 +6846,67 @@ LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. - Entry->setTwoSuccessors(Pred, Exit); - Pred->setOneSuccessor(Exit); + VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); + VPBlockUtils::connectBlocks(Pred, Exit); return Region; } -LoopVectorizationPlanner::VPlanPtr -LoopVectorizationPlanner::buildVPlan(VFRange &Range, - const SmallPtrSetImpl<Value *> &NeedDef) { - EdgeMaskCache.clear(); - BlockMaskCache.clear(); - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; +bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, + VPlanPtr &Plan, VPBasicBlock *VPBB) { + VPRecipeBase *Recipe = nullptr; + // Check if Instr should belong to an interleave memory recipe, or already + // does. In the latter case Instr is irrelevant. + if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + VPBB->appendRecipe(Recipe); + return true; + } + + // Check if Instr is a memory operation that should be widened. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + VPBB->appendRecipe(Recipe); + return true; + } + + // Check if Instr should form some PHI recipe. + if ((Recipe = tryToOptimizeInduction(Instr, Range))) { + VPBB->appendRecipe(Recipe); + return true; + } + if ((Recipe = tryToBlend(Instr, Plan))) { + VPBB->appendRecipe(Recipe); + return true; + } + if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { + VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); + return true; + } + + // Check if Instr is to be widened by a general VPWidenRecipe, after + // having first checked for specific widening recipes that deal with + // Interleave Groups, Inductions and Phi nodes. + if (tryToWiden(Instr, VPBB, Range)) + return true; + + return false; +} + +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, + unsigned MaxVF) { + assert(OrigLoop->empty() && "Inner loop expected."); + + // Collect conditions feeding internal conditional branches; they need to be + // represented in VPlan for it to model masking. + SmallPtrSet<Value *, 1> NeedDef; + + auto *Latch = OrigLoop->getLoopLatch(); + for (BasicBlock *BB : OrigLoop->blocks()) { + if (BB == Latch) + continue; + BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); + if (Branch && Branch->isConditional()) + NeedDef.insert(Branch->getCondition()); + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. 
For @@ -8173,15 +6917,31 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, SmallPtrSet<Instruction *, 4> DeadInstructions; collectTriviallyDeadInstructions(DeadInstructions); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { + VFRange SubRange = {VF, MaxVF + 1}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VF = SubRange.End; + } +} + +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, + SmallPtrSetImpl<Instruction *> &DeadInstructions) { // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + DenseMap<Instruction *, Instruction *> SinkAfterInverse; + // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = llvm::make_unique<VPlan>(VPBB); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -8196,7 +6956,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBB->setOneSuccessor(FirstVPBBForBB); + VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); @@ -8204,18 +6964,17 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // Organize the ingredients to vectorize from current basic block in the // right order. - for (Instruction &I : *BB) { + for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || - DeadInstructions.count(Instr)) + if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) continue; // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct // member of the IG, do not construct any Recipe for it. - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); + const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr); if (IG && Instr != IG->getInsertPos() && Range.Start >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(Instr, Range.Start) == @@ -8230,8 +6989,9 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // should follow. auto SAIt = SinkAfter.find(Instr); if (SAIt != SinkAfter.end()) { - DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); + LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" + << *SAIt->second + << " to vectorize a 1st order recurrence.\n"); SinkAfterInverse[SAIt->second] = Instr; continue; } @@ -8247,45 +7007,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // Introduce each ingredient into VPlan. for (Instruction *Instr : Ingredients) { - VPRecipeBase *Recipe = nullptr; - - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. 
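The removed inline cascade that follows (now centralized in VPRecipeBuilder::tryToCreateRecipe, shown earlier in this patch) tries the specialized recipes in a fixed order and leaves replication as the fallback. Its shape, as a standalone sketch with assumed stub types:

#include <cstdio>

struct Recipe { const char *Kind; };

static Recipe *tryToCreateRecipe(bool IsInterleaveMember, bool IsWidenableMem,
                                 bool IsInduction, bool IsBlendPhi,
                                 bool IsWidenable) {
  static Recipe R[] = {{"interleave"}, {"widen-mem"}, {"induction"},
                       {"blend"}, {"widen"}};
  if (IsInterleaveMember) return &R[0]; // interleave group members first
  if (IsWidenableMem)     return &R[1]; // then widened loads/stores
  if (IsInduction)        return &R[2]; // then PHI-producing recipes
  if (IsBlendPhi)         return &R[3];
  if (IsWidenable)        return &R[4]; // generic widening last
  return nullptr;                       // caller replicates instead
}

int main() {
  Recipe *R = tryToCreateRecipe(false, true, false, false, true);
  std::printf("%s\n", R ? R->Kind : "replicate");
}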
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - if ((Recipe = tryToBlend(Instr, Plan))) { - VPBB->appendRecipe(Recipe); - continue; - } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - continue; - } - - // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. - if (tryToWiden(Instr, VPBB, Range)) + if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan); + VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( + Instr, Range, VPBB, PredInst2Recipe, Plan); if (NextVPBB != VPBB) { VPBB = NextVPBB; VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) : ""); } } @@ -8300,7 +7028,7 @@ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); assert(PreEntry->empty() && "Expecting empty pre-entry block."); VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - PreEntry->disconnectSuccessor(Entry); + VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; std::string PlanName; @@ -8319,6 +7047,30 @@ return Plan; } +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlan(VFRange &Range) { + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + assert(!OrigLoop->empty()); + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + + // Create new empty VPlan + auto Plan = llvm::make_unique<VPlan>(); + + // Build hierarchical CFG + VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI); + HCFGBuilder.buildHierarchicalCFG(*Plan.get()); + + return Plan; +} + +Value* LoopVectorizationPlanner::VPCallbackILV:: +getOrCreateVectorValues(Value *V, unsigned Part) { + return ILV.getOrCreateVectorValue(V, Part); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; @@ -8483,28 +7235,66 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +// Process the loop in the VPlan-native vectorization path. This path builds +// VPlan upfront in the vectorization pipeline, which allows applying +// VPlan-to-VPlan transformations from the very beginning without modifying the +// input LLVM IR.
+static bool processLoopInVPlanNativePath( + Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, + LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) { + + assert(EnableVPlanNativePath && "VPlan-native path is disabled."); + Function *F = L->getHeader()->getParent(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); + LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + &Hints, IAI); + // Use the planner for outer loop vectorization. + // TODO: CM is not used at this point inside the planner. Turn CM into an + // optional argument if we don't need it in the future. + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + + // Get user vectorization factor. + unsigned UserVF = Hints.getWidth(); + + // Check the function attributes to find out if this function should be + // optimized for size. + bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Plan how to best vectorize, return the best VF and its cost. + LVP.planInVPlanNativePath(OptForSize, UserVF); + + // Returning false. We are currently not generating vector code in the VPlan + // native path. + return false; +} + bool LoopVectorizePass::processLoop(Loop *L) { - assert(L->empty() && "Only process inner loops."); + assert((EnableVPlanNativePath || L->empty()) && + "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ - DEBUG(dbgs() << "\nLV: Checking a loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " - << DebugLocStr << "\n"); + LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); - DEBUG(dbgs() << "LV: Loop hints:" - << " force=" - << (Hints.getForce() == LoopVectorizeHints::FK_Disabled - ? "disabled" - : (Hints.getForce() == LoopVectorizeHints::FK_Enabled - ? "enabled" - : "?")) - << " width=" << Hints.getWidth() - << " unroll=" << Hints.getInterleave() << "\n"); + LLVM_DEBUG( + dbgs() << "LV: Loop hints:" + << " force=" + << (Hints.getForce() == LoopVectorizeHints::FK_Disabled + ? "disabled" + : (Hints.getForce() == LoopVectorizeHints::FK_Enabled + ? "enabled" + : "?")) + << " width=" << Hints.getWidth() + << " unroll=" << Hints.getInterleave() << "\n"); // Function containing loop Function *F = L->getHeader()->getParent(); @@ -8518,7 +7308,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // benefit from vectorization, respectively. if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { - DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); + LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -8526,10 +7316,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements(*ORE); - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); - if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE, + &Requirements, &Hints, DB, AC); + if (!LVL.canVectorize(EnableVPlanNativePath)) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints, ORE); return false; } @@ -8539,11 +7329,33 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + // Entrance to the VPlan-native vectorization path. Outer loops are processed + // here. They may require CFG and instruction level transformations before + // even evaluating whether vectorization is profitable. Since we cannot modify + // the incoming IR, we need to build VPlan upfront in the vectorization + // pipeline. + if (!L->empty()) + return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, + ORE, Hints); + + assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); - bool HasExpectedTC = (ExpectedTC > 0); - + // Prefer constant trip counts over profile data, over upper bound estimate. + unsigned ExpectedTC = 0; + bool HasExpectedTC = false; + if (const SCEVConstant *ConstExits = + dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { + const APInt &ExitsCount = ConstExits->getAPInt(); + // We are interested in small values for ExpectedTC. Skip over those that + // can't fit an unsigned. + if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { + ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; + HasExpectedTC = true; + } + } + // ExpectedTC may be large because it's bound by a variable. Check + // profiling information to validate we should vectorize. if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { auto EstimatedTC = getLoopEstimatedTripCount(L); if (EstimatedTC) { @@ -8551,15 +7363,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { HasExpectedTC = true; } } + if (!HasExpectedTC) { + ExpectedTC = SE->getSmallConstantMaxTripCount(L); + HasExpectedTC = (ExpectedTC > 0); + } if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); // Loops with a very small trip count are considered for vectorization // under OptForSize, thereby making sure the cost of their loop body is // dominant, free of runtime guards and scalar iteration overheads. @@ -8572,10 +7388,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? 
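The new trip-count probing above establishes a preference order: an exact constant backedge-taken count first, then the profile-based estimate, then the small constant maximum trip count. A standalone sketch of that fallback chain, using optionals for the three sources (assumed inputs, not the ScalarEvolution interfaces):

#include <cstdio>
#include <optional>

static std::optional<unsigned>
expectedTripCount(std::optional<unsigned> ConstBackedgeTaken,
                  std::optional<unsigned> ProfileEstimate,
                  unsigned SmallConstantMaxTC) {
  if (ConstBackedgeTaken)
    return *ConstBackedgeTaken + 1; // an exact count wins
  if (ProfileEstimate)
    return ProfileEstimate;         // then profiling data
  if (SmallConstantMaxTC > 0)
    return SmallConstantMaxTC;      // then the conservative upper bound
  return std::nullopt;              // unknown; skip the tiny-loop bailout
}

int main() {
  if (auto TC = expectedTripCount({}, 12, 0))
    std::printf("expected trip count: %u\n", *TC);
}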
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) + LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" + "attribute is used.\n"); + ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NoImplicitFloat", L) << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints, ORE); return false; @@ -8587,17 +7403,30 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); + LLVM_DEBUG( + dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); ORE->emit( - createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) + createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) << "loop not vectorized due to unsafe FP support."); emitMissedWarning(F, L, Hints, ORE); return false; } + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + + // Analyze interleaved memory accesses. + if (UseInterleaved) { + IAI.analyzeInterleaving(); + } + // Use the cost model. LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints); + &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -8607,8 +7436,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - LoopVectorizationCostModel::VectorizationFactor VF = - LVP.plan(OptForSize, UserVF); + VectorizationFactor VF = LVP.plan(OptForSize, UserVF); // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); @@ -8620,14 +7448,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; if (Requirements.doesNotMeet(F, L, Hints)) { - DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " - "requirements.\n"); + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); emitMissedWarning(F, L, Hints, ORE); return false; } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", "the cost-model indicates that vectorization is not beneficial"); @@ -8636,7 +7464,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. - DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = std::make_pair( "InterleavingNotBeneficial", "the cost-model indicates that interleaving is not beneficial"); @@ -8648,8 +7476,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. 
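Aside: the code around this point sets two independent flags, VectorizeLoop and InterleaveLoop, and the emitted diagnostics cover their four combinations. A sketch of the resulting decision table (assumed inputs; the flag updates simplify the branches above and below):

#include <cstdio>

int main() {
  unsigned Width = 4, IC = 2, UserIC = 1; // chosen VF, chosen and user IC
  bool VectorizeLoop = Width > 1;
  bool InterleaveLoop =
      !(IC == 1 && UserIC <= 1) && !(IC > 1 && UserIC == 1);
  if (!VectorizeLoop && !InterleaveLoop)
    std::printf("do nothing\n");
  else if (!VectorizeLoop && InterleaveLoop)
    std::printf("interleave the scalar loop\n");
  else if (VectorizeLoop && !InterleaveLoop)
    std::printf("vectorize only\n");
  else
    std::printf("vectorize and interleave\n");
}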
- DEBUG(dbgs() - << "LV: Interleaving is beneficial but is explicitly disabled."); + LLVM_DEBUG( + dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); IntDiagMsg = std::make_pair( "InterleavingBeneficialButDisabled", "the cost-model indicates that interleaving is beneficial " @@ -8676,24 +7504,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); return false; } else if (!VectorizeLoop && InterleaveLoop) { - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, L->getStartLoc(), L->getHeader()) << VecDiagMsg.second; }); } else if (VectorizeLoop && !InterleaveLoop) { - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + << ") in " << DebugLocStr << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, L->getStartLoc(), L->getHeader()) << IntDiagMsg.second; }); } else if (VectorizeLoop && InterleaveLoop) { - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + << ") in " << DebugLocStr << '\n'); + LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } LVP.setBestPlan(VF.Width, IC); @@ -8740,7 +7568,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - DEBUG(verifyFunction(*L->getHeader()->getParent())); + LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } @@ -8788,7 +7616,7 @@ bool LoopVectorizePass::runImpl( SmallVector<Loop *, 8> Worklist; for (Loop *L : *LI) - addAcyclicInnerLoop(*L, Worklist); + collectSupportedLoops(*L, LI, ORE, Worklist); LoopsAnalyzed += Worklist.size(); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index a7ccd3faec44..ac8c4f046c6f 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160; /// regions to be handled. static const int MinScheduleRegionSize = 16; -/// \brief Predicate for the element types that the SLP vectorizer supports. +/// Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM /// vectors. We also filter target specific types which have absolutely no @@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) { /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 /// ret <4 x i8> %ins4 /// InstCombiner transforms this into a shuffle and vector mul +/// TODO: Can we split off and reuse the shuffle mask detection from +/// TargetTransformInfo::getInstructionThroughput? 
static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; - enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute}; + enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; for (unsigned I = 0, E = VL.size(); I < E; ++I) { auto *EI = cast<ExtractElementInst>(VL[I]); @@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) { continue; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. - if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2) + if (!Vec1 || Vec1 == Vec) + Vec1 = Vec; + else if (!Vec2 || Vec2 == Vec) + Vec2 = Vec; + else return None; if (CommonShuffleMode == Permute) continue; @@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) { CommonShuffleMode = Permute; continue; } - // Check the shuffle mode for the current operation. - if (!Vec1) - Vec1 = Vec; - else if (Vec != Vec1) - Vec2 = Vec; - // Example: shufflevector A, B, <0,5,2,7> - // I is odd and IntIdx for A == I - FirstAlternate shuffle. - // I is even and IntIdx for B == I - FirstAlternate shuffle. - // Example: shufflevector A, B, <4,1,6,3> - // I is even and IntIdx for A == I - SecondAlternate shuffle. - // I is odd and IntIdx for B == I - SecondAlternate shuffle. - const bool IIsEven = I & 1; - const bool CurrVecIsA = Vec == Vec1; - const bool IIsOdd = !IIsEven; - const bool CurrVecIsB = !CurrVecIsA; - ShuffleMode CurrentShuffleMode = - ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate - : SecondAlternate; - // Common mode is not set or the same as the shuffle mode of the current - // operation - alternate. - if (CommonShuffleMode == Unknown) - CommonShuffleMode = CurrentShuffleMode; - // Common shuffle mode is not the same as the shuffle mode of the current - // operation - permutation. - if (CommonShuffleMode != CurrentShuffleMode) - CommonShuffleMode = Permute; + CommonShuffleMode = Select; } // If we're not crossing lanes in different vectors, consider it as blending. - if ((CommonShuffleMode == FirstAlternate || - CommonShuffleMode == SecondAlternate) && - Vec2) - return TargetTransformInfo::SK_Alternate; + if (CommonShuffleMode == Select && Vec2) + return TargetTransformInfo::SK_Select; // If Vec2 was never used, we have a permutation of a single vector, otherwise // we have permutation of 2 vectors. return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc : TargetTransformInfo::SK_PermuteSingleSrc; } -///\returns Opcode that can be clubbed with \p Op to create an alternate -/// sequence which can later be merged as a ShuffleVector instruction. -static unsigned getAltOpcode(unsigned Op) { - switch (Op) { - case Instruction::FAdd: - return Instruction::FSub; - case Instruction::FSub: - return Instruction::FAdd; - case Instruction::Add: - return Instruction::Sub; - case Instruction::Sub: - return Instruction::Add; - default: - return 0; - } -} - -static bool isOdd(unsigned Value) { - return Value & 1; -} - -static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode, - unsigned CheckedOpcode) { - return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode; -} - -/// Chooses the correct key for scheduling data. If \p Op has the same (or -/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p -/// OpValue. 
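As a rough standalone illustration of the simplified classification (not LLVM code): model each extracted element by its source vector and lane, and the old FirstAlternate/SecondAlternate distinction collapses into the single question the rewritten isShuffle() asks, namely whether every lane stays in place. Undef elements and other details are ignored here.

#include <vector>

enum class Kind { SelectTwoSrc, PermuteSingleSrc, PermuteTwoSrc };

struct Extract {
  int Source;    // 0 or 1: which input vector the element comes from
  unsigned Lane; // lane read from that source
};

Kind classifyShuffle(const std::vector<Extract> &Mask) {
  bool UsesSecondSource = false;
  bool AllLanesInPlace = true;
  for (unsigned I = 0; I < Mask.size(); ++I) {
    UsesSecondSource |= (Mask[I].Source == 1);
    AllLanesInPlace &= (Mask[I].Lane == I); // a crossed lane forces a permute
  }
  if (AllLanesInPlace && UsesSecondSource)
    return Kind::SelectTwoSrc; // corresponds to TTI::SK_Select
  return UsesSecondSource ? Kind::PermuteTwoSrc : Kind::PermuteSingleSrc;
}

For example, extracts <A[0], B[1], A[2], B[3]> classify as a select, while <A[1], A[0]> is a single-source permute.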
-static Value *isOneOf(Value *OpValue, Value *Op) { - auto *I = dyn_cast<Instruction>(Op); - if (!I) - return OpValue; - auto *OpInst = cast<Instruction>(OpValue); - unsigned OpInstOpcode = OpInst->getOpcode(); - unsigned IOpcode = I->getOpcode(); - if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode)) - return Op; - return OpValue; -} - -namespace { - -/// Contains data for the instructions going to be vectorized. -struct RawInstructionsData { - /// Main Opcode of the instructions going to be vectorized. - unsigned Opcode = 0; - - /// The list of instructions have some instructions with alternate opcodes. - bool HasAltOpcodes = false; -}; - -} // end anonymous namespace - -/// Checks the list of the vectorized instructions \p VL and returns info about -/// this list. -static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) { - auto *I0 = dyn_cast<Instruction>(VL[0]); - if (!I0) - return {}; - RawInstructionsData Res; - unsigned Opcode = I0->getOpcode(); - // Walk through the list of the vectorized instructions - // in order to check its structure described by RawInstructionsData. - for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) { - auto *I = dyn_cast<Instruction>(VL[Cnt]); - if (!I) - return {}; - if (Opcode != I->getOpcode()) - Res.HasAltOpcodes = true; - } - Res.Opcode = Opcode; - return Res; -} - namespace { /// Main data required for vectorization of instructions. @@ -402,42 +306,90 @@ struct InstructionsState { /// The very first instruction in the list with the main opcode. Value *OpValue = nullptr; - /// The main opcode for the list of instructions. - unsigned Opcode = 0; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } /// Some of the instructions in the list have alternate opcodes. - bool IsAltShuffle = false; + bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; + } - InstructionsState() = default; - InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle) - : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {} + InstructionsState() = delete; + InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) + : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} }; } // end anonymous namespace +/// Chooses the correct key for scheduling data. If \p Op has the same (or +/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p +/// OpValue. +static Value *isOneOf(const InstructionsState &S, Value *Op) { + auto *I = dyn_cast<Instruction>(Op); + if (I && S.isOpcodeOrAlt(I)) + return Op; + return S.OpValue; +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. 
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL) { - auto Res = getMainOpcode(VL); - unsigned Opcode = Res.Opcode; - if (!Res.HasAltOpcodes) - return InstructionsState(VL[0], Opcode, false); - auto *OpInst = cast<Instruction>(VL[0]); - unsigned AltOpcode = getAltOpcode(Opcode); - // Examine each element in the list instructions VL to determine - // if some operations there could be considered as an alternative - // (for example as subtraction relates to addition operation). +static InstructionsState getSameOpcode(ArrayRef<Value *> VL, + unsigned BaseIndex = 0) { + // Make sure these are all Instructions. + if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + + bool IsCastOp = isa<CastInst>(VL[BaseIndex]); + bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); + unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); + unsigned AltOpcode = Opcode; + unsigned AltIndex = BaseIndex; + + // Check for one alternate opcode from another BinaryOperator. + // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - auto *I = cast<Instruction>(VL[Cnt]); - unsigned InstOpcode = I->getOpcode(); - if ((Res.HasAltOpcodes && - InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) || - (!Res.HasAltOpcodes && InstOpcode != Opcode)) { - return InstructionsState(OpInst, 0, false); - } + unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode(); + if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { + if (InstOpcode == Opcode || InstOpcode == AltOpcode) + continue; + if (Opcode == AltOpcode) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + continue; + } + } else if (IsCastOp && isa<CastInst>(VL[Cnt])) { + Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType(); + Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType(); + if (Ty0 == Ty1) { + if (InstOpcode == Opcode || InstOpcode == AltOpcode) + continue; + if (Opcode == AltOpcode) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + continue; + } + } + } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + continue; + return InstructionsState(VL[BaseIndex], nullptr, nullptr); } - return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes); + + return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]), + cast<Instruction>(VL[AltIndex])); } /// \returns true if all of the values in \p VL have the same type or false @@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) { } /// \returns True if Extract{Value,Element} instruction extracts element Idx. 
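The binary-operator arm of the new getSameOpcode() is essentially a two-opcode scan; here is a compilable sketch with plain opcode numbers standing in for Instruction opcodes (the cast arm additionally requires matching source operand types, which is omitted):

#include <optional>
#include <utility>
#include <vector>

using Opcode = unsigned;

// Returns {main, alternate} (equal when the bundle is uniform), or an empty
// optional once a third opcode appears and the bundle must be gathered.
std::optional<std::pair<Opcode, Opcode>>
findMainAndAltOpcode(const std::vector<Opcode> &Bundle) {
  if (Bundle.empty())
    return std::nullopt;
  Opcode Main = Bundle[0], Alt = Main;
  for (Opcode Op : Bundle) {
    if (Op == Main || Op == Alt)
      continue;        // matches one of the two allowed opcodes
    if (Main == Alt) { // no alternate adopted yet: this becomes it
      Alt = Op;
      continue;
    }
    return std::nullopt; // a third opcode: not an alt-shuffle bundle
  }
  return std::make_pair(Main, Alt);
}

With {fadd, fsub, fadd, fsub} this yields main fadd and alternate fsub, the classic pattern later emitted as two vector ops plus a select shuffle.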
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) { - assert(Opcode == Instruction::ExtractElement || - Opcode == Instruction::ExtractValue); +static Optional<unsigned> getExtractIndex(Instruction *E) { + unsigned Opcode = E->getOpcode(); + assert((Opcode == Instruction::ExtractElement || + Opcode == Instruction::ExtractValue) && + "Expected extractelement or extractvalue instruction."); if (Opcode == Instruction::ExtractElement) { - ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); - return CI && CI->getZExtValue() == Idx; - } else { - ExtractValueInst *EI = cast<ExtractValueInst>(E); - return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx; + auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + if (!CI) + return None; + return CI->getZExtValue(); } + ExtractValueInst *EI = cast<ExtractValueInst>(E); + if (EI->getNumIndices() != 1) + return None; + return *EI->idx_begin(); } /// \returns True if in-tree use also needs extract. This refers to @@ -549,7 +506,7 @@ public: MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } - /// \brief Vectorize the tree that starts with the elements in \p VL. + /// Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); @@ -585,8 +542,8 @@ public: ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); - NumLoadsWantToKeepOrder = 0; - NumLoadsWantToChangeOrder = 0; + NumOpsWantToKeepOrder.clear(); + NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -596,12 +553,22 @@ public: unsigned getTreeSize() const { return VectorizableTree.size(); } - /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(Function &F); + /// Perform LICM and CSE on the newly generated gather sequences. + void optimizeGatherSequence(); + + /// \returns The best order of instructions for vectorization. + Optional<ArrayRef<unsigned>> bestOrder() const { + auto I = std::max_element( + NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), + [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, + const decltype(NumOpsWantToKeepOrder)::value_type &D2) { + return D1.second < D2.second; + }); + if (I == NumOpsWantToKeepOrder.end() || + I->getSecond() <= NumOpsWantToKeepOriginalOrder) + return None; - /// \returns true if it is beneficial to reverse the vector order. - bool shouldReorder() const { - return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; + return makeArrayRef(I->getFirst()); } /// \return The vector element size in bits to use when vectorizing the @@ -625,7 +592,7 @@ public: return MinVecRegSize; } - /// \brief Check if ArrayType or StructType is isomorphic to some VectorType. + /// Check if ArrayType or StructType is isomorphic to some VectorType. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -648,9 +615,13 @@ private: /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); - /// \returns True if the ExtractElement/ExtractValue instructions in VL can - /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). 
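A small model of the matchExtractIndex to getExtractIndex change (FakeExtract is a made-up stand-in, not an LLVM type): returning the index itself instead of a yes/no answer is what lets callers recognize reordered-but-reusable extract sequences.

#include <optional>

struct FakeExtract {
  bool HasConstantIndex; // extractelement with a ConstantInt index?
  unsigned NumIndices;   // extractvalue may have nested indices
  unsigned Index;
};

std::optional<unsigned> extractIndexOf(const FakeExtract &E) {
  if (!E.HasConstantIndex || E.NumIndices != 1)
    return std::nullopt; // variable index or nested aggregate access
  return E.Index;
}

// A caller checking for the identity order:
//   if (auto Idx = extractIndexOf(E); Idx && *Idx == Lane) { ... }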
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const; + /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can + /// be vectorized to use the original vector (or aggregate "bitcast" to a + /// vector) and sets \p CurrentOrder to the identity permutation; otherwise + /// returns false, setting \p CurrentOrder to either an empty vector or a + /// non-identity permutation that allows reusing extract instructions. + bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, + SmallVectorImpl<unsigned> &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); @@ -658,22 +629,19 @@ /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef<Value *> VL); - /// \returns the pointer to the vectorized value if \p VL is already - /// vectorized, or NULL. They may happen in cycles. - Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const; - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty); + int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. int getGatherCost(ArrayRef<Value *> VL); - /// \brief Set the Builder insert point to one after the last instruction in + /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue); + void setInsertPointAfterBundle(ArrayRef<Value *> VL, + const InstructionsState &S); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -684,7 +652,8 @@ /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, + void reorderAltShuffleOperands(const InstructionsState &S, + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); @@ -698,8 +667,12 @@ /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { - assert(VL.size() == Scalars.size() && "Invalid size"); - return std::equal(VL.begin(), VL.end(), Scalars.begin()); + if (VL.size() == Scalars.size()) + return std::equal(VL.begin(), VL.end(), Scalars.begin()); + return VL.size() == ReuseShuffleIndices.size() && + std::equal( + VL.begin(), VL.end(), ReuseShuffleIndices.begin(), + [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. @@ -711,6 +684,12 @@ /// Do we need to gather this sequence ? bool NeedToGather = false; + /// Does this sequence require some shuffling? + SmallVector<unsigned, 4> ReuseShuffleIndices; + + /// Does this entry require reordering? + ArrayRef<unsigned> ReorderIndices; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -725,13 +704,17 @@ }; /// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, - int &UserTreeIdx) { + void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; + Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), + ReuseShuffleIndices.end()); + Last->ReorderIndices = ReorderIndices; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -744,7 +727,6 @@ private: if (UserTreeIdx >= 0) Last->UserTreeIndices.push_back(UserTreeIdx); UserTreeIdx = idx; - return Last; } /// -- Vectorization State -- @@ -758,13 +740,6 @@ private: return nullptr; } - const TreeEntry *getTreeEntry(Value *V) const { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; - return nullptr; - } - /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, int> ScalarToTreeEntry; @@ -1038,7 +1013,7 @@ private: template <typename ReadyListType> void schedule(ScheduleData *SD, ReadyListType &ReadyList) { SD->IsScheduled = true; - DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); + LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; while (BundleMember) { @@ -1061,8 +1036,8 @@ private: assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); - DEBUG(dbgs() - << "SLP: gets ready (def): " << *DepBundle << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); } @@ -1075,8 +1050,8 @@ private: assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); - DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle - << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } BundleMember = BundleMember->NextInBundle; @@ -1101,7 +1076,8 @@ private: doForAllOpcodes(I, [&](ScheduleData *SD) { if (SD->isSchedulingEntity() && SD->isReady()) { ReadyList.insert(SD); - DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: initially in ready list: " << *I << "\n"); } }); } @@ -1110,7 +1086,8 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue); + bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1120,7 +1097,7 @@ private: /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, Value *OpValue); + bool extendSchedulingRegion(Value *V, const InstructionsState &S); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -1201,11 +1178,38 @@ private: /// List of users to ignore during scheduling and that don't need extracting. ArrayRef<Value *> UserIgnoreList; - // Number of load bundles that contain consecutive loads. 
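The new ReuseShuffleIndices argument records, for every lane of the original bundle, which unique scalar fills it; the deduplication that computes it appears later in buildTree_rec and follows this try_emplace pattern, sketched standalone with int standing in for Value*:

#include <unordered_map>
#include <vector>

struct Deduped {
  std::vector<int> UniqueValues;             // one slot per distinct scalar
  std::vector<unsigned> ReuseShuffleIndices; // per-lane slot, empty if unused
};

Deduped dedupBundle(const std::vector<int> &VL) {
  Deduped D;
  std::unordered_map<int, unsigned> UniquePositions;
  for (int V : VL) {
    // try_emplace returns the existing slot for a repeated scalar.
    auto [It, Inserted] = UniquePositions.try_emplace(V, D.UniqueValues.size());
    D.ReuseShuffleIndices.push_back(It->second);
    if (Inserted)
      D.UniqueValues.push_back(V);
  }
  if (D.UniqueValues.size() == VL.size())
    D.ReuseShuffleIndices.clear(); // no repeats: no reuse shuffle needed
  return D;
}

For {a, b, a, b} this produces UniqueValues {a, b} and mask {0, 1, 0, 1}; the patch additionally insists the unique count be a power of two greater than one before vectorizing.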
- int NumLoadsWantToKeepOrder = 0; + using OrdersType = SmallVector<unsigned, 4>; + /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of + /// sorted SmallVectors of unsigned. + struct OrdersTypeDenseMapInfo { + static OrdersType getEmptyKey() { + OrdersType V; + V.push_back(~1U); + return V; + } + + static OrdersType getTombstoneKey() { + OrdersType V; + V.push_back(~2U); + return V; + } + + static unsigned getHashValue(const OrdersType &V) { + return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); + } + + static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { + return LHS == RHS; + } + }; - // Number of load bundles that contain consecutive loads in reversed order. - int NumLoadsWantToChangeOrder = 0; + /// Contains orders of operations along with the number of bundles that have + /// operations in this order. It stores only those orders that require + /// reordering, if reordering is not required it is counted using \a + /// NumOpsWantToKeepOriginalOrder. + DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; + /// Number of bundles that do not require reordering. + unsigned NumOpsWantToKeepOriginalOrder = 0; // Analysis and block reference. Function *F; @@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> { /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; - /// \brief Add the VectorizableTree to the index iterator to be able to return + /// Add the VectorizableTree to the index iterator to be able to return /// TreeEntry pointers. struct ChildIteratorType : public iterator_adaptor_base<ChildIteratorType, @@ -1340,17 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + int FoundLane = Lane; + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = + std::distance(Entry->ReuseShuffleIndices.begin(), + llvm::find(Entry->ReuseShuffleIndices, FoundLane)); + } // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { - DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << - Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, Lane); - continue; + LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " + << Lane << " from " << *Scalar << ".\n"); + ExternalUses.emplace_back(Scalar, nullptr, FoundLane); } for (User *U : Scalar->users()) { - DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast<Instruction>(U); if (!UserInst) @@ -1364,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, // be used. 
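OrdersTypeDenseMapInfo above supplies the empty/tombstone sentinel keys and hashing that DenseMap needs for SmallVector keys; the same counting scheme, including the bestOrder()-style decision, can be sketched with the standard library (made-up counts, illustrative only):

#include <algorithm>
#include <cstddef>
#include <unordered_map>
#include <vector>

using OrdersType = std::vector<unsigned>;

struct OrdersHash {
  size_t operator()(const OrdersType &V) const {
    size_t H = 0;
    for (unsigned X : V) // simple combine; the patch uses hash_combine_range
      H = H * 31u + X;
    return H;
  }
};

int main() {
  std::unordered_map<OrdersType, unsigned, OrdersHash> NumOpsWantToKeepOrder;
  unsigned NumOpsWantToKeepOriginalOrder = 0;

  ++NumOpsWantToKeepOrder[{1, 0, 3, 2}]; // two bundles prefer this order
  ++NumOpsWantToKeepOrder[{1, 0, 3, 2}];
  ++NumOpsWantToKeepOriginalOrder;       // one bundle is already in order

  auto Best = std::max_element(
      NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
      [](const auto &A, const auto &B) { return A.second < B.second; });
  // Reorder only when some order strictly beats keeping the original one.
  bool ShouldReorder = Best != NumOpsWantToKeepOrder.end() &&
                       Best->second > NumOpsWantToKeepOriginalOrder;
  return ShouldReorder ? 0 : 1;
}

DenseMap cannot take arbitrary keys directly, which is why the patch reserves two sentinel vectors ({~1U} and {~2U}) that no real order will ever equal.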
if (UseScalar != U || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { - DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U - << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U + << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); continue; } @@ -1375,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, if (is_contained(UserIgnoreList, UserInst)) continue; - DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << - Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); + LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " + << Lane << " from " << *Scalar << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); } } } @@ -1389,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { - DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { - DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { - DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { - DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1421,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't vectorize ephemeral values. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (EphValues.count(VL[i])) { - DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << - ") is ephemeral.\n"); + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + << ") is ephemeral.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1430,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); - if (E->Scalars[i] != VL[i]) { - DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); + if (!E->isSame(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; } // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. 
E->UserTreeIndices.push_back(UserTreeIdx); - DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); return; } @@ -1451,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!I) continue; if (getTreeEntry(I)) { - DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << - ") is already in tree.\n"); + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + << ") is already in tree.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1462,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { - DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1476,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. - DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); + LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. - for (unsigned i = 0, e = VL.size(); i < e; ++i) - for (unsigned j = i + 1; j < e; ++j) - if (VL[i] == VL[j]) { - DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } + SmallVector<unsigned, 4> ReuseShuffleIndicies; + SmallVector<Value *, 4> UniqueValues; + DenseMap<Value *, unsigned> UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(Res.first->second); + if (Res.second) + UniqueValues.emplace_back(V); + } + if (UniqueValues.size() == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } + VL = UniqueValues; + } auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -1496,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { - DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); + if (!BS.tryScheduleBundle(VL, this, S)) { + LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } - DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : S.Opcode; + unsigned ShuffleOrOp = S.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); @@ -1518,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TerminatorInst *Term = dyn_cast<TerminatorInst>( cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { - DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); + LLVM_DEBUG( + dbgs() + << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; @@ -1541,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0); + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { - DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); - } else { - BS.cancelScheduling(VL, VL0); + LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); + ++NumOpsWantToKeepOriginalOrder; + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + ReuseShuffleIndicies); + return; } - newTreeEntry(VL, Reuse, UserTreeIdx); + if (!CurrentOrder.empty()) { + LLVM_DEBUG({ + dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " + "with order"; + for (unsigned Idx : CurrentOrder) + dbgs() << " " << Idx; + dbgs() << "\n"; + }); + // Insert new order with initial value 0, if it does not exist, + // otherwise return the iterator to the existing one. + auto StoredCurrentOrderAndNum = + NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++StoredCurrentOrderAndNum->getSecond(); + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + StoredCurrentOrderAndNum->getFirst()); + return; + } + LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL, VL0); return; } case Instruction::Load: { @@ -1562,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. 
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { - LoadInst *L = cast<LoadInst>(VL[i]); + SmallVector<Value *, 4> PointerOps(VL.size()); + auto POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } + *POIter = L->getPointerOperand(); + ++POIter; } - // Check if the loads are consecutive, reversed, or neither. - // TODO: What we really want is to sort the loads, but for now, check - // the two likely directions. - bool Consecutive = true; - bool ReverseConsecutive = true; - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - Consecutive = false; - break; + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); } else { - ReverseConsecutive = false; + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; } - } - - if (Consecutive) { - ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - return; - } - - // If none of the load pairs were consecutive when checked in order, - // check the reverse order. - if (ReverseConsecutive) - for (unsigned i = VL.size() - 1; i > 0; --i) - if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) { - ReverseConsecutive = false; - break; + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted loads are consecutive. + if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original loads are consecutive and do not require reordering. + ++NumOpsWantToKeepOriginalOrder; + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + } else { + // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++I->getSecond(); + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } + return; + } + } + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - - if (ReverseConsecutive) { - ++NumLoadsWantToChangeOrder; - DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); - } else { - DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - } + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1637,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() + << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; @@ -1665,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() + << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; @@ -1703,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right); + reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); buildTree_rec(Right, Depth + 1, UserTreeIdx); return; @@ -1730,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We don't combine GEPs with complicated (nested) indexing. 
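A standalone model of the consecutiveness test above, with plain byte offsets standing in for what SCEV computes symbolically: sort the offsets, keep the sorting permutation as the CurrentOrder analogue, and accept the bundle only when consecutive sorted offsets advance by exactly one element. An identity result corresponds to "a vector of loads", any other order to "jumbled loads".

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <optional>
#include <vector>

// Returns the lane order that makes the loads consecutive, or an empty
// optional when a gap or duplicate means the bundle must be gathered.
std::optional<std::vector<unsigned>>
consecutiveLoadOrder(const std::vector<int64_t> &Offsets, int64_t EltSize) {
  std::vector<unsigned> Order(Offsets.size());
  std::iota(Order.begin(), Order.end(), 0u);
  std::sort(Order.begin(), Order.end(),
            [&](unsigned A, unsigned B) { return Offsets[A] < Offsets[B]; });
  // Stricter than the code above, which compares only the two endpoints
  // after sortPtrAccesses has already vetted the bundle.
  for (size_t I = 1; I < Order.size(); ++I)
    if (Offsets[Order[I]] - Offsets[Order[I - 1]] != EltSize)
      return std::nullopt;
  return Order;
}

For offsets {8, 0, 24, 16} with an element size of 8 this yields the order {1, 0, 3, 2}, the reordering case that feeds NumOpsWantToKeepOrder.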
for (unsigned j = 0; j < VL.size(); ++j) { if (cast<Instruction>(VL[j])->getNumOperands() != 2) { - DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); + LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1743,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j < VL.size(); ++j) { Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); if (Ty0 != CurTy) { - DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); + LLVM_DEBUG(dbgs() + << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1754,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j < VL.size(); ++j) { auto Op = cast<Instruction>(VL[j])->getOperand(1); if (!isa<ConstantInt>(Op)) { - DEBUG( - dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); + LLVM_DEBUG(dbgs() + << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1779,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) @@ -1802,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } Function *Int = CI->getCalledFunction(); @@ -1816,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << "\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument @@ -1827,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = 
CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI - << " argument "<< A1I<<"!=" << A1J - << "\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument " << A1I << "!=" << A1J << "\n"); return; } } @@ -1840,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" - << *VL[i] << '\n'); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" + << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1862,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!S.IsAltShuffle) { + if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); - DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S.Opcode, VL, Left, Right); + reorderAltShuffleOperands(S, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); buildTree_rec(Right, Depth + 1, UserTreeIdx); return; @@ -1892,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } } @@ -1923,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const { +bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, + SmallVectorImpl<unsigned> &CurrentOrder) const { Instruction *E0 = cast<Instruction>(OpValue); assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); + CurrentOrder.clear(); + // We have to extract from a vector/aggregate with the same number of elements. 
unsigned NElts; if (E0->getOpcode() == Instruction::ExtractValue) { @@ -1951,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const { return false; // Check that all of the indices extract from the correct offset. - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *Inst = cast<Instruction>(VL[I]); - if (!matchExtractIndex(Inst, I, Inst->getOpcode())) - return false; + bool ShouldKeepOrder = true; + unsigned E = VL.size(); + // Assign to all items the initial value E + 1 so we can check if the extract + // instruction index was used already. + // Also, later we can check that all the indices are used and we have a + // consecutive access in the extract instructions, by checking that no + // element of CurrentOrder still has value E + 1. + CurrentOrder.assign(E, E + 1); + unsigned I = 0; + for (; I < E; ++I) { + auto *Inst = cast<Instruction>(VL[I]); if (Inst->getOperand(0) != Vec) - return false; + break; + Optional<unsigned> Idx = getExtractIndex(Inst); + if (!Idx) + break; + const unsigned ExtIdx = *Idx; + if (ExtIdx != I) { + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + break; + ShouldKeepOrder = false; + CurrentOrder[ExtIdx] = I; + } else { + if (CurrentOrder[I] != E + 1) + break; + CurrentOrder[I] = I; + } + } + if (I < E) { + CurrentOrder.clear(); + return false; } - return true; + return ShouldKeepOrder; } bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { @@ -1985,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { VecTy = VectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + int ReuseShuffleCost = 0; + if (NeedToShuffleReuses) { + ReuseShuffleCost = + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } if (E->NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + return ReuseShuffleCost + + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { + if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); @@ -2008,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { IO->getZExtValue()); } } - return Cost; + return ReuseShuffleCost + Cost; } } - return getGatherCost(E->Scalars); + return ReuseShuffleCost + getGatherCost(VL); } InstructionsState S = getSameOpcode(VL); - assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : S.Opcode; + unsigned ShuffleOrOp = S.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; case Instruction::ExtractValue: case Instruction::ExtractElement: - if (canReuseExtract(VL, S.OpValue)) { - int DeadCost = 0; + if (NeedToShuffleReuses) { + unsigned Idx = 0; + for (unsigned I : E->ReuseShuffleIndices) { + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast<ConstantInt>( + cast<ExtractElementInst>(VL[I])->getIndexOperand()); + Idx = IO->getZExtValue(); + ReuseShuffleCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); + } else { + ReuseShuffleCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); + ++Idx; + } + } + Idx = ReuseShuffleNumbers; + for (Value *V : VL) { + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast<ConstantInt>( + cast<ExtractElementInst>(V)->getIndexOperand()); + Idx = IO->getZExtValue(); + } else { + --Idx; + } + ReuseShuffleCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); + } + } + if (!E->NeedToGather) { + int DeadCost = ReuseShuffleCost; + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + DeadCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast<Instruction>(VL[i]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if it has only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) + if (areAllUsersVectorized(E)) { // Take credit for instruction that will become dead. - DeadCost += + if (E->hasOneUse()) { + Instruction *Ext = E->user_back(); + if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa<GetElementPtrInst>(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + DeadCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, i); + // Add back the cost of s|zext which is subtracted separately. + DeadCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + continue; + } + } + DeadCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + } } - return -DeadCost; + return DeadCost; } - return getGatherCost(VecTy); + return ReuseShuffleCost + getGatherCost(VL); case Instruction::ZExt: case Instruction::SExt: @@ -2053,24 +2191,37 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); + int ScalarEltCost = + TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy, VL0); + int ScalarCost = VL.size() * ScalarEltCost; VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); + int VecCost = 0; + // Check if the values are candidates to demote. + if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { + VecCost = ReuseShuffleCost + + TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + } return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + Builder.getInt1Ty(), VL0); + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); - int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); - return VecCost - ScalarCost; + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: case Instruction::FAdd: @@ -2099,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = - TargetTransformInfo::OP_None; + TargetTransformInfo::OP_PowerOf2; // If all operands are exactly the same ConstantInt then set the // operand kind to OK_UniformConstantValue. // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt = nullptr; - for (unsigned i = 0; i < VL.size(); ++i) { + ConstantInt *CInt0 = nullptr; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { const Instruction *I = cast<Instruction>(VL[i]); - if (!isa<ConstantInt>(I->getOperand(1))) { + ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; break; } + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; if (i == 0) { - CInt = cast<ConstantInt>(I->getOperand(1)); + CInt0 = CInt; continue; } - if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && - CInt != cast<ConstantInt>(I->getOperand(1))) + if (CInt0 != CInt) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - // FIXME: Currently cost of model modification for division by power of - // 2 is handled for X86 and AArch64. Add support for other targets. 
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && - CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_PowerOf2; SmallVector<const Value *, 4> Operands(VL0->operand_values()); - int ScalarCost = - VecTy->getNumElements() * - TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, - Op2VP, Operands); - int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP, Operands); - return VecCost - ScalarCost; + int ScalarEltCost = TTI->getArithmeticInstrCost( + S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = @@ -2142,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - int ScalarCost = - VecTy->getNumElements() * + int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); - - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); - int ScalarLdCost = VecTy->getNumElements() * + unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, - VecTy, alignment, 0, VL0); - return VecLdCost - ScalarLdCost; + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + int VecLdCost = + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + VecLdCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } + return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); - int ScalarStCost = VecTy->getNumElements() * + unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, alignment, 0, VL0); - return VecStCost - ScalarStCost; + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; + int VecStCost = + TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
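The operand-kind scan above can be modelled without TTI types: classify the bundle's second operands for the cost model, and drop the power-of-two property the moment one constant misses it. Here std::optional<int> stands in for "a ConstantInt, or not a constant at all".

#include <optional>
#include <vector>

enum class OperandKind { UniformConstant, NonUniformConstant, AnyValue };

struct Classification {
  OperandKind Kind = OperandKind::UniformConstant;
  bool AllPowerOfTwo = true; // analogue of starting from OP_PowerOf2
};

Classification
classifySecondOperands(const std::vector<std::optional<int>> &Ops) {
  Classification C;
  std::optional<int> First;
  for (const auto &Op : Ops) {
    if (!Op) // a non-constant operand degrades everything at once
      return {OperandKind::AnyValue, false};
    if (*Op <= 0 || (*Op & (*Op - 1)) != 0)
      C.AllPowerOfTwo = false; // analogue of falling back to OP_None
    if (!First)
      First = *Op;
    else if (*Op != *First)
      C.Kind = OperandKind::NonUniformConstant;
  }
  return C;
}

So {4, 4, 4} stays a uniform power-of-two constant, {4, 2} becomes non-uniform but keeps the power-of-two property, and {4, x} is AnyValue.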
- SmallVector<Type*, 4> ScalarTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) + SmallVector<Type *, 4> ScalarTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) ScalarTys.push_back(CI->getArgOperand(op)->getType()); FastMathFlags FMF; if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) FMF = FPMO->getFastMathFlags(); - int ScalarCallCost = VecTy->getNumElements() * + int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); + if (NeedToShuffleReuses) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + } + int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; SmallVector<Value *, 4> Args(CI->arg_operands()); int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, VecTy->getNumElements()); - DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost - << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *CI << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost + << " (" << VecCallCost << "-" << ScalarCallCost << ")" + << " for " << *CI << "\n"); - return VecCallCost - ScalarCallCost; + return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; + assert(S.isAltShuffle() && + ((Instruction::isBinaryOp(S.getOpcode()) && + Instruction::isBinaryOp(S.getAltOpcode())) || + (Instruction::isCast(S.getOpcode()) && + Instruction::isCast(S.getAltOpcode()))) && + "Invalid Shuffle Vector Operand"); int ScalarCost = 0; - int VecCost = 0; + if (NeedToShuffleReuses) { + for (unsigned Idx : E->ReuseShuffleIndices) { + Instruction *I = cast<Instruction>(VL[Idx]); + ReuseShuffleCost -= TTI->getInstructionCost( + I, TargetTransformInfo::TCK_RecipThroughput); + } + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + ReuseShuffleCost += TTI->getInstructionCost( + I, TargetTransformInfo::TCK_RecipThroughput); + } + } for (Value *i : VL) { Instruction *I = cast<Instruction>(i); - if (!I) - break; - ScalarCost += - TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + ScalarCost += TTI->getInstructionCost( + I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
- Instruction *I0 = cast<Instruction>(VL[0]); - VecCost = - TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); - Instruction *I1 = cast<Instruction>(VL[1]); - VecCost += - TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); - return VecCost - ScalarCost; + int VecCost = 0; + if (Instruction::isBinaryOp(S.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + } else { + Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); + VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); + VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + } + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); + return ReuseShuffleCost + VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); @@ -2226,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } bool BoUpSLP::isFullyVectorizableTinyTree() { - DEBUG(dbgs() << "SLP: Check whether the tree with height " << - VectorizableTree.size() << " is fully vectorizable .\n"); + LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " + << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) @@ -2297,13 +2485,13 @@ int BoUpSLP::getSpillCost() { LiveValues.insert(cast<Instruction>(&*J)); } - DEBUG( + LLVM_DEBUG({ dbgs() << "SLP: #LV: " << LiveValues.size(); for (auto *X : LiveValues) dbgs() << " " << X->getName(); dbgs() << ", Looking at "; Inst->dump(); - ); + }); // Now find the sequence of instructions between PrevInst and Inst. BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), @@ -2315,7 +2503,10 @@ int BoUpSLP::getSpillCost() { continue; } - if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { + // Debug information doesn't impact spill cost. + if ((isa<CallInst>(&*PrevInstIt) && + !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && + &*PrevInstIt != PrevInst) { SmallVector<Type*, 4> V; for (auto *II : LiveValues) V.push_back(VectorType::get(II->getType(), BundleWidth)); @@ -2333,19 +2524,41 @@ int BoUpSLP::getTreeCost() { int Cost = 0; - DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << - VectorizableTree.size() << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (TreeEntry &TE : VectorizableTree) { + for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { + TreeEntry &TE = VectorizableTree[I]; + + // We create duplicate tree entries for gather sequences that have multiple + // uses. However, we should not compute the cost of duplicate sequences. + // For example, if we have a build vector (i.e., insertelement sequence) + // that is used by more than one vector instruction, we only need to + // compute the cost of the insertelement instructions once. The redundant + // instructions will be eliminated by CSE. + // + // We should consider not creating duplicate tree entries for gather + // sequences, and instead add additional edges to the tree representing + // their uses.
Since such an approach results in fewer total entries, + existing heuristics based on tree size may yield different results. + // + if (TE.NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), [TE](TreeEntry &Entry) { + return Entry.NeedToGather && Entry.isSame(TE.Scalars); + })) + continue; + int C = getEntryCost(&TE); - DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *TE.Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for bundle that starts with " << *TE.Scalars[0] + << ".\n"); Cost += C; } - SmallSet<Value *, 16> ExtractCostCalculated; + SmallPtrSet<Value *, 16> ExtractCostCalculated; int ExtractCost = 0; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. @@ -2386,7 +2599,7 @@ int BoUpSLP::getTreeCost() { << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; } - DEBUG(dbgs() << Str); + LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); @@ -2394,10 +2607,14 @@ int BoUpSLP::getTreeCost() { return Cost; } -int BoUpSLP::getGatherCost(Type *Ty) { +int BoUpSLP::getGatherCost(Type *Ty, + const DenseSet<unsigned> &ShuffledIndices) { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (!ShuffledIndices.count(i)) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (!ShuffledIndices.empty()) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } @@ -2408,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. - return getGatherCost(VecTy); + // Check if the same elements are inserted several times and count them as + // shuffle candidates. + DenseSet<unsigned> ShuffledElements; + DenseSet<Value *> UniqueElements; + // Iterate in reverse order to consider insert elements with the high cost. + for (unsigned I = VL.size(); I > 0; --I) { + unsigned Idx = I - 1; + if (!UniqueElements.insert(VL[Idx]).second) + ShuffledElements.insert(Idx); + } + return getGatherCost(VecTy, ShuffledElements); } // Reorder commutative operations in alternate shuffle if the resulting vectors // would improve the vectorization, such as: // 4. load a[0] + load b[0] // load b[1] + load a[1] // load a[2] + load b[2] // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code.
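The load-reordering idea in the comment above can be illustrated with a small self-contained sketch. This is plain C++ with simplified stand-in types (a hypothetical Operand struct and a toy isConsecutive predicate), not the LLVM API; it only models the effect of the swapping that reorderAltShuffleOperands performs via isConsecutiveAccess:

#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical stand-in for a load operand: base array plus element index.
struct Operand {
  char Array;
  int Index;
};

// Toy isConsecutiveAccess: same base array and adjacent element indices.
static bool isConsecutive(Operand A, Operand B) {
  return A.Array == B.Array && A.Index + 1 == B.Index;
}

int main() {
  // One {left, right} operand pair per lane, as in the example above; the
  // operands of lanes 1 and 3 arrive swapped.
  std::vector<std::pair<Operand, Operand>> Lanes = {
      {{'a', 0}, {'b', 0}},
      {{'b', 1}, {'a', 1}},
      {{'a', 2}, {'b', 2}},
      {{'b', 3}, {'a', 3}},
  };
  // Swap a lane's operands when the swap extends a consecutive run of loads
  // in the left column.
  for (unsigned j = 0; j + 1 < Lanes.size(); ++j)
    if (!isConsecutive(Lanes[j].first, Lanes[j + 1].first) &&
        isConsecutive(Lanes[j].first, Lanes[j + 1].second))
      std::swap(Lanes[j + 1].first, Lanes[j + 1].second);
  for (const auto &L : Lanes) // prints a[0..3] on the left, b[0..3] on the right
    std::printf("%c[%d] %c[%d]\n", L.first.Array, L.first.Index,
                L.second.Array, L.second.Index);
  return 0;
}

After the swaps the left column reads a[0..3] and the right column b[0..3], so each operand column can be fed by a single wide load.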
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, +void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right) { // Push left and right operands of binary operation into Left and Right - unsigned AltOpcode = getAltOpcode(Opcode); - (void)AltOpcode; for (Value *V : VL) { auto *I = cast<Instruction>(V); - assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && - "Incorrect instruction in vector"); + assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } @@ -2609,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] - for (unsigned j = 0; j < VL.size() - 1; ++j) { + for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2630,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, + const InstructionsState &S) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast<Instruction>(OpValue); + auto *Front = cast<Instruction>(S.OpValue); auto *BB = Front->getParent(); - const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode(); - const unsigned AltOpcode = getAltOpcode(Opcode); assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - return !sameOpcodeOrAlt(Opcode, AltOpcode, - cast<Instruction>(V)->getOpcode()) || - cast<Instruction>(V)->getParent() == BB; + auto *I = cast<Instruction>(V); + return !S.isOpcodeOrAlt(I) || I->getParent() == BB; })); // The last instruction in the bundle in program order. @@ -2652,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) { // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back())); + BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -2680,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) { if (!LastInst) { SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode())) + if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; @@ -2706,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { if (TreeEntry *E = getTreeEntry(VL[i])) { // Find which lane we need to extract. int FoundLane = -1; - for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { + for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? 
if (E->Scalars[Lane] == VL[i]) { FoundLane = Lane; @@ -2714,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { } } assert(FoundLane >= 0 && "Could not find the correct lane"); + if (!E->ReuseShuffleIndices.empty()) { + FoundLane = + std::distance(E->ReuseShuffleIndices.begin(), + llvm::find(E->ReuseShuffleIndices, FoundLane)); + } ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); } } @@ -2722,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { return Vec; } -Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const { - if (const TreeEntry *En = getTreeEntry(OpValue)) { - if (En->isSame(VL) && En->VectorizedValue) - return En->VectorizedValue; - } - return nullptr; -} - Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { InstructionsState S = getSameOpcode(VL); - if (S.Opcode) { + if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) - return vectorizeTree(E); + if (E->isSame(VL)) { + Value *V = vectorizeTree(E); + if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + // We need to get the vectorized value but without shuffle. + if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) { + V = SV->getOperand(0); + } else { + // Reshuffle to get only unique values. + SmallVector<unsigned, 4> UniqueIdxs; + SmallSet<unsigned, 4> UsedIdxs; + for (unsigned Idx : E->ReuseShuffleIndices) + if (UsedIdxs.insert(Idx).second) + UniqueIdxs.emplace_back(Idx); + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + UniqueIdxs); + } + } + return V; + } } } Type *ScalarTy = S.OpValue->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); + + // Check that every instruction appears once in this bundle. + SmallVector<unsigned, 4> ReuseShuffleIndicies; + SmallVector<Value *, 4> UniqueValues; + if (VL.size() > 2) { + DenseMap<Value *, unsigned> UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(Res.first->second); + if (Res.second || isa<Constant>(V)) + UniqueValues.emplace_back(V); + } + // Do not shuffle a single element, or if the number of unique values is + // not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || + !llvm::isPowerOf2_32(UniqueValues.size())) + ReuseShuffleIndicies.clear(); + else + VL = UniqueValues; + } VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - return Gather(VL, VecTy); + Value *V = Gather(VL, VecTy); + if (!ReuseShuffleIndicies.empty()) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + ReuseShuffleIndicies, "shuffle"); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } + return V; +} + +static void inversePermutation(ArrayRef<unsigned> Indices, + SmallVectorImpl<unsigned> &Mask) { + Mask.clear(); + const unsigned E = Indices.size(); + Mask.resize(E); + for (unsigned I = 0; I < E; ++I) + Mask[Indices[I]] = I; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + Instruction *VL0 = cast<Instruction>(S.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } E->VectorizedValue = V; return V; } - unsigned ShuffleOrOp = S.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : S.Opcode; + unsigned ShuffleOrOp = S.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - E->VectorizedValue = NewPhi; + Value *V = NewPhi; + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; // PHINodes may have multiple entries from the same block. We want to // visit every block once. - SmallSet<BasicBlock*, 4> VisitedBBs; + SmallPtrSet<BasicBlock*, 4> VisitedBBs; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; @@ -2804,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return NewPhi; + return V; } case Instruction::ExtractElement: { - if (canReuseExtract(E->Scalars, VL0)) { + if (!E->NeedToGather) { Value *V = VL0->getOperand(0); + if (!E->ReorderIndices.empty()) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. 
+ if (!E->ReorderIndices.empty()) + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (canReuseExtract(E->Scalars, VL0)) { + if (!E->NeedToGather) { LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); - E->VectorizedValue = V; - return propagateMetadata(V, E->Scalars); + Value *NewV = propagateMetadata(V, E->Scalars); + if (!E->ReorderIndices.empty()) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. + NewV = Builder.CreateShuffleVector( + NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = NewV; + return NewV; } - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } E->VectorizedValue = V; return V; } @@ -2849,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (Value *V : E->Scalars) INVL.push_back(cast<Instruction>(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Value *InVec = vectorizeTree(INVL); - if (Value *V = alreadyVectorized(E->Scalars, VL0)) - return V; + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } CastInst *CI = dyn_cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -2870,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { RHSV.push_back(cast<Instruction>(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); - if (Value *V = alreadyVectorized(E->Scalars, VL0)) - return V; + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.Opcode == Instruction::FCmp) + if (S.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); + propagateIRFlags(V, E->Scalars, VL0); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + 
E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -2898,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); - if (Value *V = alreadyVectorized(E->Scalars, VL0)) - return V; + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } Value *V = Builder.CreateSelect(Cond, True, False); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -2932,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, + reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, RHSVL); else for (Value *V : E->Scalars) { @@ -2941,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { RHSVL.push_back(I->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars, VL0)) - return V; + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); + static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + propagateIRFlags(V, E->Scalars, VL0); + if (auto *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; - if (Instruction *I = dyn_cast<Instruction>(V)) - return propagateMetadata(I, E->Scalars); - return V; } case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - setInsertPointAfterBundle(E->Scalars, VL0); + bool IsReorder = !E->ReorderIndices.empty(); + if (IsReorder) { + S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + VL0 = cast<Instruction>(S.OpValue); + } + setInsertPointAfterBundle(E->Scalars, S); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -2985,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } LI->setAlignment(Alignment); - E->VectorizedValue = LI; + Value *V = propagateMetadata(LI, E->Scalars); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + Mask, "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. 
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; ++NumVectorInstructions; - return propagateMetadata(LI, E->Scalars); + return V; } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(VL0); @@ -2998,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); - StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -3014,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - S->setAlignment(Alignment); - E->VectorizedValue = S; + ST->setAlignment(Alignment); + Value *V = propagateMetadata(ST, E->Scalars); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; ++NumVectorInstructions; - return propagateMetadata(S, E->Scalars); + return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); ValueList Op0VL; for (Value *V : E->Scalars) @@ -3041,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateGEP( cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); + if (Instruction *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; - if (Instruction *I = dyn_cast<Instruction>(V)) - return propagateMetadata(I, E->Scalars); - return V; } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, VL0); + setInsertPointAfterBundle(E->Scalars, S); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3075,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *OpVec = vectorizeTree(OpVL); - DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3093,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); + propagateIRFlags(V, E->Scalars, VL0); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(Instruction::isBinaryOp(S.Opcode) && + assert(S.isAltShuffle() && + ((Instruction::isBinaryOp(S.getOpcode()) && + Instruction::isBinaryOp(S.getAltOpcode())) || + (Instruction::isCast(S.getOpcode()) && + Instruction::isCast(S.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - 
reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, VL0); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); - - if (Value *V = alreadyVectorized(E->Scalars, VL0)) - return V; + Value *LHS, *RHS; + if (Instruction::isBinaryOp(S.getOpcode())) { + reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, S); + LHS = vectorizeTree(LHSVL); + RHS = vectorizeTree(RHSVL); + } else { + ValueList INVL; + for (Value *V : E->Scalars) + INVL.push_back(cast<Instruction>(V)->getOperand(0)); + setInsertPointAfterBundle(E->Scalars, S); + LHS = vectorizeTree(INVL); + } - // Create a vector of LHS op1 RHS - Value *V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; + } - unsigned AltOpcode = getAltOpcode(S.Opcode); - // Create a vector of LHS op2 RHS - Value *V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS); + Value *V0, *V1; + if (Instruction::isBinaryOp(S.getOpcode())) { + V0 = Builder.CreateBinOp( + static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + V1 = Builder.CreateBinOp( + static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + } else { + V0 = Builder.CreateCast( + static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + V1 = Builder.CreateCast( + static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + } // Create shuffle to take alternate operations from the vector. - // Also, gather up odd and even scalar ops to propagate IR flags to + // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. - ValueList OddScalars, EvenScalars; + ValueList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { - if (isOdd(i)) { + auto *OpInst = cast<Instruction>(E->Scalars[i]); + assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == S.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); - OddScalars.push_back(E->Scalars[i]); + AltScalars.push_back(E->Scalars[i]); } else { Mask[i] = Builder.getInt32(i); - EvenScalars.push_back(E->Scalars[i]); + OpScalars.push_back(E->Scalars[i]); } } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, EvenScalars); - propagateIRFlags(V1, OddScalars); + propagateIRFlags(V0, OpScalars); + propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + if (Instruction *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; - if (Instruction *I = dyn_cast<Instruction>(V)) - return propagateMetadata(I, E->Scalars); return V; } @@ -3183,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { VectorizableTree[0].VectorizedValue = Trunc; } - DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() + << " values .\n"); // If necessary, sign-extend or zero-extend ScalarRoot to the larger type // specified by ScalarType. 
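For the alternate-opcode case handled under Instruction::ShuffleVector above, the emitted pattern is: compute the whole bundle with the main opcode (V0), compute it again with the alternate opcode (V1), then blend with a mask whose entry is i for main-opcode lanes and e + i for alternate-opcode lanes. Below is a rough standalone model in plain C++, with ordinary arrays standing in for IR vectors and an assumed {add, sub, add, sub} lane pattern; it is a sketch of the semantics, not the LLVM implementation:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> A = {10, 20, 30, 40}, B = {1, 2, 3, 4};
  const unsigned e = static_cast<unsigned>(A.size());
  // V0: every lane computed with the main opcode (add here);
  // V1: every lane computed with the alternate opcode (sub here).
  std::vector<int> V0(e), V1(e);
  for (unsigned i = 0; i < e; ++i) {
    V0[i] = A[i] + B[i];
    V1[i] = A[i] - B[i];
  }
  // Alternate-opcode lanes pick index e + i, main-opcode lanes pick index i,
  // exactly like the Mask built in the ShuffleVector case above.
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < e; ++i)
    Mask.push_back(i % 2 ? e + i : i);
  // shufflevector(V0, V1, Mask): indices below e read V0, the rest read V1.
  for (unsigned i = 0; i < e; ++i) {
    const unsigned M = Mask[i];
    std::printf("%d ", M < e ? V0[M] : V1[M - e]);
  }
  std::printf("\n"); // prints: 11 18 33 36
  return 0;
}

This also explains the cost model in the getEntryCost hunk earlier: the vector cost is priced as exactly these three pieces, two wide operations plus one SK_Select shuffle.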
@@ -3260,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); - } + } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); @@ -3269,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { User->replaceUsesOfWith(Scalar, Ex); } - DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } // For each vectorized value: @@ -3290,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { if (!Ty->isVoidTy()) { #ifndef NDEBUG for (User *U : Scalar->users()) { - DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && @@ -3300,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } - DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } } @@ -3310,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { return VectorizableTree[0].VectorizedValue; } -void BoUpSLP::optimizeGatherSequence(Function &F) { - DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() - << " gather sequences instructions.\n"); +void BoUpSLP::optimizeGatherSequence() { + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *it : GatherSeq) { - InsertElementInst *Insert = dyn_cast<InsertElementInst>(it); - - if (!Insert) + for (Instruction *I : GatherSeq) { + if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) continue; // Check if this block is inside a loop. - Loop *L = LI->getLoopFor(Insert->getParent()); + Loop *L = LI->getLoopFor(I->getParent()); if (!L) continue; @@ -3333,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); - Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); - if (CurrVec && L->contains(CurrVec)) + auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); + auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); + if (Op0 && L->contains(Op0)) continue; - if (NewElem && L->contains(NewElem)) + if (Op1 && L->contains(Op1)) continue; // We can hoist this instruction. Move it to the pre-header. - Insert->moveBefore(PreHeader->getTerminator()); + I->moveBefore(PreHeader->getTerminator()); } + // Make a list of all reachable blocks in our CSE queue. + SmallVector<const DomTreeNode *, 8> CSEWorkList; + CSEWorkList.reserve(CSEBlocks.size()); + for (BasicBlock *BB : CSEBlocks) + if (DomTreeNode *N = DT->getNode(BB)) { + assert(DT->isReachableFromEntry(N)); + CSEWorkList.push_back(N); + } + + // Sort blocks by domination. This ensures we visit a block after all blocks + // dominating it are visited. 
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); + // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; - ReversePostOrderTraversal<Function *> RPOT(&F); - for (auto BB : RPOT) { - // Traverse CSEBlocks by RPOT order. - if (!CSEBlocks.count(BB)) - continue; - + for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { + assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && + "Worklist not sorted properly!"); + BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; @@ -3384,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - BoUpSLP *SLP, Value *OpValue) { - if (isa<PHINode>(OpValue)) + BoUpSLP *SLP, + const InstructionsState &S) { + if (isa<PHINode>(S.OpValue)) return true; // Initialize the instruction bundle. @@ -3393,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n"); + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (!extendSchedulingRegion(V, OpValue)) + if (!extendSchedulingRegion(V, S)) return false; } @@ -3410,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. 
- DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember - << " was already scheduled\n"); + LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember + << " was already scheduled\n"); ReSchedule = true; } assert(BundleMember->isSchedulingEntity() && @@ -3446,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, initialFillReadyList(ReadyInsts); } - DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " - << BB->getName() << "\n"); + LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " + << BB->getName() << "\n"); calculateDependencies(Bundle, true, SLP); @@ -3465,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, } } if (!Bundle->isReady()) { - cancelScheduling(VL, OpValue); + cancelScheduling(VL, S.OpValue); return false; } return true; @@ -3477,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, return; ScheduleData *Bundle = getScheduleData(OpValue); - DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); + LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && @@ -3508,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { } bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, - Value *OpValue) { - if (getScheduleData(V, isOneOf(OpValue, V))) + const InstructionsState &S) { + if (getScheduleData(V, isOneOf(S, V))) return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); - auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool { + auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) return false; @@ -3522,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, "ScheduleData not in scheduling region"); ScheduleData *SD = allocateScheduleDataChunks(); SD->Inst = I; - SD->init(SchedulingRegionID, OpValue); - ExtraScheduleDataMap[I][OpValue] = SD; + SD->init(SchedulingRegionID, S.OpValue); + ExtraScheduleDataMap[I][S.OpValue] = SD; return true; }; if (CheckSheduleForI(I)) @@ -3533,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(OpValue, I) != I) + if (isOneOf(S, I) != I) CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); - DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; } // Search up and down at the same time, because we don't know if the new @@ -3548,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, BasicBlock::iterator LowerEnd = BB->end(); while (true) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { - DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); return false; } @@ -3556,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(OpValue, I) != I) + if 
(isOneOf(S, I) != I) CheckSheduleForI(I); - DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I + << "\n"); return true; } UpIter++; @@ -3568,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(OpValue, I) != I) + if (isOneOf(S, I) != I) CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); - DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I + << "\n"); return true; } DownIter++; @@ -3635,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, assert(isInSchedulingRegion(BundleMember)); if (!BundleMember->hasValidDependencies()) { - DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember + << "\n"); BundleMember->Dependencies = 0; BundleMember->resetUnscheduledDeps(); @@ -3727,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 // and we can abort this loop at i6. if (DistToSrc >= 2 * MaxMemDepDistance) - break; + break; DistToSrc++; } } @@ -3736,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); - DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); + LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst + << "\n"); } } } @@ -3759,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; - DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); + LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); @@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() { // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. - if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) { + if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && + llvm::all_of(TreeRoot, [](Value *R) { + assert(R->hasOneUse() && "Root should have only one use!"); + return isa<GetElementPtrInst>(R->user_back()); + })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, @@ -4188,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; - DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); // Use the bottom up slp vectorizer to construct chains that start with // store instructions. @@ -4203,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Vectorize trees that end at stores. 
if (!Stores.empty()) { - DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() - << " underlying objects.\n"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); Changed |= vectorizeStoreChains(R); } @@ -4215,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // is primarily intended to catch gather-like idioms ending at // non-consecutive loads. if (!GEPs.empty()) { - DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() - << " underlying objects.\n"); + LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() + << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } } if (Changed) { - R.optimizeGatherSequence(F); - DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); - DEBUG(verifyFunction(F)); + R.optimizeGatherSequence(); + LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); + LLVM_DEBUG(verifyFunction(F)); } return Changed; } -/// \brief Check that the Values in the slice in VL array are still existent in +/// Check that the Values in the slice in VL array are still existent in /// the WeakTrackingVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakTrackingVH @@ -4244,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned VecRegSize) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - unsigned Sz = R.getVectorElementSize(Chain[0]); - unsigned VF = VecRegSize / Sz; + const unsigned ChainLen = Chain.size(); + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + const unsigned Sz = R.getVectorElementSize(Chain[0]); + const unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); + const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; + for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { // Check that a previous iteration of this loop did not delete the Value. 
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) continue; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); ArrayRef<Value *> Operands = Chain.slice(i, VF); R.buildTree(Operands); @@ -4278,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, int Cost = R.getTreeCost(); - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF + << "\n"); if (Cost < -SLPCostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); using namespace ore; @@ -4417,66 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, None, true); + return tryToVectorizeList(VL, R, /*UserCost=*/0, true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - ArrayRef<Value *> BuildVector, - bool AllowReorder, - bool NeedExtraction) { + int UserCost, bool AllowReorder) { if (VL.size() < 2) return false; - DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() - << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " + << VL.size() << ".\n"); - // Check that all of the parts are scalar instructions of the same type. - Instruction *I0 = dyn_cast<Instruction>(VL[0]); - if (!I0) + // Check that all of the parts are scalar instructions of the same type, + // we permit an alternate opcode via InstructionsState. + InstructionsState S = getSameOpcode(VL); + if (!S.getOpcode()) return false; - unsigned Opcode0 = I0->getOpcode(); - + Instruction *I0 = cast<Instruction>(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); if (MaxVF < 2) { - R.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "SmallVF", I0) - << "Cannot SLP vectorize list: vectorization factor " - << "less than 2 is not supported"; - }); - return false; + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) + << "Cannot SLP vectorize list: vectorization factor " + << "less than 2 is not supported"; + }); + return false; } for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) { - // NOTE: the following will give user internal llvm type name, which may not be useful + // NOTE: the following will give user internal llvm type name, which may + // not be useful. 
R.getORE()->emit([&]() { - std::string type_str; - llvm::raw_string_ostream rso(type_str); - Ty->print(rso); - return OptimizationRemarkMissed( - SV_NAME, "UnsupportedType", I0) - << "Cannot SLP vectorize list: type " - << rso.str() + " is unsupported by vectorizer"; - }); - return false; - } - Instruction *Inst = dyn_cast<Instruction>(V); - - if (!Inst) - return false; - if (Inst->getOpcode() != Opcode0) { - R.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "InequableTypes", I0) - << "Cannot SLP vectorize list: not all of the " - << "parts of scalar instructions are of the same type: " - << ore::NV("Instruction1Opcode", I0) << " and " - << ore::NV("Instruction2Opcode", Inst); + std::string type_str; + llvm::raw_string_ostream rso(type_str); + Ty->print(rso); + return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) + << "Cannot SLP vectorize list: type " + << rso.str() + " is unsupported by vectorizer"; }); return false; } @@ -4513,24 +4927,20 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) continue; - DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " - << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " + << "\n"); ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); - ArrayRef<Value *> EmptyArray; - ArrayRef<Value *> BuildVectorSlice; - if (!BuildVector.empty()) - BuildVectorSlice = BuildVector.slice(I, OpsWidth); - - R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice); + R.buildTree(Ops); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); // TODO: check if we can allow reordering for more cases. - if (AllowReorder && R.shouldReorder()) { + if (AllowReorder && Order) { + // TODO: reorder tree nodes without tree rebuilding. // Conceptually, there is nothing actually preventing us from trying to // reorder a larger list. In fact, we do exactly this when vectorizing // reductions. However, at this point, we only expect to get here when // there are exactly two operations. assert(Ops.size() == 2); - assert(BuildVectorSlice.empty()); Value *ReorderedOps[] = {Ops[1], Ops[0]}; R.buildTree(ReorderedOps, None); } @@ -4538,43 +4948,19 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost() - UserCost; CandidateFound = true; MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { - DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", cast<Instruction>(Ops[0])) << "SLP vectorized with cost " << ore::NV("Cost", Cost) << " and with tree size " << ore::NV("TreeSize", R.getTreeSize())); - Value *VectorizedRoot = R.vectorizeTree(); - - // Reconstruct the build vector by extracting the vectorized root. This - // way we handle the case where some elements of the vector are - // undefined. - // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) - if (!BuildVectorSlice.empty()) { - // The insert point is the last build vector instruction. The - // vectorized root will precede it. This guarantees that we get an - // instruction. The vectorized tree could have been constant folded. 
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); - unsigned VecIdx = 0; - for (auto &V : BuildVectorSlice) { - IRBuilder<NoFolder> Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); - Instruction *I = cast<Instruction>(V); - assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I)); - Instruction *Extract = - cast<Instruction>(Builder.CreateExtractElement( - VectorizedRoot, Builder.getInt32(VecIdx++))); - I->setOperand(1, Extract); - I->moveAfter(Extract); - InsertAfter = I; - } - } + R.vectorizeTree(); // Move to the next bundle. I += VF - 1; NextInst = I + 1; @@ -4585,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!Changed && CandidateFound) { R.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "NotBeneficial", I0) - << "List vectorization was possible but not beneficial with cost " - << ore::NV("Cost", MinCost) << " >= " - << ore::NV("Treshold", -SLPCostThreshold); + return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0) + << "List vectorization was possible but not beneficial with cost " + << ore::NV("Cost", MinCost) << " >= " + << ore::NV("Treshold", -SLPCostThreshold); }); } else if (!Changed) { R.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "NotPossible", I0) - << "Cannot SLP vectorize list: vectorization was impossible" - << " with available vectorization factors"; + return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0) + << "Cannot SLP vectorize list: vectorization was impossible" + << " with available vectorization factors"; }); } return Changed; @@ -4645,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { return false; } -/// \brief Generate a shuffle mask to be used in a reduction tree. +/// Generate a shuffle mask to be used in a reduction tree. /// /// \param VecLen The length of the vector to be reduced. /// \param NumEltsToRdx The number of elements that should be reduced in the @@ -5128,6 +5512,77 @@ class HorizontalReduction { return OperationData( Instruction::FCmp, LHS, RHS, RK_Max, cast<Instruction>(Select->getCondition())->hasNoNaNs()); + } else { + // Try harder: look for min/max pattern based on instructions producing + // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). + // During the intermediate stages of SLP, it's very common to have + // pattern like this (since optimizeGatherSequence is run only once + // at the end): + // %1 = extractelement <2 x i32> %a, i32 0 + // %2 = extractelement <2 x i32> %a, i32 1 + // %cond = icmp sgt i32 %1, %2 + // %3 = extractelement <2 x i32> %a, i32 0 + // %4 = extractelement <2 x i32> %a, i32 1 + // %select = select i1 %cond, i32 %3, i32 %4 + CmpInst::Predicate Pred; + Instruction *L1; + Instruction *L2; + + LHS = Select->getTrueValue(); + RHS = Select->getFalseValue(); + Value *Cond = Select->getCondition(); + + // TODO: Support inverse predicates. 
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { + if (!isa<ExtractElementInst>(RHS) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return OperationData(V); + } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { + if (!isa<ExtractElementInst>(LHS) || + !L1->isIdenticalTo(cast<Instruction>(LHS))) + return OperationData(V); + } else { + if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) + return OperationData(V); + if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || + !L1->isIdenticalTo(cast<Instruction>(LHS)) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return OperationData(V); + } + switch (Pred) { + default: + return OperationData(V); + + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); + + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); + + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULT: + case CmpInst::FCMP_ULE: + return OperationData(Instruction::FCmp, LHS, RHS, RK_Min, + cast<Instruction>(Cond)->hasNoNaNs()); + + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); + + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: + return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); + + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGT: + case CmpInst::FCMP_UGE: + return OperationData(Instruction::FCmp, LHS, RHS, RK_Max, + cast<Instruction>(Cond)->hasNoNaNs()); + } } } return OperationData(V); @@ -5136,7 +5591,7 @@ class HorizontalReduction { public: HorizontalReduction() = default; - /// \brief Try to find a reduction tree. + /// Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && "Thi phi needs to use the binary operator"); @@ -5164,6 +5619,8 @@ public: Type *Ty = B->getType(); if (!isValidElementType(Ty)) return false; + if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy()) + return false; ReducedValueData.clear(); ReductionRoot = B; @@ -5262,7 +5719,7 @@ public: return true; } - /// \brief Attempt to vectorize the tree found by + /// Attempt to vectorize the tree found by /// matchAssociativeReduction. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { if (ReducedVals.empty()) @@ -5295,9 +5752,14 @@ public: while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); - if (V.shouldReorder()) { - SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); - V.buildTree(Reversed, ExternallyUsedValues, IgnoreList); + Optional<ArrayRef<unsigned>> Order = V.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == VL.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(VL.size()); + llvm::transform(*Order, ReorderedOps.begin(), + [VL](const unsigned Idx) { return VL[Idx]; }); + V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; @@ -5305,8 +5767,9 @@ public: V.computeMinimumValueSizes(); // Estimate cost. 
- int Cost = - V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth); + int TreeCost = V.getTreeCost(); + int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); + int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { return OptimizationRemarkMissed( @@ -5319,8 +5782,8 @@ public: break; } - DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost - << ". (HorRdx)\n"); + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { return OptimizationRemark( SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0])) @@ -5382,7 +5845,7 @@ public: } private: - /// \brief Calculate the cost of a reduction. + /// Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); @@ -5441,16 +5904,16 @@ private: } ScalarReduxCost *= (ReduxWidth - 1); - DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost - << " for reduction that starts with " << *FirstReducedVal - << " (It is a " - << (IsPairwiseReduction ? "pairwise" : "splitting") - << " reduction)\n"); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost + << " for reduction that starts with " << *FirstReducedVal + << " (It is a " + << (IsPairwiseReduction ? "pairwise" : "splitting") + << " reduction)\n"); return VecReduxCost - ScalarReduxCost; } - /// \brief Emit a horizontal reduction of the vectorized value. + /// Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, unsigned ReduxWidth, const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); @@ -5486,7 +5949,7 @@ private: } // end anonymous namespace -/// \brief Recognize construction of vectors like +/// Recognize construction of vectors like /// %ra = insertelement <4 x float> undef, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 @@ -5495,11 +5958,17 @@ private: /// /// Returns true if it matches static bool findBuildVector(InsertElementInst *LastInsertElem, - SmallVectorImpl<Value *> &BuildVector, - SmallVectorImpl<Value *> &BuildVectorOpds) { + TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + int &UserCost) { + UserCost = 0; Value *V = nullptr; do { - BuildVector.push_back(LastInsertElem); + if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + LastInsertElem->getType(), + CI->getZExtValue()); + } BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); V = LastInsertElem->getOperand(0); if (isa<UndefValue>(V)) @@ -5508,20 +5977,17 @@ static bool findBuildVector(InsertElementInst *LastInsertElem, if (!LastInsertElem || !LastInsertElem->hasOneUse()) return false; } while (true); - std::reverse(BuildVector.begin(), BuildVector.end()); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } -/// \brief Like findBuildVector, but looks for construction of aggregate. +/// Like findBuildVector, but looks for construction of aggregate. /// /// \return true if it matches. 
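/// For example (hypothetical IR), for the chain
///   %a0 = insertvalue {float, float} undef, float %s0, 0
///   %a1 = insertvalue {float, float} %a0, float %s1, 1
/// BuildVectorOpds is filled with {%s0, %s1}, in insertion order.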
static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V;
do {
- BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
break;
IV = dyn_cast<InsertValueInst>(V);
@@ -5530,7 +5996,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
if (!IV || !IV->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -5539,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
-/// \brief Try and get a reduction value from a phi node.
+/// Try to get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
@@ -5552,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
- return (
- dyn_cast<Instruction>(R) &&
- DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
@@ -5624,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
- SmallSet<Value *, 8> VisitedInstrs;
+ SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Value *V;
@@ -5706,27 +6170,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType(), DL))
return false;
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
- DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// An aggregate value is unlikely to be processed in a vector register, so we
// need to extract the scalars into scalar registers.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVector;
+ int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
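// The UserCost computed by findBuildVector models the constant-index
// insertelement users that vectorization makes redundant; e.g. (hypothetical
// IR) for
//   %v0 = insertelement <2 x float> undef, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0, float %b, i32 1
// it is the summed TTI cost of the two insertelements.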
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, @@ -5763,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; - SmallSet<Value *, 16> VisitedInstrs; + SmallPtrSet<Value *, 16> VisitedInstrs; bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -5798,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); - DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" + << NumElts << ")\n"); // The order in which the phi nodes appear in the program does not matter. // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, - None, AllowReorder)) { + /*UserCost=*/0, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; @@ -5885,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (isa<InsertElementInst>(it) || isa<CmpInst>(it) || isa<InsertValueInst>(it)) PostProcessInstructions.push_back(&*it); - } return Changed; @@ -5899,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { if (Entry.second.size() < 2) continue; - DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " - << Entry.second.size() << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " + << Entry.second.size() << ".\n"); // We process the getelementptr list in chunks of 16 (like we do for // stores) to minimize compile-time. @@ -5982,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (it->second.size() < 2) continue; - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " - << it->second.size() << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " + << it->second.size() << ".\n"); // Process the stores in chunks of 16. // TODO: The limit of 16 inhibits greater vectorization factors. // For example, AVX2 supports v32i8. Increasing this limit, however, // may cause a significant compile-time increase. - for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { + for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { unsigned Len = std::min<unsigned>(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); } diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h new file mode 100644 index 000000000000..f43a8bb123b1 --- /dev/null +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -0,0 +1,131 @@ +//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetTransformInfo;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+public:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
+ /// \return true in the first returned value if so and false otherwise.
+ /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an IG
+ /// for \p Range.Start, and provide it as the second returned value.
+ /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+ /// \return value is <true, nullptr>, as it is handled by another recipe.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked. Such instructions are handled by a recipe that takes
+ /// an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer performs full
+ /// if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range.
If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+ /// false otherwise. Range.End may be decreased to ensure same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, it is added to \p VPBB.
+ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
+ VPBasicBlock *VPBB);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 4e54fc6db2a5..f7b07b722bb1 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -116,7 +116,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PrevBB = CFG.PrevBB;
BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
PrevBB->getParent(), CFG.LastBB);
- DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -125,7 +125,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
- DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
@@ -175,8 +175,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
}
// 2. Fill the IR basic block with IR instructions.
- DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() - << " in BB:" << NewBB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() + << " in BB:" << NewBB->getName() << '\n'); State->CFG.VPBB2IRBB[this] = NewBB; State->CFG.PrevVPBB = this; @@ -184,7 +184,7 @@ void VPBasicBlock::execute(VPTransformState *State) { for (VPRecipeBase &Recipe : Recipes) Recipe.execute(*State); - DEBUG(dbgs() << "LV: filled BB:" << *NewBB); + LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } void VPRegionBlock::execute(VPTransformState *State) { @@ -193,7 +193,7 @@ void VPRegionBlock::execute(VPTransformState *State) { if (!isReplicator()) { // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { - DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } return; @@ -210,7 +210,7 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { - DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } } @@ -220,6 +220,15 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance.reset(); } +void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + Parent = InsertPos->getParent(); + Parent->getRecipeList().insert(InsertPos->getIterator(), this); +} + +iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + return getParent()->getRecipeList().erase(getIterator()); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -356,7 +365,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, "One successor of a basic block does not lead to the other."); assert(InterimSucc->getSinglePredecessor() && "Interim successor has more than one predecessor."); - assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 && + assert(pred_size(PostDomSucc) == 2 && "PostDom successor has more than two predecessors."); DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); @@ -448,6 +457,18 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { bumpIndent(1); for (const VPRecipeBase &Recipe : *BasicBlock) Recipe.print(OS, Indent); + + // Dump the condition bit. + const VPValue *CBV = BasicBlock->getCondBit(); + if (CBV) { + OS << " +\n" << Indent << " \"CondBit: "; + if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) { + CBI->printAsOperand(OS); + OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; + } else + CBV->printAsOperand(OS); + } + bumpIndent(-2); OS << "\n" << Indent << "]\n"; dumpEdges(BasicBlock); diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 2ccabfd6af25..866951cb79a4 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -30,6 +30,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -42,15 +43,10 @@ #include <map> #include <string> -// The (re)use of existing LoopVectorize classes is subject to future VPlan -// refactoring. 
-namespace { -class LoopVectorizationLegality; -class LoopVectorizationCostModel; -} // namespace - namespace llvm { +class LoopVectorizationLegality; +class LoopVectorizationCostModel; class BasicBlock; class DominatorTree; class InnerLoopVectorizer; @@ -60,6 +56,20 @@ class raw_ostream; class Value; class VPBasicBlock; class VPRegionBlock; +class VPlan; + +/// A range of powers-of-2 vectorization factors with fixed start and +/// adjustable end. The range includes start and excludes end, e.g.,: +/// [1, 9) = {1, 2, 4, 8} +struct VFRange { + // A power of 2. + const unsigned Start; + + // Need not be a power of 2. If End <= Start range is empty. + unsigned End; +}; + +using VPlanPtr = std::unique_ptr<VPlan>; /// In what follows, the term "input IR" refers to code that is fed into the /// vectorizer whereas the term "output IR" refers to code that is generated by @@ -311,6 +321,8 @@ struct VPTransformState { /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. class VPBlockBase { + friend class VPBlockUtils; + private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -327,6 +339,9 @@ private: /// List of successor blocks. SmallVector<VPBlockBase *, 1> Successors; + /// Successor selector, null for zero or single successor blocks. + VPValue *CondBit = nullptr; + /// Add \p Successor as the last successor to this block. void appendSuccessor(VPBlockBase *Successor) { assert(Successor && "Cannot add nullptr successor!"); @@ -377,6 +392,7 @@ public: /// for any other purpose, as the values may change as LLVM evolves. unsigned getVPBlockID() const { return SubclassID; } + VPRegionBlock *getParent() { return Parent; } const VPRegionBlock *getParent() const { return Parent; } void setParent(VPRegionBlock *P) { Parent = P; } @@ -411,6 +427,9 @@ public: return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr); } + size_t getNumSuccessors() const { return Successors.size(); } + size_t getNumPredecessors() const { return Predecessors.size(); } + /// An Enclosing Block of a block B is any block containing B, including B /// itself. \return the closest enclosing block starting from "this", which /// has successors. \return the root enclosing block if all enclosing blocks @@ -454,34 +473,41 @@ public: return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); } - /// Sets a given VPBlockBase \p Successor as the single successor and \return - /// \p Successor. The parent of this Block is copied to be the parent of - /// \p Successor. - VPBlockBase *setOneSuccessor(VPBlockBase *Successor) { + /// \return the condition bit selecting the successor. + VPValue *getCondBit() { return CondBit; } + + const VPValue *getCondBit() const { return CondBit; } + + void setCondBit(VPValue *CV) { CondBit = CV; } + + /// Set a given VPBlockBase \p Successor as the single successor of this + /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. + /// This VPBlockBase must have no successors. + void setOneSuccessor(VPBlockBase *Successor) { assert(Successors.empty() && "Setting one successor when others exist."); appendSuccessor(Successor); - Successor->appendPredecessor(this); - Successor->Parent = Parent; - return Successor; } - /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two - /// successors. The parent of this Block is copied to be the parent of both - /// \p IfTrue and \p IfFalse. 
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
appendSuccessor(IfTrue);
appendSuccessor(IfFalse);
- IfTrue->appendPredecessor(this);
- IfFalse->appendPredecessor(this);
- IfTrue->Parent = Parent;
- IfFalse->Parent = Parent;
}
- void disconnectSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Successor to disconnect is null.");
- removeSuccessor(Successor);
- Successor->removePredecessor(this);
+ /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBasicBlock in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
}
/// The method which generates the output IR that corresponds to this
@@ -539,6 +565,15 @@ public:
/// Each recipe prints itself.
virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
};
/// This is a concrete Recipe that models a single VPlan-level instruction.
@@ -546,6 +581,8 @@ public:
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPUser, public VPRecipeBase {
+ friend class VPlanHCFGTransforms;
+
public:
/// VPlan opcodes, extending LLVM IR with idiomatic instructions.
enum { Not = Instruction::OtherOpsEnd + 1 };
@@ -559,10 +596,13 @@ private:
void generateInstruction(VPTransformState &State, unsigned Part);
public:
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPUser(VPValue::VPInstructionSC, Operands),
VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPInstructionSC;
@@ -907,7 +947,10 @@ public:
inline const VPRecipeBase &back() const { return Recipes.back(); }
inline VPRecipeBase &back() { return Recipes.back(); }
- /// \brief Returns a pointer to a member of the recipe list.
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) { return &VPBasicBlock::Recipes; } @@ -968,6 +1011,9 @@ public: Entry->setParent(this); Exit->setParent(this); } + VPRegionBlock(const std::string &Name = "", bool IsReplicator = false) + : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr), + IsReplicator(IsReplicator) {} ~VPRegionBlock() override { if (Entry) @@ -982,9 +1028,27 @@ public: const VPBlockBase *getEntry() const { return Entry; } VPBlockBase *getEntry() { return Entry; } + /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p + /// EntryBlock must have no predecessors. + void setEntry(VPBlockBase *EntryBlock) { + assert(EntryBlock->getPredecessors().empty() && + "Entry block cannot have predecessors."); + Entry = EntryBlock; + EntryBlock->setParent(this); + } + const VPBlockBase *getExit() const { return Exit; } VPBlockBase *getExit() { return Exit; } + /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p + /// ExitBlock must have no successors. + void setExit(VPBlockBase *ExitBlock) { + assert(ExitBlock->getSuccessors().empty() && + "Exit block cannot have successors."); + Exit = ExitBlock; + ExitBlock->setParent(this); + } + /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. bool isReplicator() const { return IsReplicator; } @@ -1012,6 +1076,13 @@ private: /// Holds the name of the VPlan, for printing. std::string Name; + /// Holds all the external definitions created for this VPlan. + // TODO: Introduce a specific representation for external definitions in + // VPlan. External definitions must be immutable and hold a pointer to its + // underlying IR that will be used to implement its structural comparison + // (operators '==' and '<'). + SmallPtrSet<VPValue *, 16> VPExternalDefs; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -1024,6 +1095,8 @@ public: VPBlockBase::deleteCFG(Entry); for (auto &MapEntry : Value2VPValue) delete MapEntry.second; + for (VPValue *Def : VPExternalDefs) + delete Def; } /// Generate the IR code for this VPlan. @@ -1042,6 +1115,12 @@ public: void setName(const Twine &newName) { Name = newName.str(); } + /// Add \p VPVal to the pool of external definitions if it's not already + /// in the pool. + void addExternalDef(VPValue *VPVal) { + VPExternalDefs.insert(VPVal); + } + void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); @@ -1189,6 +1268,72 @@ template <> struct GraphTraits<Inverse<VPBlockBase *>> { } }; +//===----------------------------------------------------------------------===// +// VPlan Utilities +//===----------------------------------------------------------------------===// + +/// Class that provides utilities for VPBlockBases in VPlan. +class VPBlockUtils { +public: + VPBlockUtils() = delete; + + /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p + /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr + /// has more than one successor, its conditional bit is propagated to \p + /// NewBlock. \p NewBlock must have neither successors nor predecessors. 
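+ /// For example (hypothetical usage), if region R holds a lone block A,
+ /// insertBlockAfter(B, A) links A -> B and makes R the parent of B.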
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { + assert(NewBlock->getSuccessors().empty() && + "Can't insert new block with successors."); + // TODO: move successors from BlockPtr to NewBlock when this functionality + // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr + // already has successors. + BlockPtr->setOneSuccessor(NewBlock); + NewBlock->setPredecessors({BlockPtr}); + NewBlock->setParent(BlockPtr->getParent()); + } + + /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p + /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p + /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr + /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor + /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse + /// must have neither successors nor predecessors. + static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, + VPValue *Condition, VPBlockBase *BlockPtr) { + assert(IfTrue->getSuccessors().empty() && + "Can't insert IfTrue with successors."); + assert(IfFalse->getSuccessors().empty() && + "Can't insert IfFalse with successors."); + BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition); + IfTrue->setPredecessors({BlockPtr}); + IfFalse->setPredecessors({BlockPtr}); + IfTrue->setParent(BlockPtr->getParent()); + IfFalse->setParent(BlockPtr->getParent()); + } + + /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to + /// the successors of \p From and \p From to the predecessors of \p To. Both + /// VPBlockBases must have the same parent, which can be null. Both + /// VPBlockBases can be already connected to other VPBlockBases. + static void connectBlocks(VPBlockBase *From, VPBlockBase *To) { + assert((From->getParent() == To->getParent()) && + "Can't connect two block with different parents"); + assert(From->getNumSuccessors() < 2 && + "Blocks can't have more than two successors."); + From->appendSuccessor(To); + To->appendPredecessor(From); + } + + /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To + /// from the successors of \p From and \p From from the predecessors of \p To. + static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) { + assert(To && "Successor to disconnect is null."); + From->removeSuccessor(To); + To->removePredecessor(From); + } +}; + } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h deleted file mode 100644 index d6eb3397d044..000000000000 --- a/lib/Transforms/Vectorize/VPlanBuilder.h +++ /dev/null @@ -1,61 +0,0 @@ -//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file provides a VPlan-based builder utility analogous to IRBuilder. -/// It provides an instruction-level API for generating VPInstructions while -/// abstracting away the Recipe manipulation details. 
-//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H - -#include "VPlan.h" - -namespace llvm { - -class VPBuilder { -private: - VPBasicBlock *BB = nullptr; - VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); - - VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); - BB->insert(Instr, InsertPt); - return Instr; - } - -public: - VPBuilder() {} - - /// \brief This specifies that created VPInstructions should be appended to - /// the end of the specified block. - void setInsertPoint(VPBasicBlock *TheBB) { - assert(TheBB && "Attempting to set a null insert point"); - BB = TheBB; - InsertPt = BB->end(); - } - - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); - } - - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); - } - - VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); - } -}; - -} // namespace llvm - -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp new file mode 100644 index 000000000000..08129b74cddf --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -0,0 +1,336 @@ +//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the construction of a VPlan-based Hierarchical CFG +/// (H-CFG) for an incoming IR. This construction comprises the following +/// components and steps: +// +/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that +/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top +/// Region) is created to enclose and serve as parent of all the VPBasicBlocks +/// in the plain CFG. +/// NOTE: At this point, there is a direct correspondence between all the +/// VPBasicBlocks created for the initial plain CFG and the incoming +/// BasicBlocks. However, this might change in the future. +/// +//===----------------------------------------------------------------------===// + +#include "VPlanHCFGBuilder.h" +#include "LoopVectorizationPlanner.h" +#include "llvm/Analysis/LoopIterator.h" + +#define DEBUG_TYPE "loop-vectorize" + +using namespace llvm; + +namespace { +// Class that is used to build the plain CFG for the incoming IR. +class PlainCFGBuilder { +private: + // The outermost loop of the input loop nest considered for vectorization. + Loop *TheLoop; + + // Loop Info analysis. + LoopInfo *LI; + + // Vectorization plan that we are working on. + VPlan &Plan; + + // Output Top Region. + VPRegionBlock *TopRegion = nullptr; + + // Builder of the VPlan instruction-level representation. + VPBuilder VPIRBuilder; + + // NOTE: The following maps are intentionally destroyed after the plain CFG + // construction because subsequent VPlan-to-VPlan transformation may + // invalidate them. + // Map incoming BasicBlocks to their newly-created VPBasicBlocks. 
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Hold phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+ bool isExternalDef(Value *Val);
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) the outermost loop PH, and c)
+// the outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that the operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node's operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+ // Build a VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2.
Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+ // predecessors must be set in the same order as they are in the incoming IR.
+ // Otherwise, there might be problems with existing phi nodes and algorithms
+ // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+ // Preheader's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+ // exist already. Recipes will be created when the successor is visited
+ // during the RPO traversal.
+ TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process outermost loop exit. We created an empty VPBB for the loop's
+ // single exit BB during the RPO traversal of the loop body but Instructions
+ // weren't visited because it's not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+ // have a VPlan counterpart.
Fix VPlan phi nodes by adding their corresponding + // VPlan operands. + fixPhiNodes(); + + // 5. Final Top Region setup. Set outermost loop pre-header and single exit as + // Top Region entry and exit. + TopRegion->setEntry(PreheaderVPBB); + TopRegion->setExit(LoopExitVPBB); + return TopRegion; +} + +// Public interface to build a H-CFG. +void VPlanHCFGBuilder::buildHierarchicalCFG(VPlan &Plan) { + // Build Top Region enclosing the plain CFG and set it as VPlan entry. + PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan); + VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG(); + Plan.setEntry(TopRegion); + LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan); + + Verifier.verifyHierarchicalCFG(TopRegion); +} diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h new file mode 100644 index 000000000000..c4e69843615a --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -0,0 +1,55 @@ +//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the VPlanHCFGBuilder class which contains the public +/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG +/// (H-CFG) for an incoming IR. +/// +/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks +/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan +/// consists of a VPRegionBlock, denoted Top Region, which encloses any other +/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG +/// other than the Top Region will have a parent VPRegionBlock and allows us +/// to easily add more nodes before/after the main vector loop (such as the +/// reduction epilogue). +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H + +#include "VPlan.h" +#include "VPlanVerifier.h" + +namespace llvm { + +class Loop; + +/// Main class to build the VPlan H-CFG for an incoming IR. +class VPlanHCFGBuilder { +private: + // The outermost loop of the input loop nest considered for vectorization. + Loop *TheLoop; + + // Loop Info analysis. + LoopInfo *LI; + + // VPlan verifier utility. + VPlanVerifier Verifier; + +public: + VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI) : TheLoop(Lp), LI(LI) {} + + /// Build H-CFG for TheLoop and update \p Plan accordingly. + void buildHierarchicalCFG(VPlan &Plan); +}; +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp new file mode 100644 index 000000000000..e3cbab077e61 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -0,0 +1,73 @@ +//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements a set of utility VPlan to VPlan transformations. 
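+/// Currently the only transform is VPInstructionsToVPRecipes, which lowers
+/// the VPInstructions of a freshly built plain H-CFG into the widened recipe
+/// forms used for code generation.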
+/// +//===----------------------------------------------------------------------===// + +#include "VPlanHCFGTransforms.h" +#include "llvm/ADT/PostOrderIterator.h" + +using namespace llvm; + +void VPlanHCFGTransforms::VPInstructionsToVPRecipes( + VPlanPtr &Plan, + LoopVectorizationLegality::InductionList *Inductions, + SmallPtrSetImpl<Instruction *> &DeadInstructions) { + + VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry()); + ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry()); + for (VPBlockBase *Base : RPOT) { + // Do not widen instructions in pre-header and exit blocks. + if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0) + continue; + + VPBasicBlock *VPBB = Base->getEntryBasicBlock(); + VPRecipeBase *LastRecipe = nullptr; + // Introduce each ingredient into VPlan. + for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) { + VPRecipeBase *Ingredient = &*I++; + // Can only handle VPInstructions. + VPInstruction *VPInst = cast<VPInstruction>(Ingredient); + Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); + if (DeadInstructions.count(Inst)) { + Ingredient->eraseFromParent(); + continue; + } + + VPRecipeBase *NewRecipe = nullptr; + // Create VPWidenMemoryInstructionRecipe for loads and stores. + if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) + NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { + InductionDescriptor II = Inductions->lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) { + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); + } else + NewRecipe = new VPWidenPHIRecipe(Phi); + } else { + // If the last recipe is a VPWidenRecipe, add Inst to it instead of + // creating a new recipe. + if (VPWidenRecipe *WidenRecipe = + dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) { + WidenRecipe->appendInstruction(Inst); + Ingredient->eraseFromParent(); + continue; + } + NewRecipe = new VPWidenRecipe(Inst); + } + + NewRecipe->insertBefore(Ingredient); + LastRecipe = NewRecipe; + Ingredient->eraseFromParent(); + } + } +} diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h new file mode 100644 index 000000000000..ae549c6871b3 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h @@ -0,0 +1,36 @@ +//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides utility VPlan to VPlan transformations. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H + +#include "VPlan.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" + +namespace llvm { + +class VPlanHCFGTransforms { + +public: + /// Replaces the VPInstructions in \p Plan with corresponding + /// widen recipes. 
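+ /// For example, loads and stores become VPWidenMemoryInstructionRecipes,
+ /// integer and FP induction phis become VPWidenIntOrFpInductionRecipes,
+ /// and instructions found in \p DeadInstructions are erased.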
+ static void VPInstructionsToVPRecipes( + VPlanPtr &Plan, + LoopVectorizationLegality::InductionList *Inductions, + SmallPtrSetImpl<Instruction *> &DeadInstructions); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h index 50966891e0eb..08f142915b49 100644 --- a/lib/Transforms/Vectorize/VPlanValue.h +++ b/lib/Transforms/Vectorize/VPlanValue.h @@ -37,13 +37,34 @@ class VPUser; // coming from the input IR, instructions which VPlan will generate if executed // and live-outs which the VPlan will need to fix accordingly. class VPValue { + friend class VPBuilder; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). SmallVector<VPUser *, 1> Users; protected: - VPValue(const unsigned char SC) : SubclassID(SC) {} + // Hold the underlying Value, if any, attached to this VPValue. + Value *UnderlyingVal; + + VPValue(const unsigned char SC, Value *UV = nullptr) + : SubclassID(SC), UnderlyingVal(UV) {} + + // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to + // the front-end and back-end of VPlan so that the middle-end is as + // independent as possible of the underlying IR. We grant access to the + // underlying IR using friendship. In that way, we should be able to use VPlan + // for multiple underlying IRs (Polly?) by providing a new VPlan front-end, + // back-end and analysis information for the new IR. + + /// Return the underlying Value attached to this VPValue. + Value *getUnderlyingValue() { return UnderlyingVal; } + + // Set \p Val as the underlying Value of this VPValue. + void setUnderlyingValue(Value *Val) { + assert(!UnderlyingVal && "Underlying Value is already set."); + UnderlyingVal = Val; + } public: /// An enumeration for keeping track of the concrete subclass of VPValue that @@ -52,7 +73,7 @@ public: /// type identification. enum { VPValueSC, VPUserSC, VPInstructionSC }; - VPValue() : SubclassID(VPValueSC) {} + VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; @@ -94,11 +115,6 @@ class VPUser : public VPValue { private: SmallVector<VPValue *, 2> Operands; - void addOperand(VPValue *Operand) { - Operands.push_back(Operand); - Operand->addUser(*this); - } - protected: VPUser(const unsigned char SC) : VPValue(SC) {} VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) { @@ -120,6 +136,11 @@ public: V->getVPValueID() <= VPInstructionSC; } + void addOperand(VPValue *Operand) { + Operands.push_back(Operand); + Operand->addUser(*this); + } + unsigned getNumOperands() const { return Operands.size(); } inline VPValue *getOperand(unsigned N) const { assert(N < Operands.size() && "Operand index out of bounds"); diff --git a/lib/Transforms/Vectorize/VPlanVerifier.cpp b/lib/Transforms/Vectorize/VPlanVerifier.cpp new file mode 100644 index 000000000000..054bed4e177f --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -0,0 +1,133 @@ +//===-- VPlanVerifier.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+                                        cl::Hidden,
+                                        cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+  SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+  for (const auto *Block : VPBlockVec) {
+    if (VPBlockSet.count(Block))
+      return true;
+    VPBlockSet.insert(Block);
+  }
+  return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. The checks in this function are generic for VPBlockBases; they
+/// are not specific to VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+  for (const VPBlockBase *VPB :
+       make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+                  df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+    // Check block's parent.
+    assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+    // Check block's condition bit.
+    if (VPB->getNumSuccessors() > 1)
+      assert(VPB->getCondBit() && "Missing condition bit!");
+    else
+      assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+    // Check block's successors.
+    const auto &Successors = VPB->getSuccessors();
+    // There must be only one instance of a successor in the block's successor
+    // list.
+    // TODO: This won't work for switch statements.
+    assert(!hasDuplicates(Successors) &&
+           "Multiple instances of the same successor.");
+
+    for (const VPBlockBase *Succ : Successors) {
+      // There must be a bi-directional link between block and successor.
+      const auto &SuccPreds = Succ->getPredecessors();
+      assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
+                 SuccPreds.end() &&
+             "Missing predecessor link.");
+      (void)SuccPreds;
+    }
+
+    // Check block's predecessors.
+    const auto &Predecessors = VPB->getPredecessors();
+    // There must be only one instance of a predecessor in the block's
+    // predecessor list.
+    // TODO: This won't work for switch statements.
+    assert(!hasDuplicates(Predecessors) &&
+           "Multiple instances of the same predecessor.");
+
+    for (const VPBlockBase *Pred : Predecessors) {
+      // Block and predecessor must be inside the same region.
+      assert(Pred->getParent() == VPB->getParent() &&
+             "Predecessor is not in the same region.");
+
+      // There must be a bi-directional link between block and predecessor.
+      const auto &PredSuccs = Pred->getSuccessors();
+      assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
+                 PredSuccs.end() &&
+             "Missing successor link.");
+      (void)PredSuccs;
+    }
+  }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+  const VPBlockBase *Entry = Region->getEntry();
+  const VPBlockBase *Exit = Region->getExit();
+
+  // Entry and Exit shouldn't have any predecessor/successor, respectively.
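+  // A VPRegionBlock models a single-entry single-exit subgraph, so an edge
+  // into Entry or out of Exit could only come from outside the region.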
+  assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+  assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+  (void)Entry;
+  (void)Exit;
+
+  verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+  verifyRegion(Region);
+
+  // Recurse inside nested regions.
+  for (const VPBlockBase *VPB :
+       make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+                  df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+    if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+      verifyRegionRec(SubRegion);
+  }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+    const VPRegionBlock *TopRegion) const {
+  if (!EnableHCFGVerifier)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+  assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+  verifyRegionRec(TopRegion);
+}
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.h b/lib/Transforms/Vectorize/VPlanVerifier.h
new file mode 100644
index 000000000000..d2f99d006a66
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -0,0 +1,44 @@
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+///   - Region's entry/exit block must have no predecessors/successors,
+///     respectively.
+///   - Block's parent must be the region immediately containing the block.
+///   - Linked blocks must have a bi-directional link (successor/predecessor).
+///   - All predecessors/successors of a block must belong to the same region.
+///   - Blocks must have no duplicate successors/predecessors.
+///
+//===----------------------------------------------------------------------===//

+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+/// Class with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+class VPlanVerifier {
+public:
+  /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+  /// verification process comprises the following steps:
+  /// 1. Region/Block verification: Check the Region/Block invariants for
+  ///    every region in the H-CFG.
+  void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index b04905bfc6fa..f62a88558328 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -34,10 +34,6 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
   initializeVectorization(*unwrap(R));
 }

-// DEPRECATED: Remove after the LLVM 5 release.
-void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
-}
-
 void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopVectorizePass());
 }
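The bi-directional link invariant that the new verifier enforces is easiest to see in isolation. The standalone sketch below models the same successor/predecessor check on a toy graph; `Block`, `link`, and `verifyLinks` are hypothetical names for illustration only and are not part of the VPlan API:

// Simplified, self-contained model of the bi-directional link invariant
// asserted by VPlanVerifier::verifyBlocksInRegion. All names are hypothetical.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<Block *> Successors;
  std::vector<Block *> Predecessors;
};

// Create an edge by updating both endpoints, so the invariant holds by
// construction.
static void link(Block &From, Block &To) {
  From.Successors.push_back(&To);
  To.Predecessors.push_back(&From);
}

// Mirror of the patch's core checks: every successor must list the block as a
// predecessor, and every predecessor must list it as a successor.
static void verifyLinks(const std::vector<Block *> &Blocks) {
  for (const Block *B : Blocks) {
    for (const Block *Succ : B->Successors)
      assert(std::find(Succ->Predecessors.begin(), Succ->Predecessors.end(),
                       B) != Succ->Predecessors.end() &&
             "Missing predecessor link.");
    for (const Block *Pred : B->Predecessors)
      assert(std::find(Pred->Successors.begin(), Pred->Successors.end(), B) !=
                 Pred->Successors.end() &&
             "Missing successor link.");
  }
}

int main() {
  Block Entry{"entry"}, Body{"body"}, Exit{"exit"};
  link(Entry, Body);
  link(Body, Exit);
  verifyLinks({&Entry, &Body, &Exit});
  return 0;
}

Centralizing edge creation in a helper that updates both endpoints is what keeps the invariant true by construction; the verifier's job is then to catch transformations that edit only one side of a link.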