author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
commit | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree | 599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Transforms/Vectorize
parent | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Transforms/Vectorize')
21 files changed, 1856 insertions, 873 deletions
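The bulk of the LoopVectorizationLegality.cpp changes in the diff below replace paired `LLVM_DEBUG(...)` prints and `ORE->emit(createMissedAnalysis(...))` remarks with one `reportVectorizationFailure(DebugMsg, OREMsg, ORETag, I)` helper. A minimal standalone sketch of that consolidation pattern follows; the `Remark` struct and `reportFailure()` here are illustrative stand-ins compiled without LLVM, not the library's actual classes:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for an optimization remark as emitted via
// -Rpass-missed=loop-vectorize.
struct Remark {
  std::string Tag; // e.g. "CFGNotUnderstood"
  std::string Msg; // user-facing message
};

static std::vector<Remark> EmittedRemarks;

// One call site now produces both the -debug-only=loop-vectorize line and
// the optimization remark, mirroring reportVectorizationFailure() in the
// diff below.
static void reportFailure(const std::string &DebugMsg,
                          const std::string &OREMsg,
                          const std::string &ORETag) {
  std::cerr << "LV: Not vectorizing: " << DebugMsg << ".\n"; // debug print
  EmittedRemarks.push_back({ORETag, OREMsg});                // remark
}

int main() {
  reportFailure("Loop doesn't have a legal pre-header",
                "loop control flow is not understood by vectorizer",
                "CFGNotUnderstood");
  std::cout << "remarks emitted: " << EmittedRemarks.size() << '\n';
}
```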
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9ff18328c219..4273080ddd91 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,9 +1,8 @@
 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -927,7 +926,7 @@ bool Vectorizer::vectorizeStoreChain(
   StoreInst *S0 = cast<StoreInst>(Chain[0]);
 
   // If the vector has an int element, default to int for the whole store.
-  Type *StoreTy;
+  Type *StoreTy = nullptr;
   for (Instruction *I : Chain) {
     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
     if (StoreTy->isIntOrIntVectorTy())
@@ -939,6 +938,7 @@ bool Vectorizer::vectorizeStoreChain(
       break;
     }
   }
+  assert(StoreTy && "Failed to find store type");
 
   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
   unsigned AS = S0->getPointerAddressSpace();
@@ -1152,13 +1152,8 @@ bool Vectorizer::vectorizeLoadChain(
            vectorizeLoadChain(Chains.second, InstructionsProcessed);
   }
 
-    unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
-                                                   StackAdjustedAlignment,
-                                                   DL, L0, nullptr, &DT);
-    if (NewAlign != 0)
-      Alignment = NewAlign;
-
-    Alignment = NewAlign;
+    Alignment = getOrEnforceKnownAlignment(
+        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
   }
 
   if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
@@ -1182,7 +1177,7 @@ bool Vectorizer::vectorizeLoadChain(
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
+  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
   propagateMetadata(LI, Chain);
 
   if (VecLoadTy) {
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index b44fe5a52a2f..6ef8dc2d3cd7 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1,9 +1,8 @@
 //===- LoopVectorizationLegality.cpp --------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -23,6 +22,8 @@ using namespace llvm;
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
 
+extern cl::opt<bool> EnableVPlanPredication;
+
 static cl::opt<bool>
     EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                        cl::desc("Enable if-conversion during vectorization."));
@@ -46,6 +47,18 @@ static const unsigned MaxInterleaveFactor = 16;
 
 namespace llvm {
 
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+                                      Instruction *I) {
+  dbgs() << "LV: Not vectorizing: " << DebugMsg;
+  if (I != nullptr)
+    dbgs() << " " << *I;
+  else
+    dbgs() << '.';
+  dbgs() << '\n';
+}
+#endif
+
 OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
@@ -103,6 +116,25 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
               << "LV: Interleaving disabled by the pass manager\n");
 }
 
+void LoopVectorizeHints::setAlreadyVectorized() {
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+  MDNode *IsVectorizedMD = MDNode::get(
+      Context,
+      {MDString::get(Context, "llvm.loop.isvectorized"),
+       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+  MDNode *LoopID = TheLoop->getLoopID();
+  MDNode *NewLoopID =
+      makePostTransformationMetadata(Context, LoopID,
+                                     {Twine(Prefix(), "vectorize.").str(),
+                                      Twine(Prefix(), "interleave.").str()},
+                                     {IsVectorizedMD});
+  TheLoop->setLoopID(NewLoopID);
+
+  // Update internal cache.
+  IsVectorized.Value = 1;
+}
+
 bool LoopVectorizeHints::allowVectorization(
     Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
   if (getForce() == LoopVectorizeHints::FK_Disabled) {
@@ -230,57 +262,6 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
   }
 }
 
-MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name,
-                                               unsigned V) const {
-  LLVMContext &Context = TheLoop->getHeader()->getContext();
-  Metadata *MDs[] = {
-      MDString::get(Context, Name),
-      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
-  return MDNode::get(Context, MDs);
-}
-
-bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node,
-                                                 ArrayRef<Hint> HintTypes) {
-  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
-  if (!Name)
-    return false;
-
-  for (auto H : HintTypes)
-    if (Name->getString().endswith(H.Name))
-      return true;
-  return false;
-}
-
-void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
-  if (HintTypes.empty())
-    return;
-
-  // Reserve the first element to LoopID (see below).
-  SmallVector<Metadata *, 4> MDs(1);
-  // If the loop already has metadata, then ignore the existing operands.
-  MDNode *LoopID = TheLoop->getLoopID();
-  if (LoopID) {
-    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
-      MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
-      // If node in update list, ignore old value.
-      if (!matchesHintMetadataName(Node, HintTypes))
-        MDs.push_back(Node);
-    }
-  }
-
-  // Now, add the missing hints.
-  for (auto H : HintTypes)
-    MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
-  // Replace current metadata node with new one.
-  LLVMContext &Context = TheLoop->getHeader()->getContext();
-  MDNode *NewLoopID = MDNode::get(Context, MDs);
-  // Set operand 0 to refer to the loop id itself.
-  NewLoopID->replaceOperandWith(0, NewLoopID);
-
-  TheLoop->setLoopID(NewLoopID);
-}
-
 bool LoopVectorizationRequirements::doesNotMeet(
     Function *F, Loop *L, const LoopVectorizeHints &Hints) {
   const char *PassName = Hints.vectorizeAnalysisPassName();
@@ -464,6 +445,14 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
   return LAI->isUniform(V);
 }
 
+void LoopVectorizationLegality::reportVectorizationFailure(
+    const StringRef DebugMsg, const StringRef OREMsg,
+    const StringRef ORETag, Instruction *I) const {
+  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+  ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+                                   ORETag, TheLoop, I) << OREMsg);
+}
+
 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
   assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
   // Store the result and return it at the end instead of exiting early, in case
@@ -476,9 +465,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
     // not supported yet.
     auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
     if (!Br) {
-      LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
-      ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-                << "loop control flow is not understood by vectorizer");
+      reportVectorizationFailure("Unsupported basic block terminator",
+          "loop control flow is not understood by vectorizer",
+          "CFGNotUnderstood");
       if (DoExtraAnalysis)
         Result = false;
       else
@@ -488,13 +477,16 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
     // Check whether the BranchInst is a supported one. Only unconditional
     // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
-    if (Br && Br->isConditional() &&
+    // FIXME: We skip these checks when VPlan predication is enabled as we
+    // want to allow divergent branches. This whole check will be removed
+    // once VPlan predication is on by default.
+    if (!EnableVPlanPredication && Br && Br->isConditional() &&
         !TheLoop->isLoopInvariant(Br->getCondition()) &&
         !LI->isLoopHeader(Br->getSuccessor(0)) &&
         !LI->isLoopHeader(Br->getSuccessor(1))) {
-      LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
-      ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-                << "loop control flow is not understood by vectorizer");
+      reportVectorizationFailure("Unsupported conditional branch",
+          "loop control flow is not understood by vectorizer",
+          "CFGNotUnderstood");
      if (DoExtraAnalysis)
        Result = false;
      else
@@ -506,11 +498,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
   // simple outer loops scenarios with uniform nested loops.
   if (!isUniformLoopNest(TheLoop /*loop nest*/,
                          TheLoop /*context outer loop*/)) {
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("Outer loop contains divergent loops",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -519,10 +509,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
 
   // Check whether we are able to set up outer loop induction.
   if (!setupOuterLoopInductions()) {
-    LLVM_DEBUG(
-        dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n");
-    ORE->emit(createMissedAnalysis("UnsupportedPhi")
-              << "Unsupported outer loop Phi(s)");
+    reportVectorizationFailure("Unsupported outer loop Phi(s)",
+                               "Unsupported outer loop Phi(s)",
+                               "UnsupportedPhi");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -627,9 +616,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // Check that this PHI type is allowed.
         if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
             !PhiTy->isPointerTy()) {
-          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
-                    << "loop control flow is not understood by vectorizer");
-          LLVM_DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+          reportVectorizationFailure("Found a non-int non-pointer PHI",
+              "loop control flow is not understood by vectorizer",
+              "CFGNotUnderstood");
           return false;
         }
 
@@ -647,9 +636,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
         // We only allow if-converted PHIs with exactly two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
-          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
-                    << "control flow not understood by vectorizer");
-          LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          reportVectorizationFailure("Found an invalid PHI",
+              "loop control flow is not understood by vectorizer",
+              "CFGNotUnderstood", Phi);
          return false;
         }
 
@@ -698,10 +687,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
-        ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
-                  << "value that could not be identified as "
-                     "reduction is used outside the loop");
-        LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+        reportVectorizationFailure("Found an unidentified PHI",
+            "value that could not be identified as "
+            "reduction is used outside the loop",
+            "NonReductionValueUsedOutsideLoop", Phi);
         return false;
       } // end of PHI handling
 
@@ -728,31 +717,33 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
-          ORE->emit(createMissedAnalysis("CantVectorizeLibcall", CI)
-                    << "library call cannot be vectorized. "
-                       "Try compiling with -fno-math-errno, -ffast-math, "
-                       "or similar flags");
+          reportVectorizationFailure("Found a non-intrinsic callsite",
+                                     "library call cannot be vectorized. "
+                                     "Try compiling with -fno-math-errno, -ffast-math, "
+                                     "or similar flags",
+                                     "CantVectorizeLibcall", CI);
         } else {
-          ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
-                    << "call instruction cannot be vectorized");
+          reportVectorizationFailure("Found a non-intrinsic callsite",
+                                     "call instruction cannot be vectorized",
+                                     "CantVectorizeLibcall", CI);
         }
-        LLVM_DEBUG(
-            dbgs() << "LV: Found a non-intrinsic callsite.\n");
         return false;
       }
 
-      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
-      // second argument is the same (i.e. loop invariant)
-      if (CI && hasVectorInstrinsicScalarOpd(
-              getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+      // Some intrinsics have scalar arguments and should be same in order for
+      // them to be vectorized (i.e. loop invariant).
+      if (CI) {
         auto *SE = PSE.getSE();
-        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
-          ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
-                    << "intrinsic instruction cannot be vectorized");
-          LLVM_DEBUG(dbgs()
-                     << "LV: Found unvectorizable intrinsic " << *CI << "\n");
-          return false;
-        }
+        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+          if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+              reportVectorizationFailure("Found unvectorizable intrinsic",
+                  "intrinsic instruction cannot be vectorized",
+                  "CantVectorizeIntrinsic", CI);
+              return false;
+            }
+          }
       }
 
       // Check that the instruction return type is vectorizable.
@@ -760,9 +751,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if ((!VectorType::isValidElementType(I.getType()) &&
            !I.getType()->isVoidTy()) ||
           isa<ExtractElementInst>(I)) {
-        ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
-                  << "instruction return type cannot be vectorized");
-        LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+        reportVectorizationFailure("Found unvectorizable type",
+            "instruction return type cannot be vectorized",
+            "CantVectorizeInstructionReturnType", &I);
         return false;
       }
 
@@ -770,11 +761,44 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (auto *ST = dyn_cast<StoreInst>(&I)) {
         Type *T = ST->getValueOperand()->getType();
         if (!VectorType::isValidElementType(T)) {
-          ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
-                    << "store instruction cannot be vectorized");
+          reportVectorizationFailure("Store instruction cannot be vectorized",
+                                     "store instruction cannot be vectorized",
+                                     "CantVectorizeStore", ST);
           return false;
         }
 
+        // For nontemporal stores, check that a nontemporal vector version is
+        // supported on the target.
+        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+          // Arbitrarily try a vector of 2 elements.
+          Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of stored type");
+          unsigned Alignment = getLoadStoreAlignment(ST);
+          if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal store instruction cannot be vectorized",
+                "nontemporal store instruction cannot be vectorized",
+                "CantVectorizeNontemporalStore", ST);
+            return false;
+          }
+        }
+
+      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+          // For nontemporal loads, check that a nontemporal vector version is
+          // supported on the target (arbitrarily try a vector of 2 elements).
+          Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of load type");
+          unsigned Alignment = getLoadStoreAlignment(LD);
+          if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal load instruction cannot be vectorized",
+                "nontemporal load instruction cannot be vectorized",
+                "CantVectorizeNontemporalLoad", LD);
+            return false;
+          }
+        }
+
         // FP instructions can allow unsafe algebra, thus vectorizable by
         // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
@@ -797,23 +821,27 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           AllowedExit.insert(&I);
           continue;
         }
-        ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
-                  << "value cannot be used outside the loop");
+        reportVectorizationFailure("Value cannot be used outside the loop",
+                                   "value cannot be used outside the loop",
+                                   "ValueUsedOutsideLoop", &I);
         return false;
       }
     } // next instr.
   }
 
   if (!PrimaryInduction) {
-    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     if (Inductions.empty()) {
-      ORE->emit(createMissedAnalysis("NoInductionVariable")
-                << "loop induction variable could not be identified");
+      reportVectorizationFailure("Did not find one integer induction var",
+          "loop induction variable could not be identified",
+          "NoInductionVariable");
       return false;
     } else if (!WidestIndTy) {
-      ORE->emit(createMissedAnalysis("NoIntegerInductionVariable")
-                << "integer loop induction variable could not be identified");
+      reportVectorizationFailure("Did not find one integer induction var",
+          "integer loop induction variable could not be identified",
+          "NoIntegerInductionVariable");
       return false;
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     }
   }
 
@@ -839,11 +867,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return false;
 
   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
-    ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write to a loop invariant address could not "
-                 "be vectorized");
-    LLVM_DEBUG(
-        dbgs() << "LV: Non vectorizable stores to a uniform address\n");
+    reportVectorizationFailure("Stores to a uniform address",
+        "write to a loop invariant address could not be vectorized",
+        "CantVectorizeStoreToLoopInvariantAddress");
     return false;
   }
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
@@ -925,8 +951,9 @@ bool LoopVectorizationLegality::blockCanBePredicated(
 
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
-    ORE->emit(createMissedAnalysis("IfConversionDisabled")
-              << "if-conversion is disabled");
+    reportVectorizationFailure("If-conversion is disabled",
+                               "if-conversion is disabled",
+                               "IfConversionDisabled");
     return false;
   }
 
@@ -950,21 +977,26 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   for (BasicBlock *BB : TheLoop->blocks()) {
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator())) {
-      ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
-                << "loop contains a switch statement");
+      reportVectorizationFailure("Loop contains a switch statement",
+                                 "loop contains a switch statement",
+                                 "LoopContainsSwitch", BB->getTerminator());
       return false;
     }
 
     // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB)) {
       if (!blockCanBePredicated(BB, SafePointes)) {
-        ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                  << "control flow cannot be substituted for a select");
+        reportVectorizationFailure(
+            "Control flow cannot be substituted for a select",
+            "control flow cannot be substituted for a select",
+            "NoCFGForSelect", BB->getTerminator());
         return false;
       }
     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
-      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                << "control flow cannot be substituted for a select");
+      reportVectorizationFailure(
+          "Control flow cannot be substituted for a select",
+          "control flow cannot be substituted for a select",
+          "NoCFGForSelect", BB->getTerminator());
       return false;
     }
   }
@@ -992,9 +1024,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   // be canonicalized.
   if (!Lp->getLoopPreheader()) {
-    LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("Loop doesn't have a legal pre-header",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1003,8 +1035,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
 
   // We must have a single backedge.
   if (Lp->getNumBackEdges() != 1) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The loop must have a single backedge",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1013,8 +1046,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
 
   // We must have a single exiting block.
   if (!Lp->getExitingBlock()) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The loop must have an exiting block",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1025,8 +1059,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
   // checked at the end of each iteration. With that we can assume that all
   // instructions in the loop are executed the same number of times.
   if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The exiting block is not the loop latch",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1087,7 +1122,9 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
 
     if (!canVectorizeOuterLoop()) {
-      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+      reportVectorizationFailure("Unsupported outer loop",
+                                 "unsupported outer loop",
+                                 "UnsupportedOuterLoop");
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
       return false;
@@ -1137,10 +1174,9 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
 
   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
-    ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
-              << "Too many SCEV assumptions need to be made and checked "
-              << "at runtime");
-    LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+    reportVectorizationFailure("Too many SCEV checks needed",
+        "Too many SCEV assumptions need to be made and checked at runtime",
+        "TooManySCEVRunTimeChecks");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1159,20 +1195,20 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
   if (!PrimaryInduction) {
-    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
-              << "Missing a primary induction variable in the loop, which is "
-              << "needed in order to fold tail by masking as required.");
-    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
-                      << "masking.\n");
+    reportVectorizationFailure(
+        "No primary induction, cannot fold tail by masking",
+        "Missing a primary induction variable in the loop, which is "
+        "needed in order to fold tail by masking as required.",
+        "NoPrimaryInduction");
     return false;
   }
 
   // TODO: handle reductions when tail is folded by masking.
   if (!Reductions.empty()) {
-    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
-              << "Cannot fold tail by masking in the presence of reductions.");
-    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
-                      << "masking.\n");
+    reportVectorizationFailure(
+        "Loop has reductions, cannot fold tail by masking",
+        "Cannot fold tail by masking in the presence of reductions.",
+        "ReductionFoldingTailByMasking");
     return false;
   }
 
@@ -1183,10 +1219,10 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
         continue;
-      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
-                << "Cannot fold tail by masking in the presence of live outs.");
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
-                        << "outside user for : " << *UI << '\n');
+      reportVectorizationFailure(
+          "Cannot fold tail by masking, loop has an outside user for",
+          "Cannot fold tail by masking in the presence of live outs.",
+          "LiveOutFoldingTailByMasking", UI);
       return false;
     }
   }
@@ -1198,9 +1234,10 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers)) {
-      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                << "control flow cannot be substituted for a select");
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      reportVectorizationFailure(
+          "Cannot fold tail by masking as required",
+          "control flow cannot be substituted for a select",
+          "NoCFGForSelect", BB->getTerminator());
      return false;
     }
   }
diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 2aa219064299..97077cce83e3 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -1,9 +1,8 @@
 //===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -172,6 +171,13 @@ struct VectorizationFactor {
   unsigned Width;
   // Cost of the loop with that width
   unsigned Cost;
+
+  // Width 1 means no vectorization, cost 0 means uncomputed cost.
+  static VectorizationFactor Disabled() { return {1, 0}; }
+
+  bool operator==(const VectorizationFactor &rhs) const {
+    return Width == rhs.Width && Cost == rhs.Cost;
+  }
 };
 
 /// Planner drives the vectorization process after having passed
@@ -192,11 +198,9 @@ class LoopVectorizationPlanner {
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
-  /// The profitablity analysis.
+  /// The profitability analysis.
   LoopVectorizationCostModel &CM;
 
-  using VPlanPtr = std::unique_ptr<VPlan>;
-
   SmallVector<VPlanPtr, 4> VPlans;
 
   /// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -222,8 +226,9 @@ public:
                            LoopVectorizationCostModel &CM)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
 
-  /// Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+  /// Plan how to best vectorize, return the best VF and its cost, or None if
+  /// vectorization and interleaving should be avoided up front.
+  Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c45dee590b84..46265e3f3e13 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1,9 +1,8 @@
 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -57,8 +56,10 @@
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
+#include "VPlan.h"
 #include "VPlanHCFGBuilder.h"
 #include "VPlanHCFGTransforms.h"
+#include "VPlanPredicator.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -86,7 +87,9 @@
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -133,6 +136,7 @@
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 #include <cassert>
@@ -256,6 +260,13 @@ cl::opt<bool> EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+// FIXME: Remove this switch once we have divergence analysis. Currently we
+// assume divergent non-backedge branches when this switch is true.
+cl::opt<bool> EnableVPlanPredication(
+    "enable-vplan-predication", cl::init(false), cl::Hidden,
+    cl::desc("Enable VPlan-native vectorization path predicator with "
+             "support for outer loop vectorization."));
+
 // This flag enables the stress testing of the VPlan H-CFG construction in the
 // VPlan-native vectorization path. It must be used in conjuction with
 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -267,6 +278,13 @@ static cl::opt<bool> VPlanBuildStressTest(
         "out right after the build (stress test the VPlan H-CFG construction "
         "in the VPlan-native vectorization path)."));
 
+cl::opt<bool> llvm::EnableLoopInterleaving(
+    "interleave-loops", cl::init(true), cl::Hidden,
+    cl::desc("Enable loop interleaving in Loop vectorization passes"));
+cl::opt<bool> llvm::EnableLoopVectorization(
+    "vectorize-loops", cl::init(true), cl::Hidden,
+    cl::desc("Run the Loop vectorization passes"));
+
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
 /// the scalar type.
@@ -311,11 +329,14 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
 
 /// A helper function that adds a 'fast' flag to floating-point operations.
 static Value *addFastMathFlag(Value *V) {
-  if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setFast();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
-  }
+  if (isa<FPMathOperator>(V))
+    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+  return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+  if (isa<FPMathOperator>(V))
+    cast<Instruction>(V)->setFastMathFlags(FMF);
   return V;
 }
 
@@ -760,7 +781,7 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
   const DILocation *DIL = Inst->getDebugLoc();
   if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
       !isa<DbgInfoIntrinsic>(Inst)) {
-    auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF);
+    auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
     if (NewDIL)
       B.SetCurrentDebugLocation(NewDIL.getValue());
     else
@@ -836,7 +857,7 @@ public:
         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
 
   /// \return An upper bound for the vectorization factor, or None if
-  /// vectorization should be avoided up front.
+  /// vectorization and interleaving should be avoided up front.
   Optional<unsigned> computeMaxVF(bool OptForSize);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
@@ -1149,6 +1170,18 @@ public:
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+  /// with factor VF. Return the cost of the instruction, including
+  /// scalarization overhead if it's needed.
+  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+  /// Estimate cost of a call instruction CI if it were vectorized with factor
+  /// VF. Return the cost of the instruction, including scalarization overhead
+  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+  /// scalarized -
+  /// i.e. either vector version isn't available, or is too expensive.
+  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1201,6 +1234,10 @@ private:
   /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
+  /// Estimate the overhead of scalarizing an instruction. This is a
+  /// convenience wrapper for the type-based getScalarizationOverhead API.
+  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1295,6 +1332,30 @@ private:
 
   DecisionList WideningDecisions;
 
+  /// Returns true if \p V is expected to be vectorized and it needs to be
+  /// extracted.
+  bool needsExtract(Value *V, unsigned VF) const {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+      return false;
+
+    // Assume we can vectorize V (and hence we need extraction) if the
+    // scalars are not computed yet. This can happen, because it is called
+    // via getScalarizationOverhead from setCostBasedWideningDecision, before
+    // the scalars are collected. That should be a safe assumption in most
+    // cases, because we check if the operands have vectorizable types
+    // beforehand in LoopVectorizationLegality.
+    return Scalars.find(VF) == Scalars.end() ||
+           !isScalarAfterVectorization(I, VF);
+  };
+
+  /// Returns a range containing only operands needing to be extracted.
+  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+                                                   unsigned VF) {
+    return SmallVector<Value *, 4>(make_filter_range(
+        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -1372,12 +1433,6 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
     return false;
   }
 
-  if (!Hints.getWidth()) {
-    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
-    Hints.emitRemarkWithHints();
-    return false;
-  }
-
   if (Hints.getInterleave() > 1) {
     // TODO: Interleave support is future work.
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
@@ -1447,12 +1502,13 @@ struct LoopVectorize : public FunctionPass {
     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
 
     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
-                        GetLAA, *ORE);
+                        GetLAA, *ORE, PSI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1478,6 +1534,7 @@ struct LoopVectorize : public FunctionPass {
 
     AU.addPreserved<BasicAAWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
   }
 };
 
@@ -2051,7 +2108,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
       // A[i] = b;     // Member of index 0
       // A[i+2] = c;   // Member of index 2 (Current instruction)
       // Current pointer is pointed to A[i+2], adjust it to A[i].
-      NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+      NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
       if (InBounds)
         cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
 
@@ -2093,8 +2150,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                          GroupMask, UndefVec, "wide.masked.vec");
       }
       else
-        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
-                                            Group->getAlignment(), "wide.vec");
+        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+                                            Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2239,16 +2296,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
         // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+            Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
         PartPtr->setIsInBounds(InBounds);
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+            Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
         PartPtr->setIsInBounds(InBounds);
         if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
           Mask[Part] = reverseVector(Mask[Part]);
       } else {
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+            Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
         PartPtr->setIsInBounds(InBounds);
       }
 
@@ -2305,7 +2362,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                          UndefValue::get(DataTy),
                                          "wide.masked.load");
       else
-        NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+        NewLI =
+            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
 
       // Add metadata to the load, but setVectorValue to the reverse shuffle.
       addMetadata(NewLI, LI);
@@ -2665,7 +2723,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     assert(isa<SCEVConstant>(Step) &&
            "Expected constant step for pointer induction");
     return B.CreateGEP(
-        nullptr, StartValue,
+        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
   }
@@ -2849,26 +2907,42 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     BCResumeVal->addIncoming(EndValue, MiddleBlock);
 
     // Fix the scalar body counter (PHI node).
-    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-
     // The old induction's phi node in the scalar body needs the truncated
     // value.
     for (BasicBlock *BB : LoopBypassBlocks)
       BCResumeVal->addIncoming(II.getStartValue(), BB);
-    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
   }
 
+  // We need the OrigLoop (scalar loop part) latch terminator to help
+  // produce correct debug info for the middle block BB instructions.
+  // The legality check stage guarantees that the loop will have a single
+  // latch.
+  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
+         "Scalar loop latch terminator isn't a branch");
+  BranchInst *ScalarLatchBr =
+      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   // If tail is to be folded, we know we don't need to run the remainder.
   Value *CmpN = Builder.getTrue();
-  if (!Cost->foldTailByMasking())
+  if (!Cost->foldTailByMasking()) {
     CmpN =
         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
-  ReplaceInstWithInst(MiddleBlock->getTerminator(),
-                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));
+
+    // Here we use the same DebugLoc as the scalar loop latch branch instead
+    // of the corresponding compare because they may have ended up with
+    // different line numbers and we want to avoid awkward line stepping while
+    // debugging. Eg. if the compare has got a line number inside the loop.
+    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
+  }
+
+  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
+  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
 
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
@@ -3022,45 +3096,9 @@ static void cse(BasicBlock *BB) {
   }
 }
 
-/// Estimate the overhead of scalarizing an instruction. This is a
-/// convenience wrapper for the type-based getScalarizationOverhead API.
-static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         const TargetTransformInfo &TTI) {
-  if (VF == 1)
-    return 0;
-
-  unsigned Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) ||
-       !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
-
-  // Some targets keep addresses scalar.
-  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
-    return Cost;
-
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-  else if (!isa<StoreInst>(I) ||
-           !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-
-  return Cost;
-}
-
-// Estimate cost of a call instruction CI if it were vectorized with factor VF.
-// Return the cost of the instruction, including scalarization overhead if it's
-// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
-// i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
-                                  const TargetTransformInfo &TTI,
-                                  const TargetLibraryInfo *TLI,
-                                  bool &NeedToScalarize) {
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                                       unsigned VF,
+                                                       bool &NeedToScalarize) {
   Function *F = CI->getCalledFunction();
   StringRef FnName = CI->getCalledFunction()->getName();
   Type *ScalarRetTy = CI->getType();
@@ -3083,7 +3121,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3102,12 +3140,8 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
   return Cost;
 }
 
-// Estimate cost of an intrinsic call instruction CI if it were vectorized with
-// factor VF. Return the cost of the instruction, including scalarization
-// overhead if it's needed.
-static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
-                                       const TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo *TLI) {
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+                                                            unsigned VF) {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
@@ -3468,7 +3502,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
     Start->addIncoming(Incoming, BB);
   }
 
-  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
+  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
   Phi->setName("scalar.recur");
 
   // Finally, fix users of the recurrence outside the loop. The users will need
@@ -3596,14 +3630,23 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   // Reduce all of the unrolled parts into a single vector.
   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
-  setDebugLocFromInst(Builder, ReducedPartRdx);
+
+  // The middle block terminator has already been assigned a DebugLoc here (the
+  // OrigLoop's single latch terminator). We want the whole middle block to
+  // appear to execute on this line because: (a) it is all compiler generated,
+  // (b) these instructions are always executed after evaluating the latch
+  // conditional branch, and (c) other passes may add new predecessors which
+  // terminate on this line. This is the easiest way to ensure we don't
+  // accidentally cause an extra step back into the loop while debugging.
+  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
   for (unsigned Part = 1; Part < UF; ++Part) {
     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
       // Floating point operations had to be 'fast' to enable the reduction.
       ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
-                              ReducedPartRdx, "bin.rdx"));
+                              ReducedPartRdx, "bin.rdx"),
+          RdxDesc.getFastMathFlags());
     else
       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                       RdxPart);
@@ -3935,9 +3978,11 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
-      auto *NewGEP = GEP->isInBounds()
-                         ? Builder.CreateInBoundsGEP(Ptr, Indices)
-                         : Builder.CreateGEP(Ptr, Indices);
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
              "NewGEP is not a pointer vector");
       VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
@@ -3955,6 +4000,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
+  case Instruction::FNeg:
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
@@ -3965,21 +4011,22 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   case Instruction::And:
   case Instruction::Or:
  case Instruction::Xor: {
-    // Just widen binops.
-    auto *BinOp = cast<BinaryOperator>(&I);
-    setDebugLocFromInst(Builder, BinOp);
+    // Just widen unops and binops.
+    setDebugLocFromInst(Builder, &I);
 
     for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
-      Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
-      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+      SmallVector<Value *, 2> Ops;
+      for (Value *Op : I.operands())
+        Ops.push_back(getOrCreateVectorValue(Op, Part));
 
-      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
-        VecOp->copyIRFlags(BinOp);
+      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+      if (auto *VecOp = dyn_cast<Instruction>(V))
+        VecOp->copyIRFlags(&I);
 
       // Use this vector value for all users of the original instruction.
       VectorLoopValueMap.setVectorValue(&I, Part, V);
-      addMetadata(V, BinOp);
+      addMetadata(V, &I);
     }
 
     break;
@@ -4088,9 +4135,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
     // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
     assert((UseVectorIntrinsic || !NeedToScalarize) &&
            "Instruction should be scalarized elsewhere.");
 
@@ -4395,6 +4442,13 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
   auto *Group = getInterleavedAccessGroup(I);
   assert(Group && "Must have a group.");
 
+  // If the instruction's allocated size doesn't equal it's type size, it
+  // requires padding and will be scalarized.
+  auto &DL = I->getModule()->getDataLayout();
+  auto *ScalarTy = getMemInstValueType(I);
+  if (hasIrregularType(ScalarTy, DL, VF))
+    return false;
+
   // Check if masking is required.
   // A Group may need masking for one of two reasons: it resides in a block that
   // needs predication, or it was decided to use masking to deal with gaps.
@@ -4987,6 +5041,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
   if (LoopCost == 0)
     LoopCost = expectedCost(VF).first;
 
+  assert(LoopCost && "Non-zero loop cost expected");
+
   // Clamp the calculated IC to be between the 1 and the max interleave count
   // that the target allows.
   if (IC > MaxInterleaveCount)
@@ -5314,15 +5370,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
     return true;
   };
 
-  // Returns true if an operand that cannot be scalarized must be extracted
-  // from a vector. We will account for this scalarization overhead below. Note
-  // that the non-void predicated instructions are placed in their own blocks,
-  // and their return values are inserted into vectors. Thus, an extract would
-  // still be required.
-  auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
-  };
-
   // Compute the expected cost discount from scalarizing the entire expression
   // feeding the predicated instruction. We currently only consider expressions
   // that are single-use instruction chains.
@@ -5362,7 +5409,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
              "Instruction has non-scalar type");
       if (canBeScalarized(J))
         Worklist.push_back(J);
-      else if (needsExtract(J))
+      else if (needsExtract(J, VF))
         ScalarCost += TTI.getScalarizationOverhead(
                               ToVectorTy(J->getType(),VF), false, true);
     }
@@ -5484,7 +5531,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, TTI);
+  Cost += getScalarizationOverhead(I, VF);
 
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
@@ -5636,6 +5683,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                              unsigned VF) {
+
+  if (VF == 1)
+    return 0;
+
+  unsigned Cost = 0;
+  Type *RetTy = ToVectorTy(I->getType(), VF);
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
+  // Some targets support efficient element stores.
+  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+    return Cost;
+
+  // Collect operands to consider.
+  CallInst *CI = dyn_cast<CallInst>(I);
+  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return Cost + TTI.getOperandsScalarizationOverhead(
+                    filterExtractingOperands(Ops, VF), VF);
+}
+
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
@@ -5876,7 +5953,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF);
 
     // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
@@ -5916,6 +5993,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
         I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, Op2VK,
         TargetTransformInfo::OP_None, Op2VP, Operands);
   }
+  case Instruction::FNeg: {
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+    return N * TTI.getArithmeticInstrCost(
+                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+                   I->getOperand(0));
+  }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -5997,16 +6082,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
-      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
     return CallCost;
   }
  default:
     // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
 
@@ -6027,10 +6112,13 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
 
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                               bool VectorizeOnlyWhenForced) {
   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
@@ -6066,50 +6154,65 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   }
 }
 
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+                                 LoopVectorizationCostModel &CM) {
+  unsigned WidestType;
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  return WidestVectorRegBits / WidestType;
+}
+
 VectorizationFactor
 LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
                                                 unsigned UserVF) {
-  // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  const VectorizationFactor NoVectorization = {1U, 0U};
-
+  unsigned VF = UserVF;
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
   // the vectorization pipeline.
   if (!OrigLoop->empty()) {
-    // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
-    // This won't be necessary when UserVF is not required in the VPlan-native
-    // path.
-    if (VPlanBuildStressTest && !UserVF)
-      UserVF = 4;
-
+    // If the user doesn't provide a vectorization factor, determine a
+    // reasonable one.
+    if (!UserVF) {
+      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+      // Make sure we have a VF > 1 for stress testing.
+      if (VPlanBuildStressTest && VF < 2) {
+        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
+        VF = 4;
+      }
+    }
     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-    assert(UserVF && "Expected UserVF for outer loop vectorization.");
-    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    buildVPlans(UserVF, UserVF);
+    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
+    buildVPlans(VF, VF);
 
     // For VPlan build stress testing, we bail out after VPlan construction.
     if (VPlanBuildStressTest)
-      return NoVectorization;
+      return VectorizationFactor::Disabled();
 
-    return {UserVF, 0};
+    return {VF, 0};
   }
 
   LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
-  return NoVectorization;
+  return VectorizationFactor::Disabled();
 }
 
-VectorizationFactor
-LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
                                                             unsigned UserVF) {
   assert(OrigLoop->empty() && "Inner loop expected.");
-  // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  const VectorizationFactor NoVectorization = {1U, 0U};
   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
-  if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
-    return NoVectorization;
+  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
+    return None;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
@@ -6129,7 +6232,7 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
     CM.selectUserVectorizationFactor(UserVF);
     buildVPlansWithVPRecipes(UserVF, UserVF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {UserVF, 0};
+    return {{UserVF, 0}};
  }
 
   unsigned MaxVF = MaybeMaxVF.getValue();
@@ -6148,7 +6251,7 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   buildVPlansWithVPRecipes(1, MaxVF);
   LLVM_DEBUG(printPlans(dbgs()));
   if (MaxVF == 1)
-    return NoVectorization;
+    return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
   return CM.selectVectorizationFactor(MaxVF);
@@ -6527,6 +6630,7 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
   case Instruction::FCmp:
   case Instruction::FDiv:
   case Instruction::FMul:
+  case Instruction::FNeg:
   case Instruction::FPExt:
   case Instruction::FPToSI:
  case Instruction::FPToUI:
@@ -6582,9 +6686,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
     // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
@@ -6756,8 +6860,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
   }
 }
 
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
   // Hold a mapping from predicated instructions to their recipes, in order to
@@ -6772,7 +6875,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
   // Represent values that will have defs inside VPlan.
   for (Value *V : NeedDef)
     Plan->addVPValue(V);
@@ -6881,8 +6984,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   return Plan;
 }
 
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
@@ -6897,13 +6999,22 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildHierarchicalCFG();
 
+  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+    Plan->addVF(VF);
+
+  if (EnableVPlanPredication) {
+    VPlanPredicator VPP(*Plan);
+    VPP.predicate();
+
+    // Avoid running transformation to recipes until masked code generation in
+    // VPlan-native path is in place.
+    return Plan;
+  }
+
   SmallPtrSet<Instruction *, 1> DeadInstructions;
   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
       Plan, Legal->getInductionVars(), DeadInstructions);
 
-  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
-    Plan->addVF(VF);
-
   return Plan;
 }
 
@@ -7096,7 +7207,8 @@ static bool processLoopInVPlanNativePath(
     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
-    OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
   Function *F = L->getHeader()->getParent();
@@ -7109,24 +7221,28 @@ static bool processLoopInVPlanNativePath(
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
 
   // Get user vectorization factor.
-  unsigned UserVF = Hints.getWidth();
+  const unsigned UserVF = Hints.getWidth();
 
-  // Check the function attributes to find out if this function should be
-  // optimized for size.
+  // Check the function attributes and profiles to find out if this function
+  // should be optimized for size.
   bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      (F->hasOptSize() ||
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
 
   // Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
 
   // If we are stress testing VPlan builds, do not attempt to generate vector
-  // code.
-  if (VPlanBuildStressTest)
+  // code. Masked vector code generation support will follow soon.
+  // Also, do not attempt to vectorize if no vector code will be produced.
+  if (VPlanBuildStressTest || EnableVPlanPredication ||
+      VectorizationFactor::Disabled() == VF)
    return false;
 
   LVP.setBestPlan(VF.Width, 1);
 
-  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                          &CM);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
@@ -7184,7 +7300,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements(*ORE);
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                 &Requirements, &Hints, DB, AC);
   if (!LVL.canVectorize(EnableVPlanNativePath)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
     Hints.emitRemarkWithHints();
    return false;
   }
 
-  // Check the function attributes to find out if this function should be
-  // optimized for size.
+  // Check the function attributes and profiles to find out if this function
+  // should be optimized for size.
   bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      (F->hasOptSize() ||
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here.
They may require CFG and instruction level transformations before @@ -7204,7 +7322,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // pipeline. if (!L->empty()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, Hints); + ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip @@ -7304,14 +7422,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - VectorizationFactor VF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); - // Select the interleave count. - unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); - - // Get user interleave count. + VectorizationFactor VF = VectorizationFactor::Disabled(); + unsigned IC = 1; unsigned UserIC = Hints.getInterleave(); + if (MaybeVF) { + VF = *MaybeVF; + // Select the interleave count. + IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + } + // Identify the diagnostic messages that should be produced. std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -7330,7 +7452,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (IC == 1 && UserIC <= 1) { + if (!MaybeVF && UserIC > 1) { + // Tell the user interleaving was avoided up-front, despite being explicitly + // requested. + LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " + "interleaving should be avoided up front\n"); + IntDiagMsg = std::make_pair( + "InterleavingAvoided", + "Ignoring UserIC, because interleaving was avoided up front"); + InterleaveLoop = false; + } else if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = std::make_pair( @@ -7457,7 +7588,7 @@ bool LoopVectorizePass::runImpl( DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, - OptimizationRemarkEmitter &ORE_) { + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; TTI = &TTI_; @@ -7469,6 +7600,7 @@ bool LoopVectorizePass::runImpl( GetLAA = &GetLAA_; DB = &DB_; ORE = &ORE_; + PSI = PSI_; // Don't attempt if // 1. the target claims to have no vector registers, and @@ -7488,7 +7620,8 @@ bool LoopVectorizePass::runImpl( // will simplify all loops, regardless of whether anything end up being // vectorized. for (auto &L : *LI) - Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */); + Changed |= + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops @@ -7527,15 +7660,22 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + MemorySSA *MSSA = EnableMSSALoopDependency + ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() + : nullptr; auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; + const ModuleAnalysisManager &MAM = + AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2e856a7e6802..27a86c0bca91 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1,9 +1,8 @@ //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -106,6 +105,10 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); +cl::opt<bool> + llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -207,6 +210,13 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +/// \returns True if \p I is commutative, handles CmpInst as well as Instruction. +static bool isCommutative(Instruction *I) { + if (auto *IC = dyn_cast<CmpInst>(I)) + return IC->isCommutative(); + return I->isCommutative(); +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -438,8 +448,9 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - return (CI->getArgOperand(1) == Scalar); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + if (hasVectorInstrinsicScalarOpd(ID, i)) + return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; } @@ -474,6 +485,8 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { + struct TreeEntry; + public: using ValueList = SmallVector<Value *, 8>; using InstrList = SmallVector<Instruction *, 16>; @@ -517,7 +530,7 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost(); + int getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. 
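For orientation, the cost interfaces above follow SLP's sign convention: a negative tree cost means the vector form is expected to be cheaper than the scalars it replaces. A minimal sketch of how a caller chains them together, loosely modeled on the pass's own tryToVectorizeList (the helper name below is hypothetical, and error handling plus the extra buildTree parameters are omitted):

static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL) {
  R.buildTree(VL);                      // Grow the tree rooted at the bundle.
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                       // Too small to amortize the overhead.
  R.computeMinimumValueSizes();         // Allow narrower vector element types.
  int Cost = R.getTreeCost();           // Spill cost is folded into the total.
  if (Cost >= -SLPCostThreshold)        // Must beat the -slp-threshold margin.
    return false;
  R.vectorizeTree();                    // Emit vector code plus extracts.
  return true;
}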
@@ -576,7 +589,7 @@ public: /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. - unsigned getVectorElementSize(Value *V); + unsigned getVectorElementSize(Value *V) const; /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. @@ -599,13 +612,512 @@ public: /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. - bool isTreeTinyAndNotFullyVectorizable(); + bool isTreeTinyAndNotFullyVectorizable() const; OptimizationRemarkEmitter *getORE() { return ORE; } -private: - struct TreeEntry; + /// This structure holds any data we need about the edges being traversed + /// during buildTree_rec(). We keep track of: + /// (i) the user TreeEntry index, and + /// (ii) the index of the edge. + struct EdgeInfo { + EdgeInfo() = default; + EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) + : UserTE(UserTE), EdgeIdx(EdgeIdx) {} + /// The user TreeEntry. + TreeEntry *UserTE = nullptr; + /// The operand index of the use. + unsigned EdgeIdx = UINT_MAX; +#ifndef NDEBUG + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::EdgeInfo &EI) { + EI.dump(OS); + return OS; + } + /// Debug print. + void dump(raw_ostream &OS) const { + OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") + << " EdgeIdx:" << EdgeIdx << "}"; + } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } +#endif + }; + + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. 
For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector<OperandData, 2>; + + /// A vector of operand vectors. + SmallVector<OperandDataVec, 4> OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + // Search all operands in Ops[*][Lane] for the one that best matches + // Ops[OpIdx][LastLane] and return its operand index. + // If no good match can be found, return None. + Optional<unsigned> + getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes) { + unsigned NumOperands = getNumOperands(); + + // The operand of the previous lane at OpIdx. + Value *OpLastLane = getData(OpIdx, LastLane).V; + + // Our strategy mode for OpIdx. + ReorderingMode RMode = ReorderingModes[OpIdx]; + + // The linearized opcode of the operand at OpIdx, Lane. + bool OpIdxAPO = getData(OpIdx, Lane).APO; + + const unsigned BestScore = 2; + const unsigned GoodScore = 1; + + // The best operand index and its score. + // Sometimes we have more than one option (e.g., Opcode and Undefs), so we + // are using the score to differentiate between the two. + struct BestOpData { + Optional<unsigned> Idx = None; + unsigned Score = 0; + } BestOp; + + // Iterate through all unused operands and look for the best. + for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { + // Get the operand at Idx and Lane. + OperandData &OpData = getData(Idx, Lane); + Value *Op = OpData.V; + bool OpAPO = OpData.APO; + + // Skip already selected operands. + if (OpData.IsUsed) + continue; + + // Skip if we are trying to move the operand to a position with a + // different opcode in the linearized tree form. This would break the + // semantics. + if (OpAPO != OpIdxAPO) + continue; + + // Look for an operand that matches the current mode. + switch (RMode) { + case ReorderingMode::Load: + if (isa<LoadInst>(Op)) { + // Figure out which is left and right, so that we can check for + // consecutive loads. + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ?
Op : OpLastLane; + if (isConsecutiveAccess(cast<LoadInst>(OpLeft), + cast<LoadInst>(OpRight), DL, SE)) + BestOp.Idx = Idx; + } + break; + case ReorderingMode::Opcode: + // We accept both Instructions and Undefs, but with different scores. + if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && + cast<Instruction>(Op)->getOpcode() == + cast<Instruction>(OpLastLane)->getOpcode()) || + (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || + isa<UndefValue>(Op)) { + // An instruction has a higher score than an undef. + unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Constant: + if (isa<Constant>(Op)) { + unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Splat: + if (Op == OpLastLane) + BestOp.Idx = Idx; + break; + case ReorderingMode::Failed: + return None; + } + } + + if (BestOp.Idx) { + getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + return BestOp.Idx; + } + // If we could not find a good match return None. + return None; + } + + /// Helper for reorderOperandVecs. \Returns the lane that we should start + /// reordering from. This is the one which has the least number of operands + /// that can freely move about. + unsigned getBestLaneToStartReordering() const { + unsigned BestLane = 0; + unsigned Min = UINT_MAX; + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) { + unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); + if (NumFreeOps < Min) { + Min = NumFreeOps; + BestLane = Lane; + } + } + return BestLane; + } + + /// \Returns the maximum number of operands that are allowed to be reordered + /// for \p Lane. This is used as a heuristic for selecting the first lane to + /// start operand reordering. + unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + unsigned CntTrue = 0; + unsigned NumOperands = getNumOperands(); + // Operands with the same APO can be reordered. We therefore need to count + // how many of them we have for each APO, like this: Cnt[APO] = x. + // Since we only have two APOs, namely true and false, we can avoid using + // a map. Instead we can simply count the number of operands that + // correspond to one of them (in this case the 'true' APO), and calculate + // the other by subtracting it from the total number of operands. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) + if (getData(OpIdx, Lane).APO) + ++CntTrue; + unsigned CntFalse = NumOperands - CntTrue; + return std::max(CntTrue, CntFalse); + } + + /// Go through the instructions in VL and append their operands. + void appendOperandsOfVL(ArrayRef<Value *> VL) { + assert(!VL.empty() && "Bad VL"); + assert((empty() || VL.size() == getNumLanes()) && + "Expected same number of lanes"); + assert(isa<Instruction>(VL[0]) && "Expected instruction"); + unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); + OpsVec.resize(NumOperands); + unsigned NumLanes = VL.size(); + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + OpsVec[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + assert(isa<Instruction>(VL[Lane]) && "Expected instruction"); + // Our tree has just 3 nodes: the root and two operands. + // It is therefore trivial to get the APO. 
We only need to check the + // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or + // RHS operand. The LHS operand of both add and sub is never attached + // to an inverse operation in the linearized form, therefore its APO + // is false. The RHS's APO is true only if VL[Lane] is an inverse operation. + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); + bool APO = (OpIdx == 0) ? false : IsInverseOperation; + OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), + APO, false}; + } + } + } + + /// \returns the number of operands. + unsigned getNumOperands() const { return OpsVec.size(); } + + /// \returns the number of lanes. + unsigned getNumLanes() const { return OpsVec[0].size(); } + + /// \returns the operand value at \p OpIdx and \p Lane. + Value *getValue(unsigned OpIdx, unsigned Lane) const { + return getData(OpIdx, Lane).V; + } + /// \returns true if the data structure is empty. + bool empty() const { return OpsVec.empty(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + /// \Returns true if there are enough operands identical to \p Op to fill + /// the whole vector. + /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow. + bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { + bool OpAPO = getData(OpIdx, Lane).APO; + for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { + if (Ln == Lane) + continue; + // This is set to true if we found a candidate for broadcast at Lane. + bool FoundCandidate = false; + for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { + OperandData &Data = getData(OpI, Ln); + if (Data.APO != OpAPO || Data.IsUsed) + continue; + if (Data.V == Op) { + FoundCandidate = true; + Data.IsUsed = true; + break; + } + } + if (!FoundCandidate) + return false; + } + return true; + } + + public: + /// Initialize with all the operands of the instruction vector \p RootVL. + VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, + ScalarEvolution &SE) + : DL(DL), SE(SE) { + // Append all the operands of RootVL. + appendOperandsOfVL(RootVL); + } + + /// \Returns a value vector with the operands across all lanes for the + /// operand at \p OpIdx. + ValueList getVL(unsigned OpIdx) const { + ValueList OpVL(OpsVec[OpIdx].size()); + assert(OpsVec[OpIdx].size() == getNumLanes() && + "Expected same num of lanes across all operands"); + for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) + OpVL[Lane] = OpsVec[OpIdx][Lane].V; + return OpVL; + } + + // Performs operand reordering for 2 or more operands. + // The original operands are in OrigOps[OpIdx][Lane]. + // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. + void reorder() { + unsigned NumOperands = getNumOperands(); + unsigned NumLanes = getNumLanes(); + // Each operand has its own mode. We are using this mode to help us select + // the instructions for each lane, so that they match best with the ones + // we have selected so far. + SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); + + // This is a greedy single-pass algorithm. We are going over each lane + // once and deciding on the best order right away with no back-tracking. + // However, in order to increase its effectiveness, we start with the lane + // that has operands that can move the least.
For example, given the + // following lanes: + // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd + // Lane 1 : A[1] = C[1] - B[1] // Visited 1st + // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd + // Lane 3 : A[3] = C[3] - B[3] // Visited 4th + // we will start at Lane 1, since the operands of the subtraction cannot + // be reordered. Then we will visit the rest of the lanes in a circular + // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. + + // Find the first lane that we will start our search from. + unsigned FirstLane = getBestLaneToStartReordering(); + + // Initialize the modes. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + Value *OpLane0 = getValue(OpIdx, FirstLane); + // Keep track if we have instructions with all the same opcode on one + // side. + if (isa<LoadInst>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Load; + else if (isa<Instruction>(OpLane0)) { + // Check if OpLane0 should be broadcast. + if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + ReorderingModes[OpIdx] = ReorderingMode::Opcode; + } + else if (isa<Constant>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Constant; + else if (isa<Argument>(OpLane0)) + // Our best hope is a Splat. It may save some cost in some cases. + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + // NOTE: This should be unreachable. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + } + + // If the initial strategy fails for any of the operand indexes, then we + // perform reordering again in a second pass. This helps avoid assigning + // high priority to the failed strategy, and should improve reordering for + // the non-failed operand indexes. + for (int Pass = 0; Pass != 2; ++Pass) { + // Skip the second pass if the first pass did not fail. + bool StrategyFailed = false; + // Mark all operand data as free to use. + clearUsed(); + // We keep the original operand order for the FirstLane, so reorder the + // rest of the lanes. We are visiting the nodes in a circular fashion, + // using FirstLane as the center point and increasing the radius + // distance. + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { + // Visit the lane on the right and then the lane on the left. + for (int Direction : {+1, -1}) { + int Lane = FirstLane + Direction * Distance; + if (Lane < 0 || Lane >= (int)NumLanes) + continue; + int LastLane = Lane - Direction; + assert(LastLane >= 0 && LastLane < (int)NumLanes && + "Out of bounds"); + // Look for a good match for each operand. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + // Search for the operand that matches SortedOps[OpIdx][Lane-1]. + Optional<unsigned> BestIdx = + getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + // By not selecting a value, we allow the operands that follow to + // select a better matching value. We will get a non-null value in + // the next run of getBestOperand(). + if (BestIdx) { + // Swap the current operand with the one returned by + // getBestOperand(). + swap(OpIdx, BestIdx.getValue(), Lane); + } else { + // We failed to find a best operand, set mode to 'Failed'. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + // Enable the second pass. + StrategyFailed = true; + } + } + } + } + // Skip second pass if the strategy did not fail. 
+ if (!StrategyFailed) + break; + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { + switch (RMode) { + case ReorderingMode::Load: + return "Load"; + case ReorderingMode::Opcode: + return "Opcode"; + case ReorderingMode::Constant: + return "Constant"; + case ReorderingMode::Splat: + return "Splat"; + case ReorderingMode::Failed: + return "Failed"; + } + llvm_unreachable("Unimplemented Reordering Type"); + } + + LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, + raw_ostream &OS) { + return OS << getModeStr(RMode); + } + + /// Debug print. + LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { + printMode(RMode, dbgs()); + } + + friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { + return printMode(RMode, OS); + } + + LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { + const unsigned Indent = 2; + unsigned Cnt = 0; + for (const OperandDataVec &OpDataVec : OpsVec) { + OS << "Operand " << Cnt++ << "\n"; + for (const OperandData &OpData : OpDataVec) { + OS.indent(Indent) << "{"; + if (Value *V = OpData.V) + OS << *V; + else + OS << "null"; + OS << ", APO:" << OpData.APO << "}\n"; + } + OS << "\n"; + } + return OS; + } + + /// Debug print. + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif + }; + +private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -613,7 +1125,8 @@ private: int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, + const EdgeInfo &EI); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -631,12 +1144,12 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices); + int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef<Value *> VL); + int getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle @@ -648,22 +1161,18 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree(); + bool isFullyVectorizableTinyTree() const; - /// \reorder commutative operands in alt shuffle if they result in - /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); - - /// \reorder commutative operands to get better probability of + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); + static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE); struct TreeEntry { - TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {} + using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; + TreeEntry(VecTreeTy &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -696,20 +1205,103 @@ private: /// to be a pointer and needs to be able to initialize the child iterator. /// Thus we need a reference back to the container to translate the indices /// to entries. - std::vector<TreeEntry> &Container; + VecTreeTy &Container; /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. - SmallVector<int, 1> UserTreeIndices; + SmallVector<EdgeInfo, 1> UserTreeIndices; + + /// The index of this treeEntry in VectorizableTree. + int Idx = -1; + + private: + /// The operands of each instruction in each lane Operands[op_index][lane]. + /// Note: This helps avoid the replication of the code that performs the + /// reordering of operands during buildTree_rec() and vectorizeTree(). + SmallVector<ValueList, 2> Operands; + + public: + /// Set this bundle's \p OpIdx'th operand to \p OpVL. + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (Operands.size() < OpIdx + 1) + Operands.resize(OpIdx + 1); + assert(Operands[OpIdx].size() == 0 && "Already resized?"); + Operands[OpIdx].resize(Scalars.size()); + for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) + Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) + ? OpVL[ReuseShuffleIndices[Lane]] + : OpVL[Lane]; + } + + /// If there is a user TreeEntry, then set its operand. + void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, + ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (UserTreeIdx.UserTE) + UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, + ReuseShuffleIndices); + } + + /// \returns the \p OpIdx operand of this TreeEntry. + ValueList &getOperand(unsigned OpIdx) { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + + /// \return the single \p OpIdx operand. + Value *getSingleOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + assert(!Operands[OpIdx].empty() && "No operand available"); + return Operands[OpIdx][0]; + } + +#ifndef NDEBUG + /// Debug printer. 
+ LLVM_DUMP_METHOD void dump() const { + dbgs() << Idx << ".\n"; + for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { + dbgs() << "Operand " << OpI << ":\n"; + for (const Value *V : Operands[OpI]) + dbgs().indent(2) << *V << "\n"; + } + dbgs() << "Scalars: \n"; + for (Value *V : Scalars) + dbgs().indent(2) << *V << "\n"; + dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "VectorizedValue: "; + if (VectorizedValue) + dbgs() << *VectorizedValue; + else + dbgs() << "NULL"; + dbgs() << "\n"; + dbgs() << "ReuseShuffleIndices: "; + if (ReuseShuffleIndices.empty()) + dbgs() << "Empty"; + else + for (unsigned Idx : ReuseShuffleIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "ReorderIndices: "; + for (unsigned Idx : ReorderIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "UserTreeIndices: "; + for (const auto &EInfo : UserTreeIndices) + dbgs() << EInfo << ", "; + dbgs() << "\n"; + } +#endif }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.emplace_back(VectorizableTree); - int idx = VectorizableTree.size() - 1; - TreeEntry *Last = &VectorizableTree[idx]; + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + TreeEntry *Last = VectorizableTree.back().get(); + Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -718,25 +1310,44 @@ private: if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + ScalarToTreeEntry[VL[i]] = Last->Idx; } } else { MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx >= 0) + if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - UserTreeIdx = idx; + + Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); + return Last; } /// -- Vectorization State -- /// Holds all of the tree entries. - std::vector<TreeEntry> VectorizableTree; + TreeEntry::VecTreeTy VectorizableTree; + +#ifndef NDEBUG + /// Debug printer. + LLVM_DUMP_METHOD void dumpVectorizableTree() const { + for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { + VectorizableTree[Id]->dump(); + dbgs() << "\n"; + } + } +#endif TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + return VectorizableTree[I->second].get(); + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return VectorizableTree[I->second].get(); return nullptr; } @@ -1246,21 +1857,25 @@ template <> struct GraphTraits<BoUpSLP *> { /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; + using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; + + /// Add the VectorizableTree to the index iterator to be able to return + /// TreeEntry pointers.
struct ChildIteratorType - : public iterator_adaptor_base<ChildIteratorType, - SmallVector<int, 1>::iterator> { - std::vector<TreeEntry> &VectorizableTree; + : public iterator_adaptor_base< + ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { + ContainerTy &VectorizableTree; - ChildIteratorType(SmallVector<int, 1>::iterator W, - std::vector<TreeEntry> &VT) + ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, + ContainerTy &VT) : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} - NodeRef operator*() { return &VectorizableTree[*I]; } + NodeRef operator*() { return I->UserTE; } }; - static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } + static NodeRef getEntryNode(BoUpSLP &R) { + return R.VectorizableTree[0].get(); + } static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; @@ -1272,7 +1887,19 @@ template <> struct GraphTraits<BoUpSLP *> { /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. - using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>; + class nodes_iterator { + using ItTy = ContainerTy::iterator; + ItTy It; + + public: + nodes_iterator(const ItTy &It2) : It(It2) {} + NodeRef operator*() { return It->get(); } + nodes_iterator operator++() { + ++It; + return *this; + } + bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } + }; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); @@ -1331,11 +1958,11 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -1393,7 +2020,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, - int UserTreeIdx) { + const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); @@ -1450,6 +2077,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); + E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } @@ -1468,8 +2096,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. + // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 
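// (Those scalars belong to a horizontal reduction that the caller will match
// and rewrite itself; folding them into the tree would vectorize values the
// reduction code still expects to see as scalars.)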
for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i])) { + if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; @@ -1548,7 +2177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +2187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1571,6 +2200,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } if (!CurrentOrder.empty()) { @@ -1588,6 +2222,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); @@ -1693,7 +2332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +2341,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1710,10 +2349,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FCmp: { // Check that all of the compares have the same predicate. 
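// (With the change below, a compare whose predicate is the swap of P0 is also
// accepted; its operands are commuted when the operand lists are built, so the
// whole bundle ends up using the single predicate P0.)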
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0 || + if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -1723,20 +2363,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + ValueList Left, Right; + if (cast<CmpInst>(VL0)->isCommutative()) { + // Commutative predicate - collect + sort operands of the instructions + // so that each side is more likely to have the same opcode. + assert(P0 == SwapP0 && "Commutative Predicate mismatch"); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + } else { + // Collect operands - commute if it uses the swapped predicate. + for (Value *V : VL) { + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + if (Cmp->getPredicate() != P0) + std::swap(LHS, RHS); + Left.push_back(LHS); + Right.push_back(RHS); + } } + + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } case Instruction::Select: + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -1754,17 +2408,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + case Instruction::Xor: { + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1774,10 +2428,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
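// (Nested indexing here means GEPs with more than a single index operand,
// e.g. a struct-field access such as &S[i].f; bundles containing such GEPs
// are gathered rather than vectorized.)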
for (unsigned j = 0; j < VL.size(); ++j) { @@ -1815,7 +2469,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +2477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1837,14 +2491,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } case Instruction::Call: { @@ -1860,9 +2514,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } Function *Int = CI->getCalledFunction(); - Value *A1I = nullptr; - if (hasVectorInstrinsicScalarOpd(ID, 1)) - A1I = CI->getArgOperand(1); + unsigned NumArgs = CI->getNumArgOperands(); + SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); + for (unsigned j = 0; j != NumArgs; ++j) + if (hasVectorInstrinsicScalarOpd(ID, j)) + ScalarArgs[j] = CI->getArgOperand(j); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast<CallInst>(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || @@ -1874,16 +2530,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, << "\n"); return; } - // ctlz,cttz and powi are special intrinsics whose second argument - // should be same in order for them to be vectorized. - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - Value *A1J = CI2->getArgOperand(1); - if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI - << " argument " << A1I << "!=" << A1J << "\n"); - return; + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized. + for (unsigned j = 0; j != NumArgs; ++j) { + if (hasVectorInstrinsicScalarOpd(ID, j)) { + Value *A1J = CI2->getArgOperand(j); + if (ScalarArgs[j] != A1J) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument " << ScalarArgs[j] << "!=" << A1J + << "\n"); + return; + } } } // Verify that the bundle operands are identical between the two calls. @@ -1899,7 +2558,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
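The generalized check in the Call case above drops the old assumption that only argument 1 (as in ctlz, cttz and powi) can be scalar: every argument position flagged by hasVectorInstrinsicScalarOpd (spelled that way in the tree) must hold the identical value in all lanes, since the single vector intrinsic call has only one scalar slot per such operand. Restated as a self-contained sketch (allSameScalarArgs is a hypothetical helper, not part of this patch):

static bool allSameScalarArgs(Intrinsic::ID ID, ArrayRef<Value *> VL) {
  auto *CI0 = cast<CallInst>(VL[0]);
  for (unsigned Arg = 0, E = CI0->getNumArgOperands(); Arg != E; ++Arg) {
    if (!hasVectorInstrinsicScalarOpd(ID, Arg))
      continue;                  // Vector-widened operands may differ per lane.
    for (Value *V : VL.slice(1)) // Compare every other lane against lane 0.
      if (cast<CallInst>(V)->getArgOperand(Arg) != CI0->getArgOperand(Arg))
        return false;            // E.g., llvm.powi exponents that disagree.
  }
  return true;
}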
@@ -1907,11 +2566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } - case Instruction::ShuffleVector: + case Instruction::ShuffleVector: { // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. if (!S.isAltShuffle()) { @@ -1920,15 +2579,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1938,10 +2597,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } default: BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -2223,6 +2882,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -2260,7 +2920,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ConstantInt *CInt0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { const Instruction *I = cast<Instruction>(VL[i]); - ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); + unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0; + ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; @@ -2413,31 +3074,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } } -bool BoUpSLP::isFullyVectorizableTinyTree() { +bool BoUpSLP::isFullyVectorizableTinyTree() const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) + if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0].NeedToGather && - (allConstant(VectorizableTree[1].Scalars) || - isSplat(VectorizableTree[1].Scalars))) + if (!VectorizableTree[0]->NeedToGather && + (allConstant(VectorizableTree[1]->Scalars) || + isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. 
- if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) return false; return true; } -bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -2457,19 +3118,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { return true; } -int BoUpSLP::getSpillCost() { +int BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); int Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; - for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]); + for (const auto &TEPtr : VectorizableTree) { + Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; @@ -2494,6 +3155,7 @@ int BoUpSLP::getSpillCost() { }); // Now find the sequence of instructions between PrevInst and Inst. + unsigned NumCalls = 0; BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), PrevInstIt = PrevInst->getIterator().getReverse(); @@ -2506,16 +3168,19 @@ int BoUpSLP::getSpillCost() { // Debug information doesn't impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && - &*PrevInstIt != PrevInst) { - SmallVector<Type*, 4> V; - for (auto *II : LiveValues) - V.push_back(VectorType::get(II->getType(), BundleWidth)); - Cost += TTI->getCostOfKeepingLiveOverCall(V); - } + &*PrevInstIt != PrevInst) + NumCalls++; ++PrevInstIt; } + if (NumCalls) { + SmallVector<Type*, 4> V; + for (auto *II : LiveValues) + V.push_back(VectorType::get(II->getType(), BundleWidth)); + Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); + } + PrevInst = Inst; } @@ -2527,10 +3192,10 @@ int BoUpSLP::getTreeCost() { LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = VectorizableTree[I]; + TreeEntry &TE = *VectorizableTree[I].get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -2545,10 +3210,11 @@ int BoUpSLP::getTreeCost() { // existing heuristics based on tree size may yield different results. // if (TE.NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), [TE](TreeEntry &Entry) { - return Entry.NeedToGather && Entry.isSame(TE.Scalars); - })) + std::any_of( + std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -2575,7 +3241,7 @@ int BoUpSLP::getTreeCost() { // extend the extracted value back to the original type.
Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2608,17 +3274,17 @@ int BoUpSLP::getTreeCost() { } int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet<unsigned> &ShuffledIndices) { + const DenseSet<unsigned> &ShuffledIndices) const { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) if (!ShuffledIndices.count(i)) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -2638,221 +3304,19 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { return getGatherCost(VecTy, ShuffledElements); } -// Reorder commutative operations in alternate shuffle if the resulting vectors -// are consecutive loads. This would allow us to vectorize the tree. -// If we have something like- -// load a[0] - load b[0] -// load b[1] + load a[1] -// load a[2] - load b[2] -// load a[3] + load b[3] -// Reordering the second load b[1] load a[1] would allow us to vectorize this -// code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast<Instruction>(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } - - // Reorder if we have a commutative operation and consecutive access - // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - } -} - -// Return true if I should be commuted before adding it's left and right -// operands to the arrays Left and Right. 
-// -// The vectorizer is trying to either have all elements one side being -// instruction with the same opcode to enable further vectorization, or having -// a splat to lower the vectorizing cost. -static bool shouldReorderOperands( - int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left, - ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, - bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); - // If we have "SplatRight", try to see if commuting is needed to preserve it. - if (SplatRight) { - if (VRight == Right[i - 1]) - // Preserve SplatRight - return false; - if (VLeft == Right[i - 1]) { - // Commuting would preserve SplatRight, but we don't want to break - // SplatLeft either, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (SplatLeft && VLeft == Left[i - 1]) - return false; - return true; - } - } - // Symmetrically handle Right side. - if (SplatLeft) { - if (VLeft == Left[i - 1]) - // Preserve SplatLeft - return false; - if (VRight == Left[i - 1]) - return true; - } - - Instruction *ILeft = dyn_cast<Instruction>(VLeft); - Instruction *IRight = dyn_cast<Instruction>(VRight); - - // If we have "AllSameOpcodeRight", try to see if the left operands preserves - // it and not the right, in this case we want to commute. - if (AllSameOpcodeRight) { - unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); - if (IRight && RightPrevOpcode == IRight->getOpcode()) - // Do not commute, a match on the right preserves AllSameOpcodeRight - return false; - if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { - // We have a match and may want to commute, but first check if there is - // not also a match on the existing operands on the Left to preserve - // AllSameOpcodeLeft, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (AllSameOpcodeLeft && ILeft && - cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) - return false; - return true; - } - } - // Symmetrically handle Left side. - if (AllSameOpcodeLeft) { - unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); - if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) - return false; - if (IRight && LeftPrevOpcode == IRight->getOpcode()) - return true; - } - return false; -} - -void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - if (!VL.empty()) { - // Peel the first iteration out of the loop since there's nothing - // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast<Instruction>(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) - // Favor having instruction to the right. FIXME: why? - std::swap(VLeft, VRight); - Left.push_back(VLeft); - Right.push_back(VRight); - } - - // Keep track if we have instructions with all the same opcode on one side. - bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); - bool AllSameOpcodeRight = isa<Instruction>(Right[0]); - // Keep track if we have one side with all the same value (broadcast). 
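  // (The SplatLeft/SplatRight flags below start out true and are and-ed down
  // lane by lane; together with the AllSameOpcode* flags this is the
  // hand-rolled bookkeeping that the new VLOperands-based reordering replaces.)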
- bool SplatLeft = true; - bool SplatRight = true; - - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - assert(((I->getOpcode() == Opcode && I->isCommutative()) || - (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && - "Can only process commutative instruction"); - // Commute to favor either a splat or maximizing having the same opcodes on - // one side. - Value *VLeft; - Value *VRight; - if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, - VRight)) { - Left.push_back(VRight); - Right.push_back(VLeft); - } else { - Left.push_back(VLeft); - Right.push_back(VRight); - } - // Update Splat* and AllSameOpcode* after the insertion. - SplatRight = SplatRight && (Right[i - 1] == Right[i]); - SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); - AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && - (cast<Instruction>(Left[i - 1])->getOpcode() == - cast<Instruction>(Left[i])->getOpcode()); - AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && - (cast<Instruction>(Right[i - 1])->getOpcode() == - cast<Instruction>(Right[i])->getOpcode()); - } - - // If one operand end up being broadcast, return this operand order. - if (SplatRight || SplatLeft) +// Perform operand reordering on the instructions in VL and return the reordered +// operands in Left and Right. +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, const DataLayout &DL, + ScalarEvolution &SE) { + if (VL.empty()) return; - - // Finally check if we can get longer vectorizable chain by reordering - // without breaking the good operand order detected above. - // E.g. If we have something like- - // load a[0] load b[0] - // load b[1] load a[1] - // load a[2] load b[2] - // load a[3] load b[3] - // Reordering the second load b[1] load a[1] would allow us to vectorize - // this code and we still retain AllSameOpcode property. - // FIXME: This load reordering might break AllSameOpcode in some rare cases - // such as- - // add a[0],c[0] load b[0] - // add a[1],c[2] load b[1] - // b[2] load b[2] - // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - // else unchanged - } + VLOperands Ops(VL, DL, SE); + // Reorder the operands in place. + Ops.reorder(); + Left = Ops.getVL(0); + Right = Ops.getVL(1); } void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, @@ -3082,13 +3546,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { continue; } - // Prepare the operand vector. 
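  // (This pattern of collecting per-lane operands into a ValueList is deleted
  // throughout vectorizeTree: buildTree_rec now records each bundle's operands
  // on its TreeEntry, so codegen simply does
  //   Value *Vec = vectorizeTree(E->getOperand(i));
  // instead of re-walking E->Scalars.)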
- for (Value *V : E->Scalars) - Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); - Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(E->getOperand(i)); NewPhi->addIncoming(Vec, IBB); } @@ -3099,7 +3559,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { if (!E->NeedToGather) { - Value *V = VL0->getOperand(0); + Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -3132,11 +3592,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractValue: { if (!E->NeedToGather) { - LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); + LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3177,13 +3637,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3202,16 +3658,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast<Instruction>(V)->getOperand(0)); - RHSV.push_back(cast<Instruction>(V)->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(E->getOperand(0)); + Value *R = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3235,31 +3685,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast<Instruction>(V)->getOperand(0)); - TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); - FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); + setInsertPointAfterBundle(E->Scalars, S); + + Value *Cond = vectorizeTree(E->getOperand(0)); + Value *True = vectorizeTree(E->getOperand(1)); + Value *False = vectorizeTree(E->getOperand(2)); + + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; } + Value *V = Builder.CreateSelect(Cond, True, False); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; + ++NumVectorInstructions; + return V; + } + case Instruction::FNeg: { setInsertPointAfterBundle(E->Scalars, S); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Op = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << 
"SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *V = Builder.CreateSelect(Cond, True, False); + Value *V = Builder.CreateUnOp( + static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + propagateIRFlags(V, E->Scalars, VL0); + if (auto *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; + return V; } case Instruction::Add: @@ -3280,21 +3748,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - ValueList LHSVL, RHSVL; - if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast<Instruction>(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(E->getOperand(0)); + Value *RHS = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3341,7 +3798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecPtr); + LI = Builder.CreateLoad(VecTy, VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } @@ -3367,13 +3824,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3400,20 +3853,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, S); - ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); - - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(E->getOperand(0)); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); - - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); OpVecs.push_back(OpVec); } @@ -3443,20 +3888,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; - // ctlz,cttz and powi are special intrinsics whose second argument is - // a scalar. This argument should not be vectorized. - if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + // Some intrinsics have scalar arguments. This argument should not be + // vectorized. 
+ if (hasVectorInstrinsicScalarOpd(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast<CallInst>(V); - OpVL.push_back(CEI->getArgOperand(j)); - } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3485,7 +3926,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - ValueList LHSVL, RHSVL; assert(S.isAltShuffle() && ((Instruction::isBinaryOp(S.getOpcode()) && Instruction::isBinaryOp(S.getAltOpcode())) || @@ -3495,16 +3935,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS, *RHS; if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + LHS = vectorizeTree(E->getOperand(0)); + RHS = vectorizeTree(E->getOperand(1)); } else { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + LHS = vectorizeTree(E->getOperand(0)); } if (E->VectorizedValue) { @@ -3578,20 +4014,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast<Instruction>(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0].Scalars.size(); + auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0].VectorizedValue = Trunc; + VectorizableTree[0]->VectorizedValue = Trunc; } LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() @@ -3687,8 +4123,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } // For each vectorized value: - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -3721,7 +4157,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.ClearInsertionPoint(); - return VectorizableTree[0].VectorizedValue; + return VectorizableTree[0]->VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { @@ -3767,10 +4203,10 @@ void BoUpSLP::optimizeGatherSequence() { // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. 
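  // (llvm::stable_sort from STLExtras.h is the range-based wrapper around
  // std::stable_sort; the comparator is unchanged, so the replacement below
  // is purely a readability cleanup.)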
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const DomTreeNode *A, const DomTreeNode *B) { - return DT->properlyDominates(A, B); - }); + llvm::stable_sort(CSEWorkList, + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the @@ -3989,7 +4425,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - UpIter++; + ++UpIter; } if (DownIter != LowerEnd) { if (&*DownIter == I) { @@ -4003,7 +4439,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - DownIter++; + ++DownIter; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); @@ -4253,7 +4689,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) { +unsigned BoUpSLP::getVectorElementSize(Value *V) const { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast<StoreInst>(V)) @@ -4390,7 +4826,7 @@ void BoUpSLP::computeMinimumValueSizes() { return; // We only attempt to truncate integer expressions. - auto &TreeRoot = VectorizableTree[0].Scalars; + auto &TreeRoot = VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); if (!TreeRootIT) return; @@ -4411,8 +4847,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. - for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto &EntryPtr : VectorizableTree) + Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. @@ -4746,38 +5182,29 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - SmallVector<unsigned, 16> IndexQueue; - unsigned E = Stores.size(); - IndexQueue.resize(E - 1); - for (unsigned I = E; I > 0; --I) { - unsigned Idx = I - 1; - // If a store has multiple consecutive store candidates, search Stores - // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. 
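  // (Worked example of this probe order with E = 5 stores and Idx = 2:
  // Offset 1 visits indices 1 and 3, Offset 2 visits 0 and 4. The rewrite
  // below folds the probe into a FindConsecutiveAccess lambda and breaks out
  // at the first consecutive pair instead of materializing an IndexQueue.)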
- unsigned Offset = 1; - unsigned Cnt = 0; - for (unsigned J = 0; J < E - 1; ++J, ++Offset) { - if (Idx >= Offset) { - IndexQueue[Cnt] = Idx - Offset; - ++Cnt; - } - if (Idx + Offset < E) { - IndexQueue[Cnt] = Idx + Offset; - ++Cnt; - } - } + auto &&FindConsecutiveAccess = + [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; - for (auto K : IndexQueue) { - if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { Tails.insert(Stores[Idx]); Heads.insert(Stores[K]); ConsecutiveChain[Stores[K]] = Stores[Idx]; + return true; + }; + + // Do a quadratic search on all of the given stores in reverse order and find + // all of the pairs of stores that follow each other. + int E = Stores.size(); + for (int Idx = E - 1; Idx >= 0; --Idx) { + // If a store has multiple consecutive store candidates, search according + // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || + (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; - } - } } // For stores that start but don't end a link in the chain: @@ -5740,6 +6167,9 @@ public: unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); Value *VectorizedTree = nullptr; + + // FIXME: Fast-math-flags should be set based on the instructions in the + // reduction (not all of 'fast' are required). IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); FastMathFlags Unsafe; Unsafe.setFast(); @@ -5929,10 +6359,14 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - if (!IsPairwiseReduction) + if (!IsPairwiseReduction) { + // FIXME: The builder should use an FMF guard. It should not be hard-coded + // to 'fast'. + assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF"); return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, ReductionData.getFlags(), ReductionOps.back()); + } Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -6256,7 +6690,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Sort by type. - std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); + llvm::stable_sort(Incoming, PhiTypeSorterFunc); // Try to vectorize elements base on their type. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), @@ -6297,7 +6731,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<WeakVH, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // We may go through BB multiple times so skip the one we have checked. 
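  // (insert(&*it).second is false when the instruction is already in
  // VisitedInstrs, i.e. it was handled on an earlier pass over this block.)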
if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h index 15d38ac9c84c..0ca6a6b93cfd 100644 --- a/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -1,9 +1,8 @@ //===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -30,9 +29,6 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; - /// Target Transform Info. - const TargetTransformInfo *TTI; - /// The legality analysis. LoopVectorizationLegality *Legal; @@ -105,11 +101,9 @@ public: public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, VPBuilder &Builder) - : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - Builder(Builder) {} + : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {} /// Check if a recipe can be create for \p I withing the given VF \p Range. /// If a recipe can be created, it adds it to \p VPBB. diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp index 05a5400beb4e..517d759d7bfc 100644 --- a/lib/Transforms/Vectorize/VPlan.cpp +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -1,9 +1,8 @@ //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -374,10 +373,9 @@ void VPlan::execute(VPTransformState *State) { BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - BasicBlock *VectorLatchBB = VectorHeaderBB; // 1. Make room to generate basic-blocks inside loop body if needed. - VectorLatchBB = VectorHeaderBB->splitBasicBlock( + BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); Loop *L = State->LI->getLoopFor(VectorHeaderBB); L->addBasicBlockToLoop(VectorLatchBB, *State->LI); @@ -561,6 +559,19 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { bumpIndent(1); OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; bumpIndent(1); + + // Dump the block predicate. 
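  // (In the resulting DOT dump the predicate prints either as its defining
  // VPInstruction followed by the name of that instruction's parent block,
  // or as a plain operand when the predicate is not a VPInstruction.)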
+ const VPValue *Pred = BasicBlock->getPredicate(); + if (Pred) { + OS << " +\n" << Indent << " \"BlockPredicate: "; + if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { + PredI->printAsOperand(OS); + OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) + << ")\\l\""; + } else + Pred->printAsOperand(OS); + } + for (const VPRecipeBase &Recipe : *BasicBlock) Recipe.print(OS, Indent); diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 5c1b4a83c30e..8a06412ad590 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -1,9 +1,8 @@ //===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -353,6 +352,9 @@ private: /// Successor selector, null for zero or single successor blocks. VPValue *CondBit = nullptr; + /// Current block predicate - null if the block does not need a predicate. + VPValue *Predicate = nullptr; + /// Add \p Successor as the last successor to this block. void appendSuccessor(VPBlockBase *Successor) { assert(Successor && "Cannot add nullptr successor!"); @@ -491,6 +493,12 @@ public: void setCondBit(VPValue *CV) { CondBit = CV; } + VPValue *getPredicate() { return Predicate; } + + const VPValue *getPredicate() const { return Predicate; } + + void setPredicate(VPValue *Pred) { Predicate = Pred; } + /// Set a given VPBlockBase \p Successor as the single successor of this /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. /// This VPBlockBase must have no successors. @@ -521,6 +529,15 @@ public: appendPredecessor(Pred); } + /// Remove all the predecessor of this block. + void clearPredecessors() { Predecessors.clear(); } + + /// Remove all the successors of this block and set to null its condition bit + void clearSuccessors() { + Successors.clear(); + CondBit = nullptr; + } + /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(struct VPTransformState *State) = 0; @@ -1491,6 +1508,41 @@ public: From->removeSuccessor(To); To->removePredecessor(From); } + + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. + static bool isBackEdge(const VPBlockBase *FromBlock, + const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { + assert(FromBlock->getParent() == ToBlock->getParent() && + FromBlock->getParent() && "Must be in same region"); + const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock); + const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock); + if (!FromLoop || !ToLoop || FromLoop != ToLoop) + return false; + + // A back-edge is a branch from the loop latch to its header. + return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader(); + } + + /// Returns true if \p Block is a loop latch + static bool blockIsLoopLatch(const VPBlockBase *Block, + const VPLoopInfo *VPLInfo) { + if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block)) + return ParentVPL->isLoopLatch(Block); + + return false; + } + + /// Count and return the number of succesors of \p PredBlock excluding any + /// backedges. 
+ static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock, + VPLoopInfo *VPLI) { + unsigned Count = 0; + for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) { + if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI)) + Count++; + } + return Count; + } }; class VPInterleavedAccessInfo { diff --git a/lib/Transforms/Vectorize/VPlanDominatorTree.h b/lib/Transforms/Vectorize/VPlanDominatorTree.h index 1b81097b6d31..19f5d2c00c60 100644 --- a/lib/Transforms/Vectorize/VPlanDominatorTree.h +++ b/lib/Transforms/Vectorize/VPlanDominatorTree.h @@ -1,9 +1,8 @@ //===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 0f42694e193b..df96f67288f1 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -1,9 +1,8 @@ //===-- VPlanHCFGBuilder.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -64,7 +63,9 @@ private: void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); VPBasicBlock *getOrCreateVPBB(BasicBlock *BB); +#ifndef NDEBUG bool isExternalDef(Value *Val); +#endif VPValue *getOrCreateVPOperand(Value *IRVal); void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); @@ -119,6 +120,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { return VPBB; } +#ifndef NDEBUG // Return true if \p Val is considered an external definition. An external // definition is either: // 1. A Value that is not an Instruction. This will be refined in the future. @@ -154,6 +156,7 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) { // Check whether Instruction definition is in loop body. return !TheLoop->contains(Inst); } +#endif // Create a new VPValue or retrieve an existing one for the Instruction's // operand \p IRVal. This function must only be used to create/retrieve VPValues diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 3f11dcb5164d..238ee7e6347c 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -1,9 +1,8 @@ //===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp index 3ad7fc7e7b96..7ed7d21b6caa 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -1,9 +1,8 @@ //===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h index ae549c6871b3..79a23c33184f 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h @@ -1,9 +1,8 @@ //===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanLoopInfo.h b/lib/Transforms/Vectorize/VPlanLoopInfo.h index 5c2485fc2145..5208f2d58e2b 100644 --- a/lib/Transforms/Vectorize/VPlanLoopInfo.h +++ b/lib/Transforms/Vectorize/VPlanLoopInfo.h @@ -1,9 +1,8 @@ //===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanPredicator.cpp b/lib/Transforms/Vectorize/VPlanPredicator.cpp new file mode 100644 index 000000000000..7a80f3ff80a5 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -0,0 +1,248 @@ +//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VPlanPredicator class which contains the public +/// interfaces to predicate and linearize the VPlan region. 
+/// +//===----------------------------------------------------------------------===// + +#include "VPlanPredicator.h" +#include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "VPlanPredicator" + +using namespace llvm; + +// Generate VPInstructions at the beginning of CurrBB that calculate the +// predicate being propagated from PredBB to CurrBB depending on the edge type +// between them. For example if: +// i. PredBB is controlled by predicate %BP, and +// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition +// bit value %CBV then this function will generate the following two +// VPInstructions at the start of CurrBB: +// %IntermediateVal = not %CBV +// %FinalVal = and %BP %IntermediateVal +// It returns %FinalVal. +VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, + VPBasicBlock *CurrBB) { + VPValue *CBV = PredBB->getCondBit(); + + // Set the intermediate value - this is either 'CBV', or 'not CBV' + // depending on the edge type. + EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB); + VPValue *IntermediateVal = nullptr; + switch (ET) { + case EdgeType::TRUE_EDGE: + // CurrBB is the true successor of PredBB - nothing to do here. + IntermediateVal = CBV; + break; + + case EdgeType::FALSE_EDGE: + // CurrBB is the False successor of PredBB - compute not of CBV. + IntermediateVal = Builder.createNot(CBV); + break; + } + + // Now AND intermediate value with PredBB's block predicate if it has one. + VPValue *BP = PredBB->getPredicate(); + if (BP) + return Builder.createAnd(BP, IntermediateVal); + else + return IntermediateVal; +} + +// Generate a tree of ORs for all IncomingPredicates in WorkList. +// Note: This function destroys the original Worklist. +// +// P1 P2 P3 P4 P5 +// \ / \ / / +// OR1 OR2 / +// \ | / +// \ +/-+ +// \ / | +// OR3 | +// \ | +// OR4 <- Returns this +// | +// +// The algorithm uses a worklist of predicates as its main data structure. +// We pop a pair of values from the front (e.g. P1 and P2), generate an OR +// (in this example OR1), and push it back. In this example the worklist +// contains {P3, P4, P5, OR1}. +// The process iterates until we have only one element in the Worklist (OR4). +// The last element is the root predicate which is returned. +VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { + if (Worklist.empty()) + return nullptr; + + // The worklist initially contains all the leaf nodes. Initialize the tree + // using them. + while (Worklist.size() >= 2) { + // Pop a pair of values from the front. + VPValue *LHS = Worklist.front(); + Worklist.pop_front(); + VPValue *RHS = Worklist.front(); + Worklist.pop_front(); + + // Create an OR of these values. + VPValue *Or = Builder.createOr(LHS, RHS); + + // Push OR to the back of the worklist. + Worklist.push_back(Or); + } + + assert(Worklist.size() == 1 && "Expected 1 item in worklist"); + + // The root is the last node in the worklist. + VPValue *Root = Worklist.front(); + + // This root needs to replace the existing block predicate. This is done in + // the caller function. 
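+// (Continuing the worked example: after OR2 the worklist is {P5, OR1, OR2};
+// P5 and OR1 combine into OR3, leaving {OR2, OR3}, which combine into OR4,
+// the root returned below.)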
+ return Root; +} + +// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE +VPlanPredicator::EdgeType +VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock, + VPBlockBase *ToBlock) { + unsigned Count = 0; + for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) { + if (SuccBlock == ToBlock) { + assert(Count < 2 && "Switch not supported currently"); + return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE; + } + Count++; + } + + llvm_unreachable("Broken getEdgeTypeBetween"); +} + +// Generate all predicates needed for CurrBlock by going through its immediate +// predecessor blocks. +void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock, + VPRegionBlock *Region) { + // Blocks that dominate region exit inherit the predicate from the region. + // Return after setting the predicate. + if (VPDomTree.dominates(CurrBlock, Region->getExit())) { + VPValue *RegionBP = Region->getPredicate(); + CurrBlock->setPredicate(RegionBP); + return; + } + + // Collect all incoming predicates in a worklist. + std::list<VPValue *> IncomingPredicates; + + // Set the builder's insertion point to the top of the current BB + VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock()); + Builder.setInsertPoint(CurrBB, CurrBB->begin()); + + // For each predecessor, generate the VPInstructions required for + // computing 'BP AND (not) CBV" at the top of CurrBB. + // Collect the outcome of this calculation for all predecessors + // into IncomingPredicates. + for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) { + // Skip back-edges + if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI)) + continue; + + VPValue *IncomingPredicate = nullptr; + unsigned NumPredSuccsNoBE = + VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI); + + // If there is an unconditional branch to the currBB, then we don't create + // edge predicates. We use the predecessor's block predicate instead. + if (NumPredSuccsNoBE == 1) + IncomingPredicate = PredBlock->getPredicate(); + else if (NumPredSuccsNoBE == 2) { + // Emit recipes into CurrBlock if required + assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits"); + IncomingPredicate = + getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB); + } else + llvm_unreachable("FIXME: switch statement ?"); + + if (IncomingPredicate) + IncomingPredicates.push_back(IncomingPredicate); + } + + // Logically OR all incoming predicates by building the Predicate Tree. + VPValue *Predicate = genPredicateTree(IncomingPredicates); + + // Now update the block's predicate with the new one. + CurrBlock->setPredicate(Predicate); +} + +// Generate all predicates needed for Region. +void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { + VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry()); + ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock); + + // Generate edge predicates and append them to the block predicate. RPO is + // necessary since the predecessor blocks' block predicate needs to be set + // before the current block's block predicate can be computed. + for (VPBlockBase *Block : make_range(RPOT.begin(), RPOT.end())) { + // TODO: Handle nested regions once we start generating the same. + assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); + createOrPropagatePredicates(Block, Region); + } +} + +// Linearize the CFG within Region. +// TODO: Predication and linearization need RPOT for every region. +// This traversal is expensive. 
Since predication is not adding new +// blocks, we should be able to compute RPOT once in predication and +// reuse it here. This becomes even more important once we have nested +// regions. +void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { + ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); + VPBlockBase *PrevBlock = nullptr; + + for (VPBlockBase *CurrBlock : make_range(RPOT.begin(), RPOT.end())) { + // TODO: Handle nested regions once we start generating the same. + assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); + + // Linearize control flow by adding an unconditional edge between PrevBlock + // and CurrBlock skipping loop headers and latches to keep intact loop + // header predecessors and loop latch successors. + if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) && + !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) { + + LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->" + << CurrBlock->getName() << "\n"); + + PrevBlock->clearSuccessors(); + CurrBlock->clearPredecessors(); + VPBlockUtils::connectBlocks(PrevBlock, CurrBlock); + } + + PrevBlock = CurrBlock; + } +} + +// Entry point. The driver function for the predicator. +void VPlanPredicator::predicate(void) { + // Predicate the blocks within Region. + predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry())); + + // Linearlize the blocks with Region. + linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry())); +} + +VPlanPredicator::VPlanPredicator(VPlan &Plan) + : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) { + // FIXME: Predicator is currently computing the dominator information for the + // top region. Once we start storing dominator information in a VPRegionBlock, + // we can avoid this recalculation. + VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry()))); +} diff --git a/lib/Transforms/Vectorize/VPlanPredicator.h b/lib/Transforms/Vectorize/VPlanPredicator.h new file mode 100644 index 000000000000..692afd2978d5 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanPredicator.h @@ -0,0 +1,74 @@ +//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the VPlanPredicator class which contains the public +/// interfaces to predicate and linearize the VPlan region. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H + +#include "LoopVectorizationPlanner.h" +#include "VPlan.h" +#include "VPlanDominatorTree.h" + +namespace llvm { + +class VPlanPredicator { +private: + enum class EdgeType { + TRUE_EDGE, + FALSE_EDGE, + }; + + // VPlan being predicated. + VPlan &Plan; + + // VPLoopInfo for Plan's HCFG. + VPLoopInfo *VPLI; + + // Dominator tree for Plan's HCFG. + VPDominatorTree VPDomTree; + + // VPlan builder used to generate VPInstructions for block predicates. + VPBuilder Builder; + + /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if + /// \p ToBlock is either the unconditional successor or the conditional true + /// successor of \p FromBlock and FALSE_EDGE otherwise. 
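+  /// (The answer is derived purely from successor position: successor 0 of
+  /// \p FromBlock is the true edge and successor 1 the false edge; blocks
+  /// with more than two successors are asserted against.)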
+ EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock); + + /// Create and return VPValue corresponding to the predicate for the edge from + /// \p PredBB to \p CurrentBlock. + VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB); + + /// Generate and return the result of ORing all the predicate VPValues in \p + /// Worklist. + VPValue *genPredicateTree(std::list<VPValue *> &Worklist); + + /// Create or propagate predicate for \p CurrBlock in region \p Region using + /// predicate(s) of its predecessor(s) + void createOrPropagatePredicates(VPBlockBase *CurrBlock, + VPRegionBlock *Region); + + /// Predicate the CFG within \p Region. + void predicateRegionRec(VPRegionBlock *Region); + + /// Linearize the CFG within \p Region. + void linearizeRegionRec(VPRegionBlock *Region); + +public: + VPlanPredicator(VPlan &Plan); + + /// Predicate Plan's HCFG. + void predicate(void); +}; +} // end namespace llvm +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp index ad3a85a6f760..e5ab24e52df6 100644 --- a/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -1,9 +1,8 @@ //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// This file implements SLP analysis based on VPlan. The analysis is based on diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h index b473579b699f..7b6c228c229e 100644 --- a/lib/Transforms/Vectorize/VPlanValue.h +++ b/lib/Transforms/Vectorize/VPlanValue.h @@ -1,9 +1,8 @@ //===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanVerifier.cpp b/lib/Transforms/Vectorize/VPlanVerifier.cpp index 054bed4e177f..394b1b93113b 100644 --- a/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -1,9 +1,8 @@ //===-- VPlanVerifier.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.h b/lib/Transforms/Vectorize/VPlanVerifier.h
index d2f99d006a66..7d2b26252172 100644
--- a/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -1,9 +1,8 @@
 //===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 559ab1968844..6a4f9169c2af 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,9 +1,8 @@
 //===-- Vectorize.cpp -----------------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
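Taken together, the new VPlanPredicator files expose a small two-step driver: the constructor recalculates a dominator tree over the plan's top region, and predicate() first assigns block predicates in reverse post-order and then linearizes the region's CFG. A minimal usage sketch, assuming a populated VPlan whose entry block is a VPRegionBlock; the helper name predicateTopRegion is invented for illustration and is not part of the patch:

#include "VPlanPredicator.h"

using namespace llvm;

// Hypothetical driver: predicate and then linearize the top region of Plan.
static void predicateTopRegion(VPlan &Plan) {
  // The constructor recomputes VPDomTree for the plan's entry region.
  VPlanPredicator Predicator(Plan);
  // Computes block predicates in RPO, then linearizes the blocks.
  Predicator.predicate();
}

This mirrors the FIXME in the constructor: once dominator information is cached on VPRegionBlock, the per-instance recalculation can go away.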