author    | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000
commit    | 9df3605dea17e84f8183581f6103bd0c79e2a606 (patch)
tree      | 70a2f36ce9eb9bb213603cd7f2f120af53fc176f /lib
parent    | 08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (diff)
Diffstat (limited to 'lib')
282 files changed, 8010 insertions, 6473 deletions
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp index ddd5123d0eff..0de7ad98af46 100644 --- a/lib/Analysis/CFLAndersAliasAnalysis.cpp +++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp @@ -68,17 +68,6 @@ CFLAndersAAResult::CFLAndersAAResult(CFLAndersAAResult &&RHS) : AAResultBase(std::move(RHS)), TLI(RHS.TLI) {} CFLAndersAAResult::~CFLAndersAAResult() {} -static const Function *parentFunctionOfValue(const Value *Val) { - if (auto *Inst = dyn_cast<Instruction>(Val)) { - auto *Bb = Inst->getParent(); - return Bb->getParent(); - } - - if (auto *Arg = dyn_cast<Argument>(Val)) - return Arg->getParent(); - return nullptr; -} - namespace { enum class MatchState : uint8_t { @@ -789,10 +778,10 @@ void CFLAndersAAResult::scan(const Function &Fn) { // resize and invalidating the reference returned by operator[] auto FunInfo = buildInfoFrom(Fn); Cache[&Fn] = std::move(FunInfo); - Handles.push_front(FunctionHandle(const_cast<Function *>(&Fn), this)); + Handles.emplace_front(const_cast<Function *>(&Fn), this); } -void CFLAndersAAResult::evict(const Function &Fn) { Cache.erase(&Fn); } +void CFLAndersAAResult::evict(const Function *Fn) { Cache.erase(Fn); } const Optional<CFLAndersAAResult::FunctionInfo> & CFLAndersAAResult::ensureCached(const Function &Fn) { diff --git a/lib/Analysis/CFLSteensAliasAnalysis.cpp b/lib/Analysis/CFLSteensAliasAnalysis.cpp index 6e4263920e58..adbdd82012a3 100644 --- a/lib/Analysis/CFLSteensAliasAnalysis.cpp +++ b/lib/Analysis/CFLSteensAliasAnalysis.cpp @@ -88,19 +88,6 @@ const StratifiedIndex StratifiedLink::SetSentinel = //===----------------------------------------------------------------------===// /// Determines whether it would be pointless to add the given Value to our sets. -static bool canSkipAddingToSets(Value *Val); - -static Optional<Function *> parentFunctionOfValue(Value *Val) { - if (auto *Inst = dyn_cast<Instruction>(Val)) { - auto *Bb = Inst->getParent(); - return Bb->getParent(); - } - - if (auto *Arg = dyn_cast<Argument>(Val)) - return Arg->getParent(); - return None; -} - static bool canSkipAddingToSets(Value *Val) { // Constants can share instances, which may falsely unify multiple // sets, e.g. 
in @@ -245,7 +232,7 @@ void CFLSteensAAResult::scan(Function *Fn) { auto FunInfo = buildSetsFrom(Fn); Cache[Fn] = std::move(FunInfo); - Handles.push_front(FunctionHandle(Fn, this)); + Handles.emplace_front(Fn, this); } void CFLSteensAAResult::evict(Function *Fn) { Cache.erase(Fn); } @@ -281,9 +268,9 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, return NoAlias; Function *Fn = nullptr; - auto MaybeFnA = parentFunctionOfValue(ValA); - auto MaybeFnB = parentFunctionOfValue(ValB); - if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) { + Function *MaybeFnA = const_cast<Function *>(parentFunctionOfValue(ValA)); + Function *MaybeFnB = const_cast<Function *>(parentFunctionOfValue(ValB)); + if (!MaybeFnA && !MaybeFnB) { // The only times this is known to happen are when globals + InlineAsm are // involved DEBUG(dbgs() @@ -291,12 +278,12 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, return MayAlias; } - if (MaybeFnA.hasValue()) { - Fn = *MaybeFnA; - assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) && + if (MaybeFnA) { + Fn = MaybeFnA; + assert((!MaybeFnB || MaybeFnB == MaybeFnA) && "Interprocedural queries not supported"); } else { - Fn = *MaybeFnB; + Fn = MaybeFnB; } assert(Fn != nullptr); diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 77ad6f1e166f..35693666aa03 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -66,6 +66,12 @@ static cl::opt<int> cl::ZeroOrMore, cl::desc("Threshold for hot callsites ")); +static cl::opt<int> ColdCallSiteRelFreq( + "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, + cl::desc("Maxmimum block frequency, expressed as a percentage of caller's " + "entry frequency, for a callsite to be cold in the absence of " + "profile information.")); + namespace { class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { @@ -172,6 +178,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { /// Return true if size growth is allowed when inlining the callee at CS. bool allowSizeGrowth(CallSite CS); + /// Return true if \p CS is a cold callsite. + bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI); + // Custom analysis routines. bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues); @@ -631,6 +640,26 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) { return true; } +bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) { + // If global profile summary is available, then callsite's coldness is + // determined based on that. + if (PSI->hasProfileSummary()) + return PSI->isColdCallSite(CS, CallerBFI); + if (!CallerBFI) + return false; + + // In the absence of global profile summary, determine if the callsite is cold + // relative to caller's entry. We could potentially cache the computation of + // scaled entry frequency, but the added complexity is not worth it unless + // this scaling shows up high in the profiles. + const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); + auto CallSiteBB = CS.getInstruction()->getParent(); + auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB); + auto CallerEntryFreq = + CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock())); + return CallSiteFreq < CallerEntryFreq * ColdProb; +} + void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { // If no size growth is allowed for this inlining, set Threshold to 0. 
if (!allowSizeGrowth(CS)) { @@ -676,7 +705,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { if (PSI->isHotCallSite(CS, CallerBFI)) { DEBUG(dbgs() << "Hot callsite.\n"); Threshold = Params.HotCallSiteThreshold.getValue(); - } else if (PSI->isColdCallSite(CS, CallerBFI)) { + } else if (isColdCallSite(CS, CallerBFI)) { DEBUG(dbgs() << "Cold callsite.\n"); Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); } @@ -1010,7 +1039,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { if (isa<ConstantInt>(V)) return true; - // Assume the most general case where the swith is lowered into + // Assume the most general case where the switch is lowered into // either a jump table, bit test, or a balanced binary tree consisting of // case clusters without merging adjacent clusters with the same // destination. We do not consider the switches that are lowered with a mix diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp index 2a736ec0379c..0e02850df349 100644 --- a/lib/Analysis/IteratedDominanceFrontier.cpp +++ b/lib/Analysis/IteratedDominanceFrontier.cpp @@ -20,14 +20,6 @@ namespace llvm { template <class NodeTy> void IDFCalculator<NodeTy>::calculate( SmallVectorImpl<BasicBlock *> &PHIBlocks) { - // If we haven't computed dominator tree levels, do so now. - if (DomLevels.empty()) { - for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode()); - DFI != DFE; ++DFI) { - DomLevels[*DFI] = DFI.getPathLength() - 1; - } - } - // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; @@ -37,7 +29,7 @@ void IDFCalculator<NodeTy>::calculate( for (BasicBlock *BB : *DefBlocks) { if (DomTreeNode *Node = DT.getNode(BB)) - PQ.push(std::make_pair(Node, DomLevels.lookup(Node))); + PQ.push({Node, Node->getLevel()}); } SmallVector<DomTreeNode *, 32> Worklist; @@ -72,7 +64,7 @@ void IDFCalculator<NodeTy>::calculate( if (SuccNode->getIDom() == Node) continue; - unsigned SuccLevel = DomLevels.lookup(SuccNode); + const unsigned SuccLevel = SuccNode->getLevel(); if (SuccLevel > RootLevel) continue; diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 7983d62c2f7a..f88d54b21e1e 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -400,8 +400,8 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) { /// \brief Compute the size of the object pointed by Ptr. Returns true and the /// object size in Size if successful, and false otherwise. -/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas, -/// byval arguments, and global variables. +/// If RoundToAlign is true, then Size is rounded up to the alignment of +/// allocas, byval arguments, and global variables. 
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL, const TargetLibraryInfo *TLI, ObjectSizeOpts Opts) { ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), Opts); diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp index e38e530c052d..eb259fd7a384 100644 --- a/lib/Analysis/OptimizationDiagnosticInfo.cpp +++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F) : F(F), BFI(nullptr) { - if (!F->getContext().getDiagnosticHotnessRequested()) + if (!F->getContext().getDiagnosticsHotnessRequested()) return; // First create a dominator tree. @@ -155,6 +155,13 @@ void OptimizationRemarkEmitter::emit( DiagnosticInfoOptimizationBase &OptDiagBase) { auto &OptDiag = cast<DiagnosticInfoIROptimization>(OptDiagBase); computeHotness(OptDiag); + // If a diagnostic has a hotness value, then only emit it if its hotness + // meets the threshold. + if (OptDiag.getHotness() && + *OptDiag.getHotness() < + F->getContext().getDiagnosticsHotnessThreshold()) { + return; + } yaml::Output *Out = F->getContext().getDiagnosticsOutputFile(); if (Out) { @@ -176,7 +183,7 @@ OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass() bool OptimizationRemarkEmitterWrapperPass::runOnFunction(Function &Fn) { BlockFrequencyInfo *BFI; - if (Fn.getContext().getDiagnosticHotnessRequested()) + if (Fn.getContext().getDiagnosticsHotnessRequested()) BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); else BFI = nullptr; @@ -198,7 +205,7 @@ OptimizationRemarkEmitterAnalysis::run(Function &F, FunctionAnalysisManager &AM) { BlockFrequencyInfo *BFI; - if (F.getContext().getDiagnosticHotnessRequested()) + if (F.getContext().getDiagnosticsHotnessRequested()) BFI = &AM.getResult<BlockFrequencyAnalysis>(F); else BFI = nullptr; diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp index 63ef8d28d44a..900487323005 100644 --- a/lib/Analysis/RegionInfo.cpp +++ b/lib/Analysis/RegionInfo.cpp @@ -10,28 +10,29 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/RegionInfo.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/RegionInfoImpl.h" -#include "llvm/Analysis/RegionIterator.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #ifndef NDEBUG #include "llvm/Analysis/RegionPrinter.h" #endif +#include "llvm/Analysis/RegionInfoImpl.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "region" namespace llvm { + template class RegionBase<RegionTraits<Function>>; template class RegionNodeBase<RegionTraits<Function>>; template class RegionInfoBase<RegionTraits<Function>>; -} + +} // end namespace llvm STATISTIC(numRegions, "The # of regions"); STATISTIC(numSimpleRegions, "The # of simple regions"); @@ -44,7 +45,6 @@ VerifyRegionInfoX( cl::location(RegionInfoBase<RegionTraits<Function>>::VerifyRegionInfo), cl::desc("Verify region info (time consuming)")); - static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style", cl::location(RegionInfo::printStyle), cl::Hidden, @@ -56,7 +56,6 @@ 
static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style", clEnumValN(Region::PrintRN, "rn", "print regions in detail with element_iterator"))); - //===----------------------------------------------------------------------===// // Region implementation // @@ -68,20 +67,15 @@ Region::Region(BasicBlock *Entry, BasicBlock *Exit, } -Region::~Region() { } +Region::~Region() = default; //===----------------------------------------------------------------------===// // RegionInfo implementation // -RegionInfo::RegionInfo() : - RegionInfoBase<RegionTraits<Function>>() { - -} +RegionInfo::RegionInfo() = default; -RegionInfo::~RegionInfo() { - -} +RegionInfo::~RegionInfo() = default; bool RegionInfo::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { @@ -126,9 +120,7 @@ RegionInfoPass::RegionInfoPass() : FunctionPass(ID) { initializeRegionInfoPassPass(*PassRegistry::getPassRegistry()); } -RegionInfoPass::~RegionInfoPass() { - -} +RegionInfoPass::~RegionInfoPass() = default; bool RegionInfoPass::runOnFunction(Function &F) { releaseMemory(); @@ -181,10 +173,12 @@ INITIALIZE_PASS_END(RegionInfoPass, "regions", // the link time optimization. namespace llvm { + FunctionPass *createRegionInfoPass() { return new RegionInfoPass(); } -} + +} // end namespace llvm //===----------------------------------------------------------------------===// // RegionInfoAnalysis implementation diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 73a95ec405c7..678ad3af5e85 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -157,6 +157,11 @@ static cl::opt<unsigned> MaxConstantEvolvingDepth( "scalar-evolution-max-constant-evolving-depth", cl::Hidden, cl::desc("Maximum depth of recursive constant evolving"), cl::init(32)); +static cl::opt<unsigned> + MaxExtDepth("scalar-evolution-max-ext-depth", cl::Hidden, + cl::desc("Maximum depth of recursive SExt/ZExt"), + cl::init(8)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -1285,8 +1290,8 @@ static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step, namespace { struct ExtendOpTraitsBase { - typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)( - const SCEV *, Type *, ScalarEvolution::ExtendCacheTy &Cache); + typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *, + unsigned); }; // Used to make code generic over signed and unsigned overflow. 
@@ -1315,9 +1320,8 @@ struct ExtendOpTraits<SCEVSignExtendExpr> : public ExtendOpTraitsBase { } }; -const ExtendOpTraitsBase::GetExtendExprTy - ExtendOpTraits<SCEVSignExtendExpr>::GetExtendExpr = - &ScalarEvolution::getSignExtendExprCached; +const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< + SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr; template <> struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase { @@ -1332,9 +1336,8 @@ struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase { } }; -const ExtendOpTraitsBase::GetExtendExprTy - ExtendOpTraits<SCEVZeroExtendExpr>::GetExtendExpr = - &ScalarEvolution::getZeroExtendExprCached; +const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< + SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr; } // The recurrence AR has been shown to have no signed/unsigned wrap or something @@ -1346,8 +1349,7 @@ const ExtendOpTraitsBase::GetExtendExprTy // "sext/zext(PostIncAR)" template <typename ExtendOpTy> static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, - ScalarEvolution *SE, - ScalarEvolution::ExtendCacheTy &Cache) { + ScalarEvolution *SE, unsigned Depth) { auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType; auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr; @@ -1394,9 +1396,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, unsigned BitWidth = SE->getTypeSizeInBits(AR->getType()); Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2); const SCEV *OperandExtendedStart = - SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Cache), - (SE->*GetExtendExpr)(Step, WideTy, Cache)); - if ((SE->*GetExtendExpr)(Start, WideTy, Cache) == OperandExtendedStart) { + SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Depth), + (SE->*GetExtendExpr)(Step, WideTy, Depth)); + if ((SE->*GetExtendExpr)(Start, WideTy, Depth) == OperandExtendedStart) { if (PreAR && AR->getNoWrapFlags(WrapType)) { // If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW // or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then @@ -1422,16 +1424,16 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, template <typename ExtendOpTy> static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE, - ScalarEvolution::ExtendCacheTy &Cache) { + unsigned Depth) { auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr; - const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Cache); + const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Depth); if (!PreStart) - return (SE->*GetExtendExpr)(AR->getStart(), Ty, Cache); + return (SE->*GetExtendExpr)(AR->getStart(), Ty, Depth); - return SE->getAddExpr( - (SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty, Cache), - (SE->*GetExtendExpr)(PreStart, Ty, Cache)); + return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty, + Depth), + (SE->*GetExtendExpr)(PreStart, Ty, Depth)); } // Try to prove away overflow by looking at "nearby" add recurrences. A @@ -1511,31 +1513,8 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, return false; } -const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty) { - // Use the local cache to prevent exponential behavior of - // getZeroExtendExprImpl. - ExtendCacheTy Cache; - return getZeroExtendExprCached(Op, Ty, Cache); -} - -/// Query \p Cache before calling getZeroExtendExprImpl. 
If there is no -/// related entry in the \p Cache, call getZeroExtendExprImpl and save -/// the result in the \p Cache. -const SCEV *ScalarEvolution::getZeroExtendExprCached(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { - auto It = Cache.find({Op, Ty}); - if (It != Cache.end()) - return It->second; - const SCEV *ZExt = getZeroExtendExprImpl(Op, Ty, Cache); - auto InsertResult = Cache.insert({{Op, Ty}, ZExt}); - assert(InsertResult.second && "Expect the key was not in the cache"); - (void)InsertResult; - return ZExt; -} - -/// The real implementation of getZeroExtendExpr. -const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { +const SCEV * +ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && @@ -1545,11 +1524,11 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); + cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) - return getZeroExtendExprCached(SZ->getOperand(), Ty, Cache); + return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. @@ -1559,6 +1538,12 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; + if (Depth > MaxExtDepth) { + SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator), + Op, Ty); + UniqueSCEVs.InsertNode(S, IP); + return S; + } // zext(trunc(x)) --> zext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) { @@ -1593,8 +1578,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // we don't need to do any further analysis. if (AR->hasNoUnsignedWrap()) return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are @@ -1618,22 +1603,29 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. 
- const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *ZAdd = - getZeroExtendExprCached(getAddExpr(Start, ZMul), WideTy, Cache); - const SCEV *WideStart = getZeroExtendExprCached(Start, WideTy, Cache); + const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step, + SCEV::FlagAnyWrap, Depth + 1); + const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul, + SCEV::FlagAnyWrap, + Depth + 1), + WideTy, Depth + 1); + const SCEV *WideStart = getZeroExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = - getZeroExtendExprCached(CastedMaxBECount, WideTy, Cache); - const SCEV *OperandExtendedAdd = getAddExpr( - WideStart, getMulExpr(WideMaxBECount, getZeroExtendExprCached( - Step, WideTy, Cache))); + getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); + const SCEV *OperandExtendedAdd = + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, + getZeroExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as signed. @@ -1641,15 +1633,19 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, - getSignExtendExpr(Step, WideTy))); + getSignExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1680,8 +1676,9 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } else if (isKnownNegative(Step)) { @@ -1697,8 +1694,10 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. 
return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1706,8 +1705,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, if (proveNoWrapByVaryingStart<SCEVZeroExtendExpr>(Start, Step, L)) { const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } @@ -1718,8 +1717,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // commute the zero extension with the addition operation. SmallVector<const SCEV *, 4> Ops; for (const auto *Op : SA->operands()) - Ops.push_back(getZeroExtendExprCached(Op, Ty, Cache)); - return getAddExpr(Ops, SCEV::FlagNUW); + Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1)); + return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1); } } @@ -1732,31 +1731,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, return S; } -const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty) { - // Use the local cache to prevent exponential behavior of - // getSignExtendExprImpl. - ExtendCacheTy Cache; - return getSignExtendExprCached(Op, Ty, Cache); -} - -/// Query \p Cache before calling getSignExtendExprImpl. If there is no -/// related entry in the \p Cache, call getSignExtendExprImpl and save -/// the result in the \p Cache. -const SCEV *ScalarEvolution::getSignExtendExprCached(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { - auto It = Cache.find({Op, Ty}); - if (It != Cache.end()) - return It->second; - const SCEV *SExt = getSignExtendExprImpl(Op, Ty, Cache); - auto InsertResult = Cache.insert({{Op, Ty}, SExt}); - assert(InsertResult.second && "Expect the key was not in the cache"); - (void)InsertResult; - return SExt; -} - -/// The real implementation of getSignExtendExpr. -const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { +const SCEV * +ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && @@ -1766,15 +1742,15 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); + cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op)) - return getSignExtendExprCached(SS->getOperand(), Ty, Cache); + return getSignExtendExpr(SS->getOperand(), Ty, Depth + 1); // sext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) - return getZeroExtendExpr(SZ->getOperand(), Ty); + return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. 
@@ -1784,6 +1760,13 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; + // Limit recursion depth. + if (Depth > MaxExtDepth) { + SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator), + Op, Ty); + UniqueSCEVs.InsertNode(S, IP); + return S; + } // sext(trunc(x)) --> sext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) { @@ -1809,8 +1792,9 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) - return getAddExpr(getSignExtendExprCached(SC1, Ty, Cache), - getSignExtendExprCached(SMul, Ty, Cache)); + return getAddExpr(getSignExtendExpr(SC1, Ty, Depth + 1), + getSignExtendExpr(SMul, Ty, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); } } } @@ -1821,8 +1805,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // commute the sign extension with the addition operation. SmallVector<const SCEV *, 4> Ops; for (const auto *Op : SA->operands()) - Ops.push_back(getSignExtendExprCached(Op, Ty, Cache)); - return getAddExpr(Ops, SCEV::FlagNSW); + Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1)); + return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1); } } // If the input value is a chrec scev, and we can prove that the value @@ -1845,8 +1829,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // we don't need to do any further analysis. if (AR->hasNoSignedWrap()) return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, SCEV::FlagNSW); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, SCEV::FlagNSW); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are @@ -1870,22 +1854,29 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. - const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *SAdd = - getSignExtendExprCached(getAddExpr(Start, SMul), WideTy, Cache); - const SCEV *WideStart = getSignExtendExprCached(Start, WideTy, Cache); + const SCEV *SMul = getMulExpr(CastedMaxBECount, Step, + SCEV::FlagAnyWrap, Depth + 1); + const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul, + SCEV::FlagAnyWrap, + Depth + 1), + WideTy, Depth + 1); + const SCEV *WideStart = getSignExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = - getZeroExtendExpr(CastedMaxBECount, WideTy); - const SCEV *OperandExtendedAdd = getAddExpr( - WideStart, getMulExpr(WideMaxBECount, getSignExtendExprCached( - Step, WideTy, Cache))); + getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); + const SCEV *OperandExtendedAdd = + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, + getSignExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. 
return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as unsigned. @@ -1893,7 +1884,9 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, - getZeroExtendExpr(Step, WideTy))); + getZeroExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // If AR wraps around then // @@ -1907,8 +1900,10 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1939,9 +1934,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, - AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } @@ -1955,25 +1949,26 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) { - Start = getSignExtendExprCached(Start, Ty, Cache); + Start = getSignExtendExpr(Start, Ty, Depth + 1); const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L, AR->getNoWrapFlags()); - return getAddExpr(Start, getSignExtendExprCached(NewAR, Ty, Cache)); + return getAddExpr(Start, getSignExtendExpr(NewAR, Ty, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); } } if (proveNoWrapByVaryingStart<SCEVSignExtendExpr>(Start, Step, L)) { const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } // If the input value is provably positive and we could not simplify // away the sext build a zext instead. if (isKnownNonNegative(Op)) - return getZeroExtendExpr(Op, Ty); + return getZeroExtendExpr(Op, Ty, Depth + 1); // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 92328f6e5efd..f938a9a52065 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -89,8 +89,9 @@ TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI, return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize); } -int TargetTransformInfo::getUserCost(const User *U) const { - int Cost = TTIImpl->getUserCost(U); +int TargetTransformInfo::getUserCost(const User *U, + ArrayRef<const Value *> Operands) const { + int Cost = TTIImpl->getUserCost(U, Operands); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -116,8 +117,8 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const { } void TargetTransformInfo::getUnrollingPreferences( - Loop *L, UnrollingPreferences &UP) const { - return TTIImpl->getUnrollingPreferences(L, UP); + Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { + return TTIImpl->getUnrollingPreferences(L, SE, UP); } bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index cd9972ab56a6..86c528de267a 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -23,10 +23,10 @@ // // The scalar TBAA metadata format is very simple. TBAA MDNodes have up to // three fields, e.g.: -// !0 = metadata !{ metadata !"an example type tree" } -// !1 = metadata !{ metadata !"int", metadata !0 } -// !2 = metadata !{ metadata !"float", metadata !0 } -// !3 = metadata !{ metadata !"const float", metadata !2, i64 1 } +// !0 = !{ !"an example type tree" } +// !1 = !{ !"int", !0 } +// !2 = !{ !"float", !0 } +// !3 = !{ !"const float", !2, i64 1 } // // The first field is an identity field. It can be any value, usually // an MDString, which uniquely identifies the type. The most important @@ -74,13 +74,13 @@ // instruction. The base type is !4 (struct B), the access type is !2 (scalar // type short) and the offset is 4. // -// !0 = metadata !{metadata !"Simple C/C++ TBAA"} -// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node -// !2 = metadata !{metadata !"short", metadata !1} // Scalar type node -// !3 = metadata !{metadata !"A", metadata !2, i64 0} // Struct type node -// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4} +// !0 = !{!"Simple C/C++ TBAA"} +// !1 = !{!"omnipotent char", !0} // Scalar type node +// !2 = !{!"short", !1} // Scalar type node +// !3 = !{!"A", !2, i64 0} // Struct type node +// !4 = !{!"B", !2, i64 0, !3, i64 4} // // Struct type node -// !5 = metadata !{metadata !4, metadata !2, i64 4} // Path tag node +// !5 = !{!4, !2, i64 4} // Path tag node // // The struct type nodes and the scalar type nodes form a type DAG. // Root (!0) diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp index f24f22c88a8a..b19a07a9066b 100644 --- a/lib/BinaryFormat/Magic.cpp +++ b/lib/BinaryFormat/Magic.cpp @@ -191,8 +191,8 @@ file_magic llvm::identify_magic(StringRef Magic) { } break; - case 0x64: // x86-64 Windows. - if (Magic[1] == char(0x86)) + case 0x64: // x86-64 or ARM64 Windows. 
+ if (Magic[1] == char(0x86) || Magic[1] == char(0xaa)) return file_magic::coff_object; break; diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 0629c2d326ae..1ebef3173135 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5360,8 +5360,9 @@ const std::error_category &llvm::BitcodeErrorCategory() { return *ErrorCategory; } -static Expected<StringRef> readStrtab(BitstreamCursor &Stream) { - if (Stream.EnterSubBlock(bitc::STRTAB_BLOCK_ID)) +static Expected<StringRef> readBlobInRecord(BitstreamCursor &Stream, + unsigned Block, unsigned RecordID) { + if (Stream.EnterSubBlock(Block)) return error("Invalid record"); StringRef Strtab; @@ -5382,7 +5383,7 @@ static Expected<StringRef> readStrtab(BitstreamCursor &Stream) { case BitstreamEntry::Record: StringRef Blob; SmallVector<uint64_t, 1> Record; - if (Stream.readRecord(Entry.ID, Record, &Blob) == bitc::STRTAB_BLOB) + if (Stream.readRecord(Entry.ID, Record, &Blob) == RecordID) Strtab = Blob; break; } @@ -5450,7 +5451,8 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { } if (Entry.ID == bitc::STRTAB_BLOCK_ID) { - Expected<StringRef> Strtab = readStrtab(Stream); + Expected<StringRef> Strtab = + readBlobInRecord(Stream, bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB); if (!Strtab) return Strtab.takeError(); // This string table is used by every preceding bitcode module that does @@ -5462,6 +5464,28 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { break; I->Strtab = *Strtab; } + // Similarly, the string table is used by every preceding symbol table; + // normally there will be just one unless the bitcode file was created + // by binary concatenation. + if (!F.Symtab.empty() && F.StrtabForSymtab.empty()) + F.StrtabForSymtab = *Strtab; + continue; + } + + if (Entry.ID == bitc::SYMTAB_BLOCK_ID) { + Expected<StringRef> SymtabOrErr = + readBlobInRecord(Stream, bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB); + if (!SymtabOrErr) + return SymtabOrErr.takeError(); + + // We can expect the bitcode file to have multiple symbol tables if it + // was created by binary concatenation. In that case we silently + // ignore any subsequent symbol tables, which is fine because this is a + // low level function. The client is expected to notice that the number + // of modules in the symbol table does not match the number of modules + // in the input file and regenerate the symbol table. + if (F.Symtab.empty()) + F.Symtab = *SymtabOrErr; continue; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index feeba31908ae..b2b1ea6de374 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -29,10 +29,12 @@ #include "llvm/IR/UseListOrder.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/IRSymtab.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Program.h" #include "llvm/Support/SHA1.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include <cctype> #include <map> @@ -3820,6 +3822,38 @@ void BitcodeWriter::writeBlob(unsigned Block, unsigned Record, StringRef Blob) { Stream->ExitBlock(); } +void BitcodeWriter::writeSymtab() { + assert(!WroteStrtab && !WroteSymtab); + + // If any module has module-level inline asm, we will require a registered asm + // parser for the target so that we can create an accurate symbol table for + // the module. 
+ for (Module *M : Mods) { + if (M->getModuleInlineAsm().empty()) + continue; + + std::string Err; + const Triple TT(M->getTargetTriple()); + const Target *T = TargetRegistry::lookupTarget(TT.str(), Err); + if (!T || !T->hasMCAsmParser()) + return; + } + + WroteSymtab = true; + SmallVector<char, 0> Symtab; + // The irsymtab::build function may be unable to create a symbol table if the + // module is malformed (e.g. it contains an invalid alias). Writing a symbol + // table is not required for correctness, but we still want to be able to + // write malformed modules to bitcode files, so swallow the error. + if (Error E = irsymtab::build(Mods, Symtab, StrtabBuilder, Alloc)) { + consumeError(std::move(E)); + return; + } + + writeBlob(bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB, + {Symtab.data(), Symtab.size()}); +} + void BitcodeWriter::writeStrtab() { assert(!WroteStrtab); @@ -3843,6 +3877,15 @@ void BitcodeWriter::writeModule(const Module *M, bool ShouldPreserveUseListOrder, const ModuleSummaryIndex *Index, bool GenerateHash, ModuleHash *ModHash) { + assert(!WroteStrtab); + + // The Mods vector is used by irsymtab::build, which requires non-const + // Modules in case it needs to materialize metadata. But the bitcode writer + // requires that the module is materialized, so we can cast to non-const here, + // after checking that it is in fact materialized. + assert(M->isMaterialized()); + Mods.push_back(const_cast<Module *>(M)); + ModuleBitcodeWriter ModuleWriter(M, Buffer, StrtabBuilder, *Stream, ShouldPreserveUseListOrder, Index, GenerateHash, ModHash); @@ -3875,6 +3918,7 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, BitcodeWriter Writer(Buffer); Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash, ModHash); + Writer.writeSymtab(); Writer.writeStrtab(); if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) diff --git a/lib/Bitcode/Writer/LLVMBuild.txt b/lib/Bitcode/Writer/LLVMBuild.txt index a07c280fa9e3..ef6dc9f901e2 100644 --- a/lib/Bitcode/Writer/LLVMBuild.txt +++ b/lib/Bitcode/Writer/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = BitWriter parent = Bitcode -required_libraries = Analysis Core MC Support +required_libraries = Analysis Core MC Object Support diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c48fcaa7b0d1..ff427c9a0d75 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -631,7 +631,9 @@ void AsmPrinter::EmitFunctionHeader() { const Function *F = MF->getFunction(); if (isVerbose()) - OutStreamer->GetCommentOS() << "-- Begin function " << F->getName() << '\n'; + OutStreamer->GetCommentOS() + << "-- Begin function " + << GlobalValue::dropLLVMManglingEscape(F->getName()) << '\n'; // Print out constants referenced by the function EmitConstantPool(); diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index e94616fd5900..a81d56e9618b 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -365,7 +365,7 @@ static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs, void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL, const MachineFunction *MF) { // Skip this instruction if it has the same location as the previous one. 
- if (DL == CurFn->LastLoc) + if (!DL || DL == PrevInstLoc) return; const DIScope *Scope = DL.get()->getScope(); @@ -385,11 +385,11 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL, if (!CurFn->HaveLineInfo) CurFn->HaveLineInfo = true; unsigned FileId = 0; - if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile()) + if (PrevInstLoc.get() && PrevInstLoc->getFile() == DL->getFile()) FileId = CurFn->LastFileId; else FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile()); - CurFn->LastLoc = DL; + PrevInstLoc = DL; unsigned FuncId = CurFn->FuncId; if (const DILocation *SiteLoc = DL->getInlinedAt()) { @@ -2150,9 +2150,23 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { if (!Asm || !CurFn || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) return; + + // If the first instruction of a new MBB has no location, find the first + // instruction with a location and use that. DebugLoc DL = MI->getDebugLoc(); - if (DL == PrevInstLoc || !DL) + if (!DL && MI->getParent() != PrevInstBB) { + for (const auto &NextMI : *MI->getParent()) { + DL = NextMI.getDebugLoc(); + if (DL) + break; + } + } + PrevInstBB = MI->getParent(); + + // If we still don't have a debug location, don't record a location. + if (!DL) return; + maybeRecordLocation(DL, Asm->MF); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 2cd495aec6dc..fd8f60425c24 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -118,7 +118,6 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { SmallVector<LocalVariable, 1> Locals; - DebugLoc LastLoc; const MCSymbol *Begin = nullptr; const MCSymbol *End = nullptr; unsigned FuncId = 0; diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index dc39d1e6cb52..d4a90eeabe15 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -245,17 +245,6 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { CURanges.back().setEnd(Range.getEnd()); } -DIE::value_iterator -DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, const MCSymbol *Sec) { - if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - return addLabel(Die, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - Label); - return addSectionDelta(Die, Attribute, Label, Sec); -} - void DwarfCompileUnit::initStmtList() { // Define start line table label for each Compile Unit. MCSymbol *LineTableStartSym = @@ -380,15 +369,6 @@ void DwarfCompileUnit::constructScopeDIE( FinalChildren.push_back(std::move(ScopeDIE)); } -DIE::value_iterator -DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo) { - return Die.addValue(DIEValueAllocator, Attribute, - DD->getDwarfVersion() >= 4 ? 
dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - new (DIEValueAllocator) DIEDelta(Hi, Lo)); -} - void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range) { const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 3c2fb8d99db7..e38672792867 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -127,10 +127,6 @@ public: void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); - /// addSectionDelta - Add a label delta attribute data and value. - DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo); - DwarfCompileUnit &getCU() override { return *this; } unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; @@ -151,12 +147,6 @@ public: void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End); - /// addSectionLabel - Add a Dwarf section label attribute data and value. - /// - DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, - const MCSymbol *Sec); - /// \brief Find DIE for the given subprogram and attach appropriate /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global /// variables in this scope then create and insert DIEs for these diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 708f5f7536ff..4f4ebfc56297 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1587,6 +1587,26 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) { sizeof(Ty->getOffset())); } +DIE::value_iterator +DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo) { + return Die.addValue(DIEValueAllocator, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + new (DIEValueAllocator) DIEDelta(Hi, Lo)); +} + +DIE::value_iterator +DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, const MCSymbol *Sec) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return addLabel(Die, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + Label); + return addSectionDelta(Die, Attribute, Label, Sec); +} + bool DwarfTypeUnit::isDwoUnit() const { // Since there are no skeleton type units, all type units are dwo type units // when split DWARF is being used. diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7acad2cbd89f..4cc01b3298d4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -291,6 +291,15 @@ public: void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); + /// addSectionDelta - Add a label delta attribute data and value. + DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo); + + /// Add a Dwarf section label attribute data and value. 
+ DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, + const MCSymbol *Sec); + protected: ~DwarfUnit(); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index cb31c21293f4..b50e76f2e3ba 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1662,6 +1662,7 @@ class MemCmpExpansion { PHINode *PhiRes; bool IsUsedForZeroCmp; const DataLayout &DL; + IRBuilder<> Builder; unsigned calculateNumBlocks(unsigned Size); void createLoadCmpBlocks(); @@ -1671,13 +1672,14 @@ class MemCmpExpansion { void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, unsigned GEPIndex); Value *getCompareLoadPairs(unsigned Index, unsigned Size, - unsigned &NumBytesProcessed, IRBuilder<> &Builder); + unsigned &NumBytesProcessed); void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, unsigned &NumBytesProcessed); void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(unsigned Size); Value *getMemCmpEqZeroOneBlock(unsigned Size); + Value *getMemCmpOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1702,7 +1704,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, unsigned LoadsPerBlock, const DataLayout &TheDataLayout) : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock), - DL(TheDataLayout) { + DL(TheDataLayout), Builder(CI) { // A memcmp with zero-comparison with only one block of load and compare does // not need to set up any extra blocks. This case could be handled in the DAG, @@ -1710,7 +1712,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); NumBlocks = calculateNumBlocks(Size); - if (!IsUsedForZeroCmp || NumBlocks != 1) { + if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) { BasicBlock *StartBlock = CI->getParent(); EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); setupEndBlockPHINodes(); @@ -1731,7 +1733,6 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); } - IRBuilder<> Builder(CI->getContext()); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); } @@ -1754,8 +1755,6 @@ void MemCmpExpansion::createResultBlock() { // final phi node for selecting the memcmp result. void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex) { - IRBuilder<> Builder(CI->getContext()); - Value *Source1 = CI->getArgOperand(0); Value *Source2 = CI->getArgOperand(1); @@ -1811,8 +1810,7 @@ unsigned MemCmpExpansion::getLoadSize(unsigned Size) { /// This is used in the case where the memcmp() call is compared equal or not /// equal to zero. 
Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, - unsigned &NumBytesProcessed, - IRBuilder<> &Builder) { + unsigned &NumBytesProcessed) { std::vector<Value *> XorList, OrList; Value *Diff; @@ -1910,8 +1908,7 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { - IRBuilder<> Builder(CI->getContext()); - Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed, Builder); + Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed); BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) ? EndBlock @@ -1946,8 +1943,6 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, return; } - IRBuilder<> Builder(CI->getContext()); - Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); assert(LoadSize <= MaxLoadSize && "Unexpected load type"); @@ -1975,9 +1970,7 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); if (DL.isLittleEndian()) { - Function *F = LoadCmpBlocks[Index]->getParent(); - - Function *Bswap = Intrinsic::getDeclaration(F->getParent(), + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::bswap, LoadSizeType); LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); @@ -1995,16 +1988,13 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); } - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) ? EndBlock : LoadCmpBlocks[Index + 1]; // Early exit branch if difference found to ResultBlock. Otherwise, continue // to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); Builder.Insert(CmpBr); // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 @@ -2020,8 +2010,6 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, // memcmp result. It compares the two loaded source values and returns -1 if // src1 < src2 and 1 if src1 > src2. void MemCmpExpansion::emitMemCmpResultBlock() { - IRBuilder<> Builder(CI->getContext()); - // Special case: if memcmp result is used in a zero equality, result does not // need to be calculated and can simply return 1. if (IsUsedForZeroCmp) { @@ -2070,7 +2058,6 @@ unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { } void MemCmpExpansion::setupResultBlockPHINodes() { - IRBuilder<> Builder(CI->getContext()); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); Builder.SetInsertPoint(ResBlock.BB); ResBlock.PhiSrc1 = @@ -2080,8 +2067,6 @@ void MemCmpExpansion::setupResultBlockPHINodes() { } void MemCmpExpansion::setupEndBlockPHINodes() { - IRBuilder<> Builder(CI->getContext()); - Builder.SetInsertPoint(&EndBlock->front()); PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); } @@ -2102,11 +2087,45 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) { /// in the general case. 
Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { unsigned NumBytesProcessed = 0; - IRBuilder<> Builder(CI->getContext()); - Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed); return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); } +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian() && Size != 1) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + // TODO: Instead of comparing ULT, just subtract and return the difference? + Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); + Type *I32 = Builder.getInt32Ty(); + Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1), + ConstantInt::get(I32, 1)); + return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0)); +} + // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { @@ -2114,6 +2133,10 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : getMemCmpExpansionZeroCase(Size); + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (NumBlocks == 1 && NumLoadsPerBlock == 1) + return getMemCmpOneBlock(Size); + // This loop calls emitLoadCompareBlock for comparing Size bytes of the two // memcmp sources. It starts with loading using the maximum load size set by // the target. It processes any remaining bytes using a load size which is the @@ -2218,7 +2241,6 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, const TargetLowering *TLI, const DataLayout *DL) { NumMemCmpCalls++; - IRBuilder<> Builder(CI->getContext()); // TTI call to check if target would like to expand memcmp. Also, get the // MaxLoadSize. @@ -4378,14 +4400,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // If the real base value actually came from an inttoptr, then the matcher // will look through it and provide only the integer value. In that case, // use it here. 
- if (!ResultPtr && AddrMode.BaseReg) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr"); - AddrMode.BaseReg = nullptr; - } else if (!ResultPtr && AddrMode.Scale == 1) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr"); - AddrMode.Scale = 0; + if (!DL->isNonIntegralPointerType(Addr->getType())) { + if (!ResultPtr && AddrMode.BaseReg) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), + "sunkaddr"); + AddrMode.BaseReg = nullptr; + } else if (!ResultPtr && AddrMode.Scale == 1) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), + "sunkaddr"); + AddrMode.Scale = 0; + } } if (!ResultPtr && @@ -4466,6 +4490,19 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } } else { + // We'd require a ptrtoint/inttoptr down the line, which we can't do for + // non-integral pointers, so in that case bail out now. + Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr; + Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr; + PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); + PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); + if (DL->isNonIntegralPointerType(Addr->getType()) || + (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) || + (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || + (AddrMode.BaseGV && + DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) + return false; + DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); @@ -6367,7 +6404,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) { } // Update PHI nodes in both successors. The original BB needs to be - // replaced in one succesor's PHI nodes, because the branch comes now from + // replaced in one successor's PHI nodes, because the branch comes now from // the newly generated BB (NewBB). In the other successor we need to add one // incoming edge to the PHI nodes, because both branch instructions target // now the same successor. 
Depending on the original branch condition diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 239bad2f5355..521037f9d206 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -11,34 +11,69 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/IRTranslator.h" - +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <string> +#include <utility> +#include <vector> #define DEBUG_TYPE "irtranslator" using namespace llvm; char IRTranslator::ID = 0; + INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) @@ -62,7 +97,7 @@ static void reportTranslationError(MachineFunction &MF, ORE.emit(R); } -IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { +IRTranslator::IRTranslator() : MachineFunctionPass(ID) { initializeIRTranslatorPass(*PassRegistry::getPassRegistry()); } @@ -71,7 +106,6 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } - unsigned IRTranslator::getOrCreateVReg(const Value &Val) { unsigned &ValReg = ValToVReg[&Val]; @@ -686,6 +720,26 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, .addUse(getOrCreateVReg(*CI.getArgOperand(0))) 
.addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; + case Intrinsic::exp: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::exp2: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log2: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; case Intrinsic::fma: MIRBuilder.buildInstr(TargetOpcode::G_FMA) .addDef(getOrCreateVReg(CI)) @@ -834,7 +888,6 @@ bool IRTranslator::translateInvoke(const User &U, if (!isa<LandingPadInst>(EHPadBB->front())) return false; - // Emit the actual call, bracketed by EH_LABELs so that the MF knows about // the region covered by the try. MCSymbol *BeginSymbol = Context.createTempSymbol(); @@ -1195,7 +1248,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); TPC = &getAnalysis<TargetPassConfig>(); - ORE = make_unique<OptimizationRemarkEmitter>(&F); + ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); assert(PendingPHIs.empty() && "stale PHIs"); diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5466efd7e90f..860fc9a4f8b6 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp -----------*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp --------------------===// // // The LLVM Compiler Infrastructure // @@ -11,19 +11,22 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> #define DEBUG_TYPE "instructionselector" using namespace llvm; -InstructionSelector::InstructionSelector() {} +InstructionSelector::InstructionSelector() = default; bool InstructionSelector::constrainOperandRegToRegClass( MachineInstr &I, unsigned OpIdx, const TargetRegisterClass &RC, @@ -33,8 +36,8 @@ bool InstructionSelector::constrainOperandRegToRegClass( MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); - return llvm::constrainRegToClass(MRI, TII, RBI, I, - I.getOperand(OpIdx).getReg(), RC); + return + constrainRegToClass(MRI, TII, RBI, I, I.getOperand(OpIdx).getReg(), RC); } bool InstructionSelector::constrainSelectedInstRegOperands( @@ -84,7 +87,6 @@ bool InstructionSelector::constrainSelectedInstRegOperands( bool InstructionSelector::isOperandImmEqual( const MachineOperand &MO, int64_t Value, const MachineRegisterInfo &MRI) const { - if (MO.isReg() && 
MO.getReg()) if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) return *VRegVal == Value; diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 1d0d3dffa4c5..84b0a0ac4157 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -158,7 +158,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { // FIXME: Don't know how to handle secondary types yet. - if (TypeIdx != 0) + if (TypeIdx != 0 && MI.getOpcode() != TargetOpcode::G_EXTRACT) return UnableToLegalize; MIRBuilder.setInstr(MI); @@ -166,6 +166,20 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_IMPLICIT_DEF: { + int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / + NarrowTy.getSizeInBits(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildUndef(Dst); + DstRegs.push_back(Dst); + } + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_ADD: { // Expand in terms of carry-setting/consuming G_ADDE instructions. int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / @@ -193,6 +207,58 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_EXTRACT: { + if (TypeIdx != 1) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(0).getReg(); + int64_t OpStart = MI.getOperand(2).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned SrcStart = i * NarrowSize; + + if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) { + // No part of the extract uses this subregister, ignore it. + continue; + } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is extracted, forward the value. + DstRegs.push_back(SrcRegs[i]); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. + int64_t ExtractOffset, SegSize; + if (OpStart < SrcStart) { + ExtractOffset = 0; + SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart); + } else { + ExtractOffset = OpStart - SrcStart; + SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize); + } + + unsigned SegReg = SrcRegs[i]; + if (ExtractOffset != 0 || SegSize != NarrowSize) { + // A genuine extract is needed. 
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset); + } + + DstRegs.push_back(SegReg); + } + + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_INSERT: { if (TypeIdx != 0) return UnableToLegalize; diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 595802f2228b..76917aa9660d 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -1,4 +1,4 @@ -//===---- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer -------==// +//===- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer ---------------===// // // The LLVM Compiler Infrastructure // @@ -18,16 +18,25 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" - #include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Type.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOpcodes.h" +#include <algorithm> +#include <cassert> +#include <tuple> +#include <utility> + using namespace llvm; -LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { +LegalizerInfo::LegalizerInfo() { + DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar; + // FIXME: these two can be legalized to the fundamental load/store Jakob // proposed. Once loads & stores are supported. DefaultActions[TargetOpcode::G_ANYEXT] = Legal; @@ -42,6 +51,7 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; + DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar; DefaultActions[TargetOpcode::G_FNEG] = Lower; } @@ -75,8 +85,7 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). - if (Aspect.Opcode == TargetOpcode::G_EXTRACT || - Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || + if (Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); @@ -172,21 +181,21 @@ Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect, case Custom: return Aspect.Type; case NarrowScalar: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); } case WidenScalar: { - return findLegalType(Aspect, [](LLT Ty) -> LLT { + return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT { return Ty.getSizeInBits() < 8 ? 
LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.halfElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.doubleElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 3c70013ea296..47c6214c0552 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -264,10 +264,13 @@ MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { } MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { + assert(MRI->getType(Tgt).isPointer() && "invalid branch destination"); return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); } MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) { + assert(MRI->getType(Res) == LLT() || MRI->getType(Op) == LLT() || + MRI->getType(Res) == MRI->getType(Op)); return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op); } @@ -364,27 +367,36 @@ MachineInstrBuilder MachineIRBuilder::buildZExt(unsigned Res, unsigned Op) { MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res, unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); + unsigned Opcode = TargetOpcode::COPY; if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_SEXT; else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); return buildInstr(Opcode).addDef(Res).addUse(Op); } MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res, unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); + unsigned Opcode = TargetOpcode::COPY; if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_ZEXT; else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); return buildInstr(Opcode).addDef(Res).addUse(Op); } - MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) { LLT SrcTy = MRI->getType(Src); LLT DstTy = MRI->getType(Dst); @@ -466,7 +478,7 @@ void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops, } MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { - return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res); + return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res); } MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, @@ -482,6 +494,9 @@ MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, "input operands do not cover output register"); #endif + if (Ops.size() == 1) + return buildCast(Res, Ops[0]); + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); MIB.addDef(Res); for (unsigned i = 0; i < Ops.size(); ++i) @@ -511,8 +526,11 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, unsigned Op, unsigned Index) { + 
assert(Index + MRI->getType(Op).getSizeInBits() <= + MRI->getType(Res).getSizeInBits() && + "insertion past the end of a register"); + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) { - assert(Index == 0 && "insertion past the end of a register"); return buildCast(Res, Op); } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 2eb3cdee694d..677941dbbf6d 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==// +//==- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -12,18 +12,39 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> +#include <memory> +#include <utility> #define DEBUG_TYPE "regbankselect" @@ -37,6 +58,7 @@ static cl::opt<RegBankSelect::Mode> RegBankSelectMode( "Use the Greedy mode (best local mapping)"))); char RegBankSelect::ID = 0; + INITIALIZE_PASS_BEGIN(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false); @@ -48,8 +70,7 @@ INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, false) RegBankSelect::RegBankSelect(Mode RunningMode) - : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr), - MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) { + : MachineFunctionPass(ID), OptMode(RunningMode) { initializeRegBankSelectPass(*PassRegistry::getPassRegistry()); if (RegBankSelectMode.getNumOccurrences() != 0) { OptMode = RegBankSelectMode; @@ -72,7 +93,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); - MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); + MORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -133,9 +154,11 @@ bool RegBankSelect::repairReg( TargetRegisterInfo::isPhysicalRegister(Dst)) && "We are about to create several defs for Dst"); - // Build the instruction used to repair, then clone it at the right places. 
- MachineInstr *MI = MIRBuilder.buildCopy(Dst, Src); - MI->removeFromParent(); + // Build the instruction used to repair, then clone it at the right + // places. Avoiding buildCopy bypasses the check that Src and Dst have the + // same types because the type is a placeholder when this function is called. + MachineInstr *MI = + MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY).addDef(Dst).addUse(Src); DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst) << '\n'); // TODO: @@ -202,11 +225,11 @@ uint64_t RegBankSelect::getRepairCost( RBI->copyCost(*DesiredRegBrank, *CurRegBank, RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI)); // TODO: use a dedicated constant for ImpossibleCost. - if (Cost != UINT_MAX) + if (Cost != std::numeric_limits<unsigned>::max()) return Cost; // Return the legalization cost of that repairing. } - return UINT_MAX; + return std::numeric_limits<unsigned>::max(); } const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( @@ -352,7 +375,7 @@ void RegBankSelect::tryAvoidingSplit( // the repairing cost because of the PHIs already proceeded // as already stated. // Though the code will be correct. - assert(0 && "Repairing cost may not be accurate"); + assert(false && "Repairing cost may not be accurate"); } else { // We need to do non-local repairing. Basically, patch all // the uses (i.e., phis) that we already proceeded. @@ -450,7 +473,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( uint64_t RepairCost = getRepairCost(MO, ValMapping); // This is an impossible to repair cost. - if (RepairCost == UINT_MAX) + if (RepairCost == std::numeric_limits<unsigned>::max()) continue; // Bias used for splitting: 5%. @@ -535,9 +558,11 @@ bool RegBankSelect::applyMapping( llvm_unreachable("Other kind should not happen"); } } + // Second, rewrite the instruction. DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n'); RBI->applyMapping(OpdMapper); + return true; } @@ -638,11 +663,8 @@ RegBankSelect::RepairingPlacement::RepairingPlacement( MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P, RepairingPlacement::RepairingKind Kind) // Default is, we are going to insert code to repair OpIdx. - : Kind(Kind), - OpIdx(OpIdx), - CanMaterialize(Kind != RepairingKind::Impossible), - HasSplit(false), - P(P) { + : Kind(Kind), OpIdx(OpIdx), + CanMaterialize(Kind != RepairingKind::Impossible), P(P) { const MachineOperand &MO = MI.getOperand(OpIdx); assert(MO.isReg() && "Trying to repair a non-reg operand"); @@ -847,7 +869,7 @@ bool RegBankSelect::EdgeInsertPoint::canMaterialize() const { } RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq) - : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {} + : LocalFreq(LocalFreq.getFrequency()) {} bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) { // Check if this overflows. @@ -920,7 +942,6 @@ bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const { OtherLocalAdjust = Cost.LocalCost - LocalCost; else ThisLocalAdjust = LocalCost - Cost.LocalCost; - } else { ThisLocalAdjust = LocalCost; OtherLocalAdjust = Cost.LocalCost; diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index 398066bf8903..8c43c9f3f884 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -20,11 +20,14 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" +// Reserve an address that indicates a value that is known to be "undef". 
+static VNInfo UndefVNI(0xbad, SlotIndex()); + void LiveRangeCalc::resetLiveOutMap() { unsigned NumBlocks = MF->getNumBlockIDs(); Seen.clear(); Seen.resize(NumBlocks); - EntryInfoMap.clear(); + EntryInfos.clear(); Map.resize(NumBlocks); } @@ -283,8 +286,11 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, // Determine if the exit from the block is reached by some def. unsigned N = WorkList[i]; MachineBasicBlock &B = *MF->getBlockNumbered(N); - if (Seen[N] && Map[&B].first != nullptr) - return MarkDefined(B); + if (Seen[N]) { + const LiveOutPair &LOB = Map[&B]; + if (LOB.first != nullptr && LOB.first != &UndefVNI) + return MarkDefined(B); + } SlotIndex Begin, End; std::tie(Begin, End) = Indexes->getMBBRange(&B); // Treat End as not belonging to B. @@ -365,10 +371,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, #endif FoundUndef |= MBB->pred_empty(); - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *Pred = *PI; - + for (MachineBasicBlock *Pred : MBB->predecessors()) { // Is this a known live-out block? if (Seen.test(Pred->getNumber())) { if (VNInfo *VNI = Map[Pred].first) { @@ -387,7 +390,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, auto EP = LR.extendInBlock(Undefs, Start, End); VNInfo *VNI = EP.first; FoundUndef |= EP.second; - setLiveOutValue(Pred, VNI); + setLiveOutValue(Pred, EP.second ? &UndefVNI : VNI); if (VNI) { if (TheVNI && TheVNI != VNI) UniqueVNI = false; @@ -406,7 +409,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } LiveIn.clear(); - FoundUndef |= (TheVNI == nullptr); + FoundUndef |= (TheVNI == nullptr || TheVNI == &UndefVNI); if (Undefs.size() > 0 && FoundUndef) UniqueVNI = false; @@ -417,7 +420,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, // If a unique reaching def was found, blit in the live ranges immediately. if (UniqueVNI) { - assert(TheVNI != nullptr); + assert(TheVNI != nullptr && TheVNI != &UndefVNI); LiveRangeUpdater Updater(&LR); for (unsigned BN : WorkList) { SlotIndex Start, End; @@ -433,22 +436,26 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } // Prepare the defined/undefined bit vectors. - auto EF = EntryInfoMap.find(&LR); - if (EF == EntryInfoMap.end()) { + EntryInfoMap::iterator Entry; + bool DidInsert; + std::tie(Entry, DidInsert) = EntryInfos.insert( + std::make_pair(&LR, std::make_pair(BitVector(), BitVector()))); + if (DidInsert) { + // Initialize newly inserted entries. unsigned N = MF->getNumBlockIDs(); - EF = EntryInfoMap.insert({&LR, {BitVector(), BitVector()}}).first; - EF->second.first.resize(N); - EF->second.second.resize(N); + Entry->second.first.resize(N); + Entry->second.second.resize(N); } - BitVector &DefOnEntry = EF->second.first; - BitVector &UndefOnEntry = EF->second.second; + BitVector &DefOnEntry = Entry->second.first; + BitVector &UndefOnEntry = Entry->second.second; // Multiple values were found, so transfer the work list to the LiveIn array // where UpdateSSA will use it as a work list. 
LiveIn.reserve(WorkList.size()); for (unsigned BN : WorkList) { MachineBasicBlock *MBB = MF->getBlockNumbered(BN); - if (Undefs.size() > 0 && !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) + if (Undefs.size() > 0 && + !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) continue; addLiveInBlock(LR, DomTree->getNode(MBB)); if (MBB == &UseMBB) @@ -466,9 +473,9 @@ void LiveRangeCalc::updateSSA() { assert(DomTree && "Missing dominator tree"); // Interate until convergence. - unsigned Changes; + bool Changed; do { - Changes = 0; + Changed = false; // Propagate live-out values down the dominator tree, inserting phi-defs // when necessary. for (LiveInBlock &I : LiveIn) { @@ -491,15 +498,20 @@ void LiveRangeCalc::updateSSA() { IDomValue = Map[IDom->getBlock()]; // Cache the DomTree node that defined the value. - if (IDomValue.first && !IDomValue.second) + if (IDomValue.first && IDomValue.first != &UndefVNI && + !IDomValue.second) { Map[IDom->getBlock()].second = IDomValue.second = DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def)); + } - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - LiveOutPair &Value = Map[*PI]; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + LiveOutPair &Value = Map[Pred]; if (!Value.first || Value.first == IDomValue.first) continue; + if (Value.first == &UndefVNI) { + needPHI = true; + break; + } // Cache the DomTree node that defined the value. if (!Value.second) @@ -523,7 +535,7 @@ void LiveRangeCalc::updateSSA() { // Create a phi-def if required. if (needPHI) { - ++Changes; + Changed = true; assert(Alloc && "Need VNInfo allocator to create PHI-defs"); SlotIndex Start, End; std::tie(Start, End) = Indexes->getMBBRange(MBB); @@ -542,7 +554,7 @@ void LiveRangeCalc::updateSSA() { LR.addSegment(LiveInterval::Segment(Start, End, VNI)); LOP = LiveOutPair(VNI, Node); } - } else if (IDomValue.first) { + } else if (IDomValue.first && IDomValue.first != &UndefVNI) { // No phi-def here. Remember incoming value. I.Value = IDomValue.first; @@ -554,9 +566,9 @@ void LiveRangeCalc::updateSSA() { // MBB is live-out and doesn't define its own value. if (LOP.first == IDomValue.first) continue; - ++Changes; + Changed = true; LOP = IDomValue; } } - } while (Changes); + } while (Changed); } diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index 1a7598f8044a..d41b782d9bdf 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -24,6 +24,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/LiveInterval.h" @@ -65,7 +66,8 @@ class LiveRangeCalc { /// registers do not overlap), but the defined/undefined information must /// be kept separate for each individual range. /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }. - std::map<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + typedef DenseMap<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + EntryInfoMap EntryInfos; /// Map each basic block where a live range is live out to the live-out value /// and its defining block. 
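The LiveRangeCalc change above distinguishes three states for a block's live-out value: nothing known yet, a real defining value, and "known undef", where the last state is encoded as the address of a reserved static VNInfo. What follows is a minimal, hedged sketch of that sentinel-address pattern; it is standalone plain C++ with illustrative names (Value, UndefSentinel, LiveOut), not code from this patch.

#include <cstdio>
#include <map>

struct Value { int Id; };

// A static object whose address can never collide with a real value, so a
// pointer map can distinguish "unknown" (null/absent), "known undef"
// (&UndefSentinel), and "defined" (anything else) without an extra flag.
static Value UndefSentinel{-1};

static const char *classify(const Value *V) {
  if (!V)
    return "unknown";
  if (V == &UndefSentinel)
    return "undef";
  return "defined";
}

int main() {
  std::map<int, Value *> LiveOut; // block number -> live-out value
  Value V0{0};
  LiveOut[1] = &V0;              // block 1 has a real live-out value
  LiveOut[2] = &UndefSentinel;   // block 2 is known to be undef on exit
                                 // block 3 is absent: nothing is known yet
  for (int BB : {1, 2, 3}) {
    Value *V = LiveOut.count(BB) ? LiveOut[BB] : nullptr;
    std::printf("BB%d: %s\n", BB, classify(V));
  }
  return 0;
}

The benefit of the sentinel over a separate boolean is that every existing pointer comparison keeps working; only the places that must treat "undef" specially need to test against the reserved address.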
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index f58d1f8b83ae..c58d192284dd 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -579,12 +579,12 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, // // is equivalent to // liveins: %edi, %esi - bool ExplicitSuccesors = false; + bool ExplicitSuccessors = false; while (true) { if (Token.is(MIToken::kw_successors)) { if (parseBasicBlockSuccessors(MBB)) return true; - ExplicitSuccesors = true; + ExplicitSuccessors = true; } else if (Token.is(MIToken::kw_liveins)) { if (parseBasicBlockLiveins(MBB)) return true; @@ -636,7 +636,7 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, } // Construct successor list by searching for basic block machine operands. - if (!ExplicitSuccesors) { + if (!ExplicitSuccessors) { SmallVector<MachineBasicBlock*,4> Successors; bool IsFallthrough; guessSuccessors(MBB, Successors, IsFallthrough); diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 6b6b5f2814a9..73c3428a6e53 100644 --- a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -52,6 +52,14 @@ void MachineOptimizationRemarkEmitter::emit( computeHotness(OptDiag); LLVMContext &Ctx = MF.getFunction()->getContext(); + + // If a diagnostic has a hotness value, then only emit it if its hotness + // meets the threshold. + if (OptDiag.getHotness() && + *OptDiag.getHotness() < Ctx.getDiagnosticsHotnessThreshold()) { + return; + } + yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); if (Out) { auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); @@ -73,7 +81,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( MachineFunction &MF) { MachineBlockFrequencyInfo *MBFI; - if (MF.getFunction()->getContext().getDiagnosticHotnessRequested()) + if (MF.getFunction()->getContext().getDiagnosticsHotnessRequested()) MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI(); else MBFI = nullptr; diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp index 45ea0e4c39ab..5e279b065bbd 100644 --- a/lib/CodeGen/MacroFusion.cpp +++ b/lib/CodeGen/MacroFusion.cpp @@ -1,4 +1,4 @@ -//===- MacroFusion.cpp - Macro Fusion ----------------------===// +//===- MacroFusion.cpp - Macro Fusion -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #define DEBUG_TYPE "misched" @@ -26,8 +33,6 @@ using namespace llvm; static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, cl::desc("Enable scheduling for macro fusion."), cl::init(true)); -namespace { - static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, SUnit &SecondSU) { // Create a single weak edge between the adjacent instrs. 
The only effect is @@ -66,6 +71,7 @@ static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, ++NumFused; } +namespace { /// \brief Post-process the DAG to create cluster edges between instrs that may /// be fused by the processor into a single operation. @@ -81,6 +87,8 @@ public: void apply(ScheduleDAGInstrs *DAGInstrs) override; }; +} // end anonymous namespace + void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); @@ -128,23 +136,18 @@ bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) { return false; } -} // end anonymous namespace - - -namespace llvm { - std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) { +llvm::createMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true); return nullptr; } std::unique_ptr<ScheduleDAGMutation> -createBranchMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) { +llvm::createBranchMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false); return nullptr; } - -} // end namespace llvm diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index da8fac6d3834..b13f6b68c420 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -76,6 +76,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit( "rewrite-phi-limit", cl::Hidden, cl::init(10), cl::desc("Limit the length of PHI chains to lookup")); +// Limit the length of recurrence chain when evaluating the benefit of +// commuting operands. +static cl::opt<unsigned> MaxRecurrenceChain( + "recurrence-chain-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum length of recurrence chain when evaluating the benefit " + "of commuting operands")); + + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { class ValueTrackerResult; + class RecurrenceInstr; class PeepholeOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree + MachineLoopInfo *MLI; public: static char ID; // Pass identification @@ -150,6 +161,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); if (Aggressive) { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -160,6 +173,9 @@ namespace { typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult> RewriteMapTy; + /// \brief Sequence of instructions that formulate recurrence cycle. 
+  typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
 private:
   bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
   bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@ namespace {
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    /// \brief Finds recurrence cycles, but only ones that are formulated around
+    /// a def operand and a use operand that are tied. If there is a use
+    /// operand commutable with the tied use operand, find the recurrence cycle
+    /// along that operand as well.
+    bool findTargetRecurrence(unsigned Reg,
+                              const SmallSet<unsigned, 2> &TargetReg,
+                              RecurrenceCycle &RC);

    /// \brief If copy instruction \p MI is a virtual register copy, track it in
    /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@ namespace {
    }
  };

+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and its
+  /// tied use operand, or 2) a def operand and a use operand that is commutable
+  /// with another use operand which is tied to the def operand. In the latter
+  /// case, the indices of the tied use operand and the commutable use operand
+  /// are maintained with CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+      : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
  /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
  /// returned sources for a given search and the instructions where the sources
  /// were tracked from.
@@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)

@@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
   return false;
 }

+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+    unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+    RecurrenceCycle &RC) {
+  // Recurrence found if Reg is in TargetRegs.
+  if (TargetRegs.count(Reg))
+    return true;
+
+  // TODO: Currently, we only allow the last instruction of the recurrence
+  // cycle (the instruction that feeds the PHI instruction) to have more than
+  // one use to guarantee that commuting operands does not tie registers
+  // with overlapping live range.
Once we have actual live range info of
+  // each register, this constraint can be relaxed.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return false;
+
+  MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+  unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+  // Only interested in recurrences whose instructions have only one def, which
+  // is a virtual register.
+  if (MI.getDesc().getNumDefs() != 1)
+    return false;
+
+  MachineOperand &DefOp = MI.getOperand(0);
+  if (!isVirtualRegisterOperand(DefOp))
+    return false;
+
+  // Check if def operand of MI is tied to any use operand. We are only
+  // interested in the case that all the instructions in the recurrence chain
+  // have their def operand tied to one of the use operands.
+  unsigned TiedUseIdx;
+  if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+    return false;
+
+  if (Idx == TiedUseIdx) {
+    RC.push_back(RecurrenceInstr(&MI));
+    return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+  } else {
+    // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+    unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+      RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+      return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+    }
+  }
+
+  return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions. If
+/// a phi is in a loop header, a recurrence may be formulated around the source
+/// and destination of the phi. In such a case, commuting operands of the
+/// instructions in the recurrence may enable coalescing of the copy instruction
+/// generated from the phi. For example, if there is a recurrence of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0<def, tied1> = ADD %vreg2<def, tied0>, %vreg1
+///
+/// , the fact that vreg0 and vreg2 are in the same tied operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  SmallSet<unsigned, 2> TargetRegs;
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+    TargetRegs.insert(MO.getReg());
+  }
+
+  bool Changed = false;
+  RecurrenceCycle RC;
+  if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+    // Commutes operands of instructions in RC if necessary so that the copy to
+    // be generated from PHI can be coalesced.
+ DEBUG(dbgs() << "Optimize recurrence chain from " << PHI); + for (auto &RI : RC) { + DEBUG(dbgs() << "\tInst: " << *(RI.getMI())); + auto CP = RI.getCommutePair(); + if (CP) { + Changed = true; + TII->commuteInstruction(*(RI.getMI()), false, (*CP).first, + (*CP).second); + DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI())); + } + } + } + + return Changed; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr; + MLI = &getAnalysis<MachineLoopInfo>(); bool Changed = false; @@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 4> CopySrcRegs; DenseMap<unsigned, MachineInstr *> CopySrcMIs; + bool IsLoopHeader = MLI->isLoopHeader(&MBB); + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); MII != MIE; ) { MachineInstr *MI = &*MII; @@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isDebugValue()) continue; - if (MI->isPosition() || MI->isPHI()) + if (MI->isPosition()) continue; + if (IsLoopHeader && MI->isPHI()) { + if (optimizeRecurrence(*MI)) { + Changed = true; + continue; + } + } + if (!MI->isCopy()) { for (const auto &Op : MI->operands()) { // Visit all operands: definitions can be implicit or explicit. @@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { MRI->markUsesInDebugValueAsUndef(FoldedReg); FoldAsLoadDefCandidates.erase(FoldedReg); ++NumLoadFold; - + // MI is replaced with FoldMI so we can continue trying to fold Changed = true; MI = FoldMI; @@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { } } } - + // If we run into an instruction we can't fold across, discard // the load candidates. Note: We might be able to fold *into* this // instruction, so this needs to be after the folding logic. diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 50d241bff23d..9562652556ac 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -2622,7 +2622,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, } // If we couldn't allocate a register from spilling, there is probably some - // invalid inline assembly. The base class wil report it. + // invalid inline assembly. The base class will report it. if (Stage >= RS_Done || !VirtReg.isSpillable()) return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters, Depth); diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 7b3a5d5c5ff7..ff9bca092dbe 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -979,6 +979,11 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); for (LiveInterval::SubRange &SR : IntB.subranges()) SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + + // If the newly created Instruction has an address of an instruction that was + // deleted before (object recycled by the allocator) it needs to be removed from + // the deleted list. 
+ ErasedInstrs.erase(NewCopyMI); } else { DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#" << MBB.getNumber() << '\t' << CopyMI); @@ -989,6 +994,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, // While updating the live-ranges, we only look at slot indices and // never go back to the instruction. LIS->RemoveMachineInstrFromMaps(CopyMI); + // Mark instructions as deleted. + ErasedInstrs.insert(&CopyMI); CopyMI.eraseFromParent(); // Update the liveness. @@ -3095,7 +3102,7 @@ copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) { continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.erase(CurrList[i])) { + if (ErasedInstrs.count(CurrList[i])) { CurrList[i] = nullptr; continue; } diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp index d2eff950d861..bd5ecbd28f29 100644 --- a/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/lib/CodeGen/RenameIndependentSubregs.cpp @@ -243,10 +243,14 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, unsigned VReg = Intervals[ID]->reg; MO.setReg(VReg); - if (MO.isTied()) { + + if (MO.isTied() && Reg != VReg) { /// Undef use operands are not tracked in the equivalence class but need /// to be update if they are tied. MO.getParent()->substituteRegister(Reg, VReg, 0, TRI); + + // substituteRegister breaks the iterator, so restart. + I = MRI->reg_nodbg_begin(Reg); } } // TODO: We could attempt to recompute new register classes while visiting diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 7dd66d799be4..0f70b0e9ca07 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1089,7 +1089,7 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, // Things that are available after the instruction are killed by it. 
bool IsKill = LiveRegs.available(MRI, Reg); MO.setIsKill(IsKill); - if (IsKill && addToLiveRegs) + if (addToLiveRegs) LiveRegs.addReg(Reg); } } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d02dcb6f4439..d901af727686 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4915,7 +4915,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); // Loads must share the same base address - BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr()); + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; @@ -8210,18 +8210,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) && TLI.isTypeDesirableForOp(ISD::SHL, VT)) { - if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) { - uint64_t Amt = CAmt->getZExtValue(); - unsigned Size = VT.getScalarSizeInBits(); - - if (Amt < Size) { - SDLoc SL(N); - EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + SDValue Amt = N0.getOperand(1); + KnownBits Known; + DAG.computeKnownBits(Amt, Known); + unsigned Size = VT.getScalarSizeInBits(); + if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { + SDLoc SL(N); + EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); - return DAG.getNode(ISD::SHL, SL, VT, Trunc, - DAG.getConstant(Amt, SL, AmtVT)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + if (AmtVT != Amt.getValueType()) { + Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); + AddToWorklist(Amt.getNode()); } + return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); } } @@ -9751,6 +9753,52 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { } } + // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) + // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) + if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && + (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && + TLI.isOperationLegal(ISD::FABS, VT)) { + SDValue Select = N0, X = N1; + if (Select.getOpcode() != ISD::SELECT) + std::swap(Select, X); + + SDValue Cond = Select.getOperand(0); + auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1)); + auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2)); + + if (TrueOpnd && FalseOpnd && + Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && + isa<ConstantFPSDNode>(Cond.getOperand(1)) && + cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + std::swap(TrueOpnd, FalseOpnd); + // Fall through + case ISD::SETOGT: + case ISD::SETUGT: + case ISD::SETOGE: + case ISD::SETUGE: + case ISD::SETGT: + case ISD::SETGE: + if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && + TLI.isOperationLegal(ISD::FNEG, VT)) + return DAG.getNode(ISD::FNEG, DL, VT, + DAG.getNode(ISD::FABS, DL, VT, X)); + if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) + return DAG.getNode(ISD::FABS, DL, VT, X); + + break; + } + } + } + // FMUL -> FMA combines: if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { 
AddToWorklist(Fused.getNode()); @@ -12394,7 +12442,7 @@ void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. @@ -12414,8 +12462,8 @@ void DAGCombiner::getStoreMergeCandidates( BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. if (IsLoadSrc) - LBasePtr = - BaseIndexOffset::match(cast<LoadSDNode>(St->getValue())->getBasePtr()); + LBasePtr = BaseIndexOffset::match( + cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG); auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -12429,7 +12477,7 @@ void DAGCombiner::getStoreMergeCandidates( if (IsLoadSrc) { // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) { - auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr()); + auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG); if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else @@ -12443,7 +12491,7 @@ void DAGCombiner::getStoreMergeCandidates( if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) return false; - Ptr = BaseIndexOffset::match(Other->getBasePtr()); + Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; // We looking for a root node which is an ancestor to all mergable @@ -12786,7 +12834,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (Ld->getMemoryVT() != MemVT) break; - BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr()); + BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); // If this is not the first ptr that we check. int64_t LdOffset = 0; if (LdBasePtr.getBase().getNode()) { @@ -12829,6 +12877,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { // This variable refers to the size and not index in the array. unsigned LastLegalVectorType = 1; unsigned LastLegalIntegerType = 1; + bool isDereferenceable = true; bool DoIntegerTruncate = false; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); @@ -12841,6 +12890,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; + + if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) + isDereferenceable = false; + // Find a legal type for the vector store. EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); bool IsFastSt, IsFastLd; @@ -12926,11 +12979,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); + MachineMemOperand::Flags MMOFlags = isDereferenceable ? 
+ MachineMemOperand::MODereferenceable: + MachineMemOperand::MONone; + SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoadAlign); + FirstLoad->getPointerInfo(), FirstLoadAlign, + MMOFlags); NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstStoreAlign); @@ -12940,7 +12998,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), - JointMemOpVT, FirstLoadAlign); + JointMemOpVT, FirstLoadAlign, MMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), JointMemOpVT, @@ -15013,6 +15071,11 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); + unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); + + if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) + return SDValue(); + unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> @@ -15034,11 +15097,10 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); - // Attempt to match a 'truncate_vector_inreg' shuffle, we just search for - // power-of-2 truncations as they are the most likely. - for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) - if (isTruncate(Scale)) - return DAG.getBitcast(VT, N00); + // We can remove *extend_vector_inreg only if the truncation happens at + // the same scale as the extension. + if (isTruncate(ExtScale)) + return DAG.getBitcast(VT, N00); return SDValue(); } @@ -16540,8 +16602,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; // Check for BaseIndexOffset matching. - BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr()); - BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr()); + BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); + BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); int64_t PtrDiff; if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); @@ -16751,7 +16813,7 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) @@ -16777,7 +16839,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { break; // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); + BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); // Check that the base pointer is the same as the original one. 
if (!BasePtr.equalBaseIndex(Ptr, DAG)) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 75fec7bd1d48..ac3247948169 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1827,11 +1827,10 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); - if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1864,13 +1863,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); - - if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { - SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); - return; - } - SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1885,14 +1877,9 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - - SDValue Borrow; - if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) - Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); - else - Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); - + SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, + DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index d2e0dbbf88ec..4e899ae6668e 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -11,6 +11,7 @@ #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -18,28 +19,41 @@ namespace llvm { bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG, int64_t &Off) { - // Obvious equivalent + // Initial Offset difference. Off = Other.Offset - Offset; - if (Other.Base == Base && Other.Index == Index && - Other.IsIndexSignExt == IsIndexSignExt) - return true; - // Match GlobalAddresses - if (Index == Other.Index) - if (GlobalAddressSDNode *A = dyn_cast<GlobalAddressSDNode>(Base)) - if (GlobalAddressSDNode *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) + if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) { + // Trivial match. + if (Other.Base == Base) + return true; + + // Match GlobalAddresses + if (auto *A = dyn_cast<GlobalAddressSDNode>(Base)) + if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) if (A->getGlobal() == B->getGlobal()) { Off += B->getOffset() - A->getOffset(); return true; } - // TODO: we should be able to add FrameIndex analysis improvements here. 
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // Match non-equal FrameIndexes - a FrameIndex stemming from an + // alloca will not have it's ObjectOffset set until post-DAG and + // as such we must assume the two framesIndices are incomparable. + if (auto *A = dyn_cast<FrameIndexSDNode>(Base)) + if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base)) + if (!MFI.getObjectAllocation(A->getIndex()) && + !MFI.getObjectAllocation(B->getIndex())) { + Off += MFI.getObjectOffset(B->getIndex()) - + MFI.getObjectOffset(A->getIndex()); + return true; + } + } return false; } /// Parses tree in Ptr for base, index, offset addresses. -BaseIndexOffset BaseIndexOffset::match(SDValue Ptr) { +BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) { // (((B + I*M) + c)) + c ... SDValue Base = Ptr; SDValue Index = SDValue(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f9f431db55be..acf68fbbdedf 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3375,7 +3375,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue IdxN = getValue(Idx); if (!IdxN.getValueType().isVector() && VectorWidth) { - MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth); + EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth); IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); } diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index b1918b19e1df..817e58ce59e1 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -1,4 +1,4 @@ -//===-- TargetPassConfig.cpp - Target independent code generation passes --===// +//===- TargetPassConfig.cpp - Target independent code generation passes ---===// // // The LLVM Compiler Infrastructure // @@ -13,29 +13,37 @@ //===---------------------------------------------------------------------===// #include "llvm/CodeGen/TargetPassConfig.h" - +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" #include "llvm/Analysis/CFLSteensAliasAnalysis.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Threading.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" +#include <cassert> +#include <string> using namespace llvm; @@ -225,6 +233,7 @@ char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; 
namespace { + struct InsertedPass { AnalysisID TargetPassID; IdentifyingPassPtr InsertedPassID; @@ -245,9 +254,11 @@ struct InsertedPass { return NP; } }; -} + +} // end anonymous namespace namespace llvm { + class PassConfigImpl { public: // List of passes explicitly substituted by this target. Normally this is @@ -263,7 +274,8 @@ public: /// is inserted after each instance of the first one. SmallVector<InsertedPass, 4> InsertedPasses; }; -} // namespace llvm + +} // end namespace llvm // Out of line virtual method. TargetPassConfig::~TargetPassConfig() { @@ -273,11 +285,7 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) - : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), - AddingMachinePasses(false), TM(&TM), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true), - RequireCodeGenSCCOrder(false) { - + : ImmutablePass(ID), PM(&pm), TM(&TM) { Impl = new PassConfigImpl(); // Register all target independent codegen passes to activate their PassIDs, @@ -325,7 +333,7 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(nullptr) { + : ImmutablePass(ID) { report_fatal_error("Trying to construct TargetPassConfig without a target " "machine. Scheduling a CodeGen pass without a target " "triple set?"); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 552a89f76ca2..83c00e24d14f 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule", cl::desc("Coalesce copies by rescheduling (default=true)"), cl::init(true), cl::Hidden); +// Limit the number of dataflow edges to traverse when evaluating the benefit +// of commuting operands. +static cl::opt<unsigned> MaxDataFlowEdge( + "dataflow-edge-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum number of dataflow edges to traverse when evaluating " + "the benefit of commuting operands")); + namespace { class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunction *MF; @@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, // To more generally minimize register copies, ideally the logic of two addr // instruction pass should be integrated with register allocation pass where // interference graph is available. 
- if (isRevCopyChain(regC, regA, 3)) + if (isRevCopyChain(regC, regA, MaxDataFlowEdge)) return true; - if (isRevCopyChain(regB, regA, 3)) + if (isRevCopyChain(regB, regA, MaxDataFlowEdge)) return false; // Since there are no intervening uses for both registers, then commute diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp index d058f4864975..e0c7ef58c304 100644 --- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp @@ -29,10 +29,8 @@ static Error visitKnownRecord(CVSymbol &Record, return Error::success(); } -Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { - if (auto EC = Callbacks.visitSymbolBegin(Record)) - return EC; - +static Error finishVisitation(CVSymbol &Record, + SymbolVisitorCallbacks &Callbacks) { switch (Record.Type) { default: if (auto EC = Callbacks.visitUnknownSymbol(Record)) @@ -55,6 +53,18 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { return Error::success(); } +Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { + if (auto EC = Callbacks.visitSymbolBegin(Record)) + return EC; + return finishVisitation(Record, Callbacks); +} + +Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record, uint32_t Offset) { + if (auto EC = Callbacks.visitSymbolBegin(Record, Offset)) + return EC; + return finishVisitation(Record, Callbacks); +} + Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) { for (auto I : Symbols) { if (auto EC = visitSymbolRecord(I)) @@ -62,3 +72,13 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) { } return Error::success(); } + +Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols, + uint32_t InitialOffset) { + for (auto I : Symbols) { + if (auto EC = visitSymbolRecord(I, InitialOffset)) + return EC; + InitialOffset += I.length(); + } + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp index c31b8d1c96d5..ccc20eb74887 100644 --- a/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugChecksumsSubsection.cpp ----------------------*- C++ -*-===// +//===- DebugChecksumsSubsection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,10 +8,17 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" - -#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; @@ -25,7 +32,7 @@ struct FileChecksumEntryHeader { // Checksum bytes follow. 
}; -Error llvm::VarStreamArrayExtractor<FileChecksumEntry>:: +Error VarStreamArrayExtractor<FileChecksumEntry>:: operator()(BinaryStreamRef Stream, uint32_t &Len, FileChecksumEntry &Item) { BinaryStreamReader Reader(Stream); @@ -48,6 +55,7 @@ Error DebugChecksumsSubsectionRef::initialize(BinaryStreamReader Reader) { return Error::success(); } + Error DebugChecksumsSubsectionRef::initialize(BinaryStreamRef Section) { BinaryStreamReader Reader(Section); return initialize(Reader); diff --git a/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp b/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp index 21e2cc56075b..cef27787cfd1 100644 --- a/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugCrossExSubsection.cpp -------------------------------*- C++ -*-===// +//===- DebugCrossExSubsection.cpp -----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,8 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" - #include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp index 2c4a0b779342..88c0076915b5 100644 --- a/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugCrossImpSubsection.cpp ------------------------------*- C++ -*-===// +//===- DebugCrossImpSubsection.cpp ----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,14 +8,21 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cstdint> +#include <utility> +#include <vector> using namespace llvm; using namespace llvm::codeview; -namespace llvm { Error VarStreamArrayExtractor<CrossModuleImportItem>:: operator()(BinaryStreamRef Stream, uint32_t &Len, codeview::CrossModuleImportItem &Item) { @@ -34,7 +41,6 @@ operator()(BinaryStreamRef Stream, uint32_t &Len, return EC; return Error::success(); } -} Error DebugCrossModuleImportsSubsectionRef::initialize( BinaryStreamReader Reader) { diff --git a/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp index e7719d05dbdc..077c103a615b 100644 --- a/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugInlineeLinesSubsection.cpp ------------------------*- C++-*-===// +//===- DebugInlineeLinesSubsection.cpp ------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,11 +8,15 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" - -#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/ADT/ArrayRef.h" 
+#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp index fbcad61d60a6..57ad40819fbc 100644 --- a/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugLinesSubsection.cpp -------------------------------*- C++-*-===// +//===- DebugLinesSubsection.cpp -------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,18 +8,21 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; Error LineColumnExtractor::operator()(BinaryStreamRef Stream, uint32_t &Len, LineColumnEntry &Item) { - using namespace codeview; const LineBlockFragmentHeader *BlockHeader; BinaryStreamReader Reader(Stream); if (auto EC = Reader.readObject(BlockHeader)) diff --git a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp index de02525270c4..d723282eb715 100644 --- a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugStringTableSubsection.cpp - CodeView String Table ---*- C++ -*-===// +//===- DebugStringTableSubsection.cpp - CodeView String Table -------------===// // // The LLVM Compiler Infrastructure // @@ -8,10 +8,14 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" - -#include "llvm/Support/BinaryStream.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; @@ -23,6 +27,7 @@ Error DebugStringTableSubsectionRef::initialize(BinaryStreamRef Contents) { Stream = Contents; return Error::success(); } + Error DebugStringTableSubsectionRef::initialize(BinaryStreamReader &Reader) { return Reader.readStreamRef(Stream); } diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp index d69eca018e0c..55f343c11e7f 100644 --- a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp +++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp @@ -1,4 +1,4 @@ -//===- DebugSubsectionRecord.cpp -----------------------------*- 
C++-*-===// +//===- DebugSubsectionRecord.cpp ------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" - #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; -DebugSubsectionRecord::DebugSubsectionRecord() - : Container(CodeViewContainer::ObjectFile), - Kind(DebugSubsectionKind::None) {} +DebugSubsectionRecord::DebugSubsectionRecord() = default; DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind, BinaryStreamRef Data, diff --git a/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp b/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp index 5f91b68f3ad8..60fbf9d747b2 100644 --- a/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugSymbolRVASubsection.cpp ------------------------------*- C++-*-===// +//===- DebugSymbolRVASubsection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,11 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp index ec00af28395e..4cfb55a31b35 100644 --- a/lib/DebugInfo/CodeView/EnumTables.cpp +++ b/lib/DebugInfo/CodeView/EnumTables.cpp @@ -1,4 +1,4 @@ -//===- EnumTables.cpp - Enum to string conversion tables --------*- C++ -*-===// +//===- EnumTables.cpp - Enum to string conversion tables ------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/Support/ScopedPrinter.h" +#include <type_traits> using namespace llvm; using namespace codeview; @@ -333,6 +335,7 @@ static const EnumEntry<COFF::SectionCharacteristics> namespace llvm { namespace codeview { + ArrayRef<EnumEntry<SymbolKind>> getSymbolTypeNames() { return makeArrayRef(SymbolTypeNames); } @@ -348,48 +351,63 @@ ArrayRef<EnumEntry<uint16_t>> getRegisterNames() { ArrayRef<EnumEntry<uint32_t>> getPublicSymFlagNames() { return makeArrayRef(PublicSymFlagNames); } + ArrayRef<EnumEntry<uint8_t>> getProcSymFlagNames() { return makeArrayRef(ProcSymFlagNames); } + ArrayRef<EnumEntry<uint16_t>> getLocalFlagNames() { return makeArrayRef(LocalFlags); } + ArrayRef<EnumEntry<uint8_t>> getFrameCookieKindNames() { return makeArrayRef(FrameCookieKinds); } + ArrayRef<EnumEntry<SourceLanguage>> getSourceLanguageNames() { return makeArrayRef(SourceLanguages); } + ArrayRef<EnumEntry<uint32_t>> getCompileSym2FlagNames() { return makeArrayRef(CompileSym2FlagNames); } + ArrayRef<EnumEntry<uint32_t>> getCompileSym3FlagNames() { return makeArrayRef(CompileSym3FlagNames); } + 
ArrayRef<EnumEntry<uint32_t>> getFileChecksumNames() { return makeArrayRef(FileChecksumNames); } + ArrayRef<EnumEntry<unsigned>> getCPUTypeNames() { return makeArrayRef(CPUTypeNames); } + ArrayRef<EnumEntry<uint32_t>> getFrameProcSymFlagNames() { return makeArrayRef(FrameProcSymFlagNames); } + ArrayRef<EnumEntry<uint16_t>> getExportSymFlagNames() { return makeArrayRef(ExportSymFlagNames); } + ArrayRef<EnumEntry<uint32_t>> getModuleSubstreamKindNames() { return makeArrayRef(ModuleSubstreamKindNames); } + ArrayRef<EnumEntry<uint8_t>> getThunkOrdinalNames() { return makeArrayRef(ThunkOrdinalNames); } + ArrayRef<EnumEntry<uint16_t>> getTrampolineNames() { return makeArrayRef(TrampolineNames); } + ArrayRef<EnumEntry<COFF::SectionCharacteristics>> getImageSectionCharacteristicNames() { return makeArrayRef(ImageSectionCharacteristicNames); } -} -} + +} // end namespace codeview +} // end namespace llvm diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp index ef00bd8570fa..1fa8d219d6ac 100644 --- a/lib/DebugInfo/CodeView/Formatters.cpp +++ b/lib/DebugInfo/CodeView/Formatters.cpp @@ -1,4 +1,4 @@ -//===- Formatters.cpp -------------------------------------------*- C++ -*-===// +//===- Formatters.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> using namespace llvm; using namespace llvm::codeview; @@ -19,7 +23,7 @@ GuidAdapter::GuidAdapter(StringRef Guid) GuidAdapter::GuidAdapter(ArrayRef<uint8_t> Guid) : FormatAdapter(std::move(Guid)) {} -void GuidAdapter::format(llvm::raw_ostream &Stream, StringRef Style) { +void GuidAdapter::format(raw_ostream &Stream, StringRef Style) { static const char *Lookup = "0123456789ABCDEF"; assert(Item.size() == 16 && "Expected 16-byte GUID"); diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 20f7e72c3af3..5aaf3f1453a8 100644 --- a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -1,4 +1,4 @@ -//===- LazyRandomTypeCollection.cpp ---------------------------- *- C++--*-===// +//===- LazyRandomTypeCollection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/TypeName.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/StringsAndChecksums.cpp b/lib/DebugInfo/CodeView/StringsAndChecksums.cpp index 928bf8c94f73..306af1d1ef6b 100644 --- 
a/lib/DebugInfo/CodeView/StringsAndChecksums.cpp +++ b/lib/DebugInfo/CodeView/StringsAndChecksums.cpp @@ -1,4 +1,4 @@ -//===- StringsAndChecksums.cpp ----------------------------------*- C++ -*-===// +//===- StringsAndChecksums.cpp --------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,14 +8,18 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/Error.h" +#include <cassert> using namespace llvm; using namespace llvm::codeview; -StringsAndChecksumsRef::StringsAndChecksumsRef() {} +StringsAndChecksumsRef::StringsAndChecksumsRef() = default; StringsAndChecksumsRef::StringsAndChecksumsRef( const DebugStringTableSubsectionRef &Strings) diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp index 9f2d619d1a1c..9a2e776feb75 100644 --- a/lib/DebugInfo/CodeView/SymbolSerializer.cpp +++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp @@ -1,4 +1,4 @@ -//===- SymbolSerializer.cpp -------------------------------------*- C++ -*-===// +//===- SymbolSerializer.cpp -----------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,13 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; @@ -15,7 +22,7 @@ using namespace llvm::codeview; SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator, CodeViewContainer Container) : Storage(Allocator), RecordBuffer(MaxRecordLength), - Stream(RecordBuffer, llvm::support::little), Writer(Stream), + Stream(RecordBuffer, support::little), Writer(Stream), Mapping(Writer, Container) {} Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) { diff --git a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp index 1226d5be3f3c..72cb9e2e3544 100644 --- a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp +++ b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp @@ -438,6 +438,25 @@ void llvm::codeview::discoverTypeIndices(const CVType &Type, ::discoverTypeIndices(Type.content(), Type.kind(), Refs); } +void llvm::codeview::discoverTypeIndices(const CVType &Type, + SmallVectorImpl<TypeIndex> &Indices) { + + Indices.clear(); + + SmallVector<TiReference, 4> Refs; + discoverTypeIndices(Type, Refs); + if (Refs.empty()) + return; + + BinaryStreamReader Reader(Type.content(), support::little); + for (const auto &Ref : Refs) { + Reader.setOffset(Ref.Offset); + FixedStreamArray<TypeIndex> Run; + cantFail(Reader.readArray(Run, Ref.Count)); + Indices.append(Run.begin(), Run.end()); + } +} + void llvm::codeview::discoverTypeIndices(ArrayRef<uint8_t> RecordData, SmallVectorImpl<TiReference> &Refs) { const RecordPrefix *P = diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp index 93c1198e36ce..003c13b4a20d 100644 --- a/lib/DebugInfo/CodeView/TypeSerializer.cpp +++ 
b/lib/DebugInfo/CodeView/TypeSerializer.cpp @@ -1,4 +1,4 @@ -//===- TypeSerialzier.cpp ---------------------------------------*- C++ -*-===// +//===- TypeSerialzier.cpp -------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,27 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/TypeSerializer.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamWriter.h" - -#include <string.h> +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; namespace { + struct HashedType { uint64_t Hash; const uint8_t *Data; @@ -30,20 +41,26 @@ struct HashedType { struct HashedTypePtr { HashedTypePtr() = default; HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {} + HashedType *Ptr = nullptr; }; -} // namespace + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<HashedTypePtr> { static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); } + static inline HashedTypePtr getTombstoneKey() { return HashedTypePtr(reinterpret_cast<HashedType *>(1)); } + static unsigned getHashValue(HashedTypePtr Val) { assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr); return Val.Ptr->Hash; } + static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) { HashedType *LHS = LHSP.Ptr; HashedType *RHS = RHSP.Ptr; @@ -54,7 +71,8 @@ template <> struct DenseMapInfo<HashedTypePtr> { return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0; } }; -} + +} // end namespace llvm /// Private implementation so that we don't leak our DenseMap instantiations to /// users. @@ -159,13 +177,13 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) { TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage, bool Hash) : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2), - Stream(RecordBuffer, llvm::support::little), Writer(Stream), + Stream(RecordBuffer, support::little), Writer(Stream), Mapping(Writer) { // RecordBuffer needs to be able to hold enough data so that if we are 1 // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes, // we won't overflow. 
if (Hash) - Hasher = make_unique<TypeHasher>(Storage); + Hasher = llvm::make_unique<TypeHasher>(Storage); } TypeSerializer::~TypeSerializer() = default; @@ -331,7 +349,7 @@ Error TypeSerializer::visitMemberEnd(CVMemberRecord &Record) { uint8_t *SegmentBytes = RecordStorage.Allocate<uint8_t>(LengthWithSize); auto SavedSegment = MutableArrayRef<uint8_t>(SegmentBytes, LengthWithSize); - MutableBinaryByteStream CS(SavedSegment, llvm::support::little); + MutableBinaryByteStream CS(SavedSegment, support::little); BinaryStreamWriter CW(CS); if (auto EC = CW.writeBytes(CopyData)) return EC; diff --git a/lib/DebugInfo/DWARF/CMakeLists.txt b/lib/DebugInfo/DWARF/CMakeLists.txt index 6ca6e64bd8e6..11f94509e8fa 100644 --- a/lib/DebugInfo/DWARF/CMakeLists.txt +++ b/lib/DebugInfo/DWARF/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMDebugInfoDWARF DWARFAcceleratorTable.cpp DWARFCompileUnit.cpp DWARFContext.cpp + DWARFDataExtractor.cpp DWARFDebugAbbrev.cpp DWARFDebugArangeSet.cpp DWARFDebugAranges.cpp diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 87009bf1b6a1..9ae7c9a07f76 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -62,6 +62,45 @@ uint32_t DWARFAcceleratorTable::getHeaderDataLength() { return Hdr.HeaderDataLength; } +ArrayRef<std::pair<DWARFAcceleratorTable::HeaderData::AtomType, + DWARFAcceleratorTable::HeaderData::Form>> +DWARFAcceleratorTable::getAtomsDesc() { + return HdrData.Atoms; +} + +bool DWARFAcceleratorTable::validateForms() { + for (auto Atom : getAtomsDesc()) { + DWARFFormValue FormValue(Atom.second); + switch (Atom.first) { + case dwarf::DW_ATOM_die_offset: + if ((!FormValue.isFormClass(DWARFFormValue::FC_Constant) && + !FormValue.isFormClass(DWARFFormValue::FC_Flag)) || + FormValue.getForm() == dwarf::DW_FORM_sdata) + return false; + default: + break; + } + } + return true; +} + +uint32_t DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) { + uint32_t DieOffset = dwarf::DW_INVALID_OFFSET; + + for (auto Atom : getAtomsDesc()) { + DWARFFormValue FormValue(Atom.second); + FormValue.extractValue(AccelSection, &HashDataOffset, NULL); + switch (Atom.first) { + case dwarf::DW_ATOM_die_offset: + DieOffset = *FormValue.getAsUnsignedConstant(); + break; + default: + break; + } + } + return DieOffset; +} + LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const { // Dump the header. 
OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n' @@ -121,8 +160,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const { continue; } while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) { - unsigned StringOffset = - getRelocatedValue(AccelSection, 4, &DataOffset, &Relocs); + unsigned StringOffset = AccelSection.getRelocatedValue(4, &DataOffset); if (!StringOffset) break; OS << format(" Name: %08x \"%s\"\n", StringOffset, diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 381479461750..a18d4efec07a 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -59,26 +59,13 @@ using DWARFLineTable = DWARFDebugLine::LineTable; using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind; using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind; -uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs, - uint64_t *SectionIndex) { - if (!Relocs) - return Data.getUnsigned(Off, Size); - RelocAddrMap::const_iterator AI = Relocs->find(*Off); - if (AI == Relocs->end()) - return Data.getUnsigned(Off, Size); - if (SectionIndex) - *SectionIndex = AI->second.SectionIndex; - return Data.getUnsigned(Off, Size) + AI->second.Value; -} - static void dumpAccelSection(raw_ostream &OS, StringRef Name, const DWARFSection& Section, StringRef StringSection, bool LittleEndian) { - DataExtractor AccelSection(Section.Data, LittleEndian, 0); + DWARFDataExtractor AccelSection(Section, LittleEndian, 0); DataExtractor StrData(StringSection, LittleEndian, 0); OS << "\n." << Name << " contents:\n"; - DWARFAcceleratorTable Accel(AccelSection, StrData, Section.Relocs); + DWARFAcceleratorTable Accel(AccelSection, StrData); if (!Accel.extract()) return; Accel.dump(OS); @@ -88,7 +75,7 @@ static void dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, const DWARFSection &StringOffsetsSection, StringRef StringSection, bool LittleEndian) { - DataExtractor StrOffsetExt(StringOffsetsSection.Data, LittleEndian, 0); + DWARFDataExtractor StrOffsetExt(StringOffsetsSection, LittleEndian, 0); uint32_t Offset = 0; uint64_t SectionSize = StringOffsetsSection.Data.size(); @@ -144,8 +131,8 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, while (Offset - ContributionBase < ContributionSize) { OS << format("0x%8.8x: ", Offset); // FIXME: We can only extract strings in DWARF32 format at the moment. 
- uint64_t StringOffset = getRelocatedValue( - StrOffsetExt, EntrySize, &Offset, &StringOffsetsSection.Relocs); + uint64_t StringOffset = + StrOffsetExt.getRelocatedValue(EntrySize, &Offset); if (Format == DWARF32) { OS << format("%8.8x ", StringOffset); uint32_t StringOffset32 = (uint32_t)StringOffset; @@ -287,11 +274,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { if (!CUDIE) continue; if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list))) { - DataExtractor lineData(getLineSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor lineData(getLineSection(), isLittleEndian(), + savedAddressByteSize); DWARFDebugLine::LineTable LineTable; uint32_t Offset = *StmtOffset; - LineTable.parse(lineData, &getLineSection().Relocs, &Offset); + LineTable.parse(lineData, &Offset); LineTable.dump(OS); } } @@ -310,8 +297,8 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { OS << "\n.debug_line.dwo contents:\n"; unsigned stmtOffset = 0; - DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor lineData(getLineDWOSection(), isLittleEndian(), + savedAddressByteSize); DWARFDebugLine::LineTable LineTable; while (LineTable.Prologue.parse(lineData, &stmtOffset)) { LineTable.dump(OS); @@ -348,11 +335,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { // sizes, but for simplicity we just use the address byte size of the last // compile unit (there is no easy and fast way to associate address range // list and the compile unit it describes). - DataExtractor rangesData(getRangeSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor rangesData(getRangeSection(), isLittleEndian(), + savedAddressByteSize); offset = 0; DWARFDebugRangeList rangeList; - while (rangeList.extract(rangesData, &offset, getRangeSection().Relocs)) + while (rangeList.extract(rangesData, &offset)) rangeList.dump(OS); } @@ -499,11 +486,13 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() { if (Loc) return Loc.get(); - DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0); - Loc.reset(new DWARFDebugLoc(getLocSection().Relocs)); + Loc.reset(new DWARFDebugLoc); // assume all compile units have the same address byte size - if (getNumCompileUnits()) - Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize()); + if (getNumCompileUnits()) { + DWARFDataExtractor LocData(getLocSection(), isLittleEndian(), + getCompileUnitAtIndex(0)->getAddressByteSize()); + Loc->parse(LocData); + } return Loc.get(); } @@ -570,7 +559,7 @@ const DWARFDebugMacro *DWARFContext::getDebugMacro() { const DWARFLineTable * DWARFContext::getLineTableForUnit(DWARFUnit *U) { if (!Line) - Line.reset(new DWARFDebugLine(&getLineSection().Relocs)); + Line.reset(new DWARFDebugLine); auto UnitDIE = U->getUnitDIE(); if (!UnitDIE) @@ -586,12 +575,12 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) { return lt; // Make sure the offset is good before we try to parse. - if (stmtOffset >= U->getLineSection().size()) + if (stmtOffset >= U->getLineSection().Data.size()) return nullptr; // We have to parse it first. 
- DataExtractor lineData(U->getLineSection(), isLittleEndian(), - U->getAddressByteSize()); + DWARFDataExtractor lineData(U->getLineSection(), isLittleEndian(), + U->getAddressByteSize()); return Line->getOrParseLineTable(lineData, stmtOffset); } @@ -870,13 +859,13 @@ static Expected<SymInfo> getSymbolInfo(const object::ObjectFile &Obj, Expected<uint64_t> SymAddrOrErr = Sym->getAddress(); if (!SymAddrOrErr) - return createError("error: failed to compute symbol address: ", + return createError("failed to compute symbol address: ", SymAddrOrErr.takeError()); // Also remember what section this symbol is in for later auto SectOrErr = Sym->getSection(); if (!SectOrErr) - return createError("error: failed to get symbol section: ", + return createError("failed to get symbol section: ", SectOrErr.takeError()); RSec = *SectOrErr; @@ -937,8 +926,14 @@ Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec, return Error::success(); } -DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, - const LoadedObjectInfo *L) +ErrorPolicy DWARFContextInMemory::defaultErrorHandler(Error E) { + errs() << "error: " + toString(std::move(E)) << '\n'; + return ErrorPolicy::Continue; +} + +DWARFContextInMemory::DWARFContextInMemory( + const object::ObjectFile &Obj, const LoadedObjectInfo *L, + function_ref<ErrorPolicy(Error)> HandleError) : FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()), AddressSize(Obj.getBytesInAddress()) { for (const SectionRef &Section : Obj.sections()) { @@ -961,9 +956,10 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, Section.getContents(data); if (auto Err = maybeDecompress(Section, name, data)) { - errs() << "error: failed to decompress '" + name + "', " + - toString(std::move(Err)) - << '\n'; + ErrorPolicy EP = HandleError( + createError("failed to decompress '" + name + "', ", std::move(Err))); + if (EP == ErrorPolicy::Halt) + return; continue; } @@ -1055,7 +1051,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, Expected<SymInfo> SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache); if (!SymInfoOrErr) { - errs() << toString(SymInfoOrErr.takeError()) << '\n'; + if (HandleError(SymInfoOrErr.takeError()) == ErrorPolicy::Halt) + return; continue; } @@ -1064,7 +1061,11 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (V.error()) { SmallString<32> Name; Reloc.getTypeName(Name); - errs() << "error: failed to compute relocation: " << Name << "\n"; + ErrorPolicy EP = HandleError( + createError("failed to compute relocation: " + Name + ", ", + errorCodeToError(object_error::parse_failed))); + if (EP == ErrorPolicy::Halt) + return; continue; } RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; diff --git a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp new file mode 100644 index 000000000000..001097e56c71 --- /dev/null +++ b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp @@ -0,0 +1,24 @@ +//===- DWARFDataExtractor.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" + +using namespace llvm; + +uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off, + uint64_t *SecNdx) const { + if (!RelocMap) + return getUnsigned(Off, Size); + RelocAddrMap::const_iterator AI = RelocMap->find(*Off); + if (AI == RelocMap->end()) + return getUnsigned(Off, Size); + if (SecNdx) + *SecNdx = AI->second.SectionIndex; + return getUnsigned(Off, Size) + AI->second.Value; +} diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 1551974b822a..976bc4651ae6 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -21,13 +21,13 @@ using namespace dwarf; bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr) { - DataExtractor DebugInfoData = U.getDebugInfoExtractor(); + DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor(); const uint32_t UEndOffset = U.getNextUnitOffset(); return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0); } bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, - const DataExtractor &DebugInfoData, + const DWARFDataExtractor &DebugInfoData, uint32_t UEndOffset, uint32_t D) { Offset = *OffsetPtr; Depth = D; diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index ad5647f3e03d..7d180564e9f7 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -94,8 +94,8 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const { // Parse v2-v4 directory and file tables. static void -parseV2DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, - uint64_t EndPrologueOffset, +parseV2DirFileTables(const DWARFDataExtractor &DebugLineData, + uint32_t *OffsetPtr, uint64_t EndPrologueOffset, std::vector<StringRef> &IncludeDirectories, std::vector<DWARFDebugLine::FileNameEntry> &FileNames) { while (*OffsetPtr < EndPrologueOffset) { @@ -122,7 +122,7 @@ parseV2DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, // Returns the descriptors, or an empty vector if we did not find a path or // ran off the end of the prologue. 
static ContentDescriptors -parseV5EntryFormat(DataExtractor DebugLineData, uint32_t *OffsetPtr, +parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr, uint64_t EndPrologueOffset) { ContentDescriptors Descriptors; int FormatCount = DebugLineData.getU8(OffsetPtr); @@ -142,8 +142,8 @@ parseV5EntryFormat(DataExtractor DebugLineData, uint32_t *OffsetPtr, } static bool -parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, - uint64_t EndPrologueOffset, +parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, + uint32_t *OffsetPtr, uint64_t EndPrologueOffset, const DWARFFormParams &FormParams, std::vector<StringRef> &IncludeDirectories, std::vector<DWARFDebugLine::FileNameEntry> &FileNames) { @@ -212,7 +212,7 @@ parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, return true; } -bool DWARFDebugLine::Prologue::parse(DataExtractor DebugLineData, +bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr) { const uint64_t PrologueOffset = *OffsetPtr; @@ -381,20 +381,19 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const { } const DWARFDebugLine::LineTable * -DWARFDebugLine::getOrParseLineTable(DataExtractor DebugLineData, +DWARFDebugLine::getOrParseLineTable(const DWARFDataExtractor &DebugLineData, uint32_t Offset) { std::pair<LineTableIter, bool> Pos = LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable())); LineTable *LT = &Pos.first->second; if (Pos.second) { - if (!LT->parse(DebugLineData, RelocMap, &Offset)) + if (!LT->parse(DebugLineData, &Offset)) return nullptr; } return LT; } -bool DWARFDebugLine::LineTable::parse(DataExtractor DebugLineData, - const RelocAddrMap *RMap, +bool DWARFDebugLine::LineTable::parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr) { const uint32_t DebugLineOffset = *OffsetPtr; @@ -443,8 +442,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor DebugLineData, // relocatable address. All of the other statement program opcodes // that affect the address register add a delta to it. This instruction // stores a relocatable value into it instead. - State.Row.Address = getRelocatedValue( - DebugLineData, DebugLineData.getAddressSize(), OffsetPtr, RMap); + State.Row.Address = DebugLineData.getRelocatedAddress(OffsetPtr); break; case DW_LNE_define_file: diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 2178bef65d1d..c240dd7406d9 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -40,9 +40,9 @@ void DWARFDebugLoc::dump(raw_ostream &OS) const { } } -void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) { +void DWARFDebugLoc::parse(const DWARFDataExtractor &data) { uint32_t Offset = 0; - while (data.isValidOffset(Offset+AddressSize-1)) { + while (data.isValidOffset(Offset+data.getAddressSize()-1)) { Locations.resize(Locations.size() + 1); LocationList &Loc = Locations.back(); Loc.Offset = Offset; @@ -51,8 +51,8 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) { while (true) { // A beginning and ending address offsets. 
Entry E; - E.Begin = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); - E.End = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); + E.Begin = data.getRelocatedAddress(&Offset); + E.End = data.getRelocatedAddress(&Offset); // The end of any given location list is marked by an end of list entry, // which consists of a 0 for the beginning address offset and a 0 for the diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 43201293fe60..0b6ae86fd94b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -23,8 +23,8 @@ void DWARFDebugRangeList::clear() { Entries.clear(); } -bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, - const RelocAddrMap &Relocs) { +bool DWARFDebugRangeList::extract(const DWARFDataExtractor &data, + uint32_t *offset_ptr) { clear(); if (!data.isValidOffset(*offset_ptr)) return false; @@ -35,10 +35,9 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, while (true) { RangeListEntry entry; uint32_t prev_offset = *offset_ptr; - entry.StartAddress = getRelocatedValue(data, AddressSize, offset_ptr, - &Relocs, &entry.SectionIndex); - entry.EndAddress = - getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + entry.StartAddress = + data.getRelocatedAddress(offset_ptr, &entry.SectionIndex); + entry.EndAddress = data.getRelocatedAddress(offset_ptr); // Check that both values were extracted correctly. if (*offset_ptr != prev_offset + 2 * AddressSize) { diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index b4b682dd11b5..ef416f72ad17 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -308,7 +308,7 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth, unsigned Indent, DIDumpOptions DumpOpts) const { if (!isValid()) return; - DataExtractor debug_info_data = U->getDebugInfoExtractor(); + DWARFDataExtractor debug_info_data = U->getDebugInfoExtractor(); const uint32_t Offset = getOffset(); uint32_t offset = Offset; diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 861114bde1f2..83a7792e1244 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -275,7 +275,7 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { FC == FC_SectionOffset; } -bool DWARFFormValue::extractValue(const DataExtractor &Data, +bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr, const DWARFUnit *CU) { U = CU; bool Indirect = false; @@ -290,10 +290,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_ref_addr: { if (!U) return false; - uint16_t AddrSize = (Form == DW_FORM_addr) ? U->getAddressByteSize() - : U->getRefAddrByteSize(); - Value.uval = getRelocatedValue(Data, AddrSize, OffsetPtr, - U->getRelocMap(), &Value.SectionIndex); + uint16_t Size = (Form == DW_FORM_addr) ? U->getAddressByteSize() + : U->getRefAddrByteSize(); + Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex); break; } case DW_FORM_exprloc: @@ -333,11 +332,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_ref4: case DW_FORM_ref_sup4: case DW_FORM_strx4: - case DW_FORM_addrx4: { - const RelocAddrMap *RelocMap = U ? 
U->getRelocMap() : nullptr; - Value.uval = getRelocatedValue(Data, 4, OffsetPtr, RelocMap); + case DW_FORM_addrx4: + Value.uval = Data.getRelocatedValue(4, OffsetPtr); break; - } case DW_FORM_data8: case DW_FORM_ref8: case DW_FORM_ref_sup8: @@ -365,8 +362,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_strp_sup: { if (!U) return false; - Value.uval = getRelocatedValue(Data, U->getDwarfOffsetByteSize(), - OffsetPtr, U->getRelocMap()); + Value.uval = + Data.getRelocatedValue(U->getDwarfOffsetByteSize(), OffsetPtr); break; } case DW_FORM_flag_present: @@ -576,7 +573,6 @@ Optional<const char *> DWARFFormValue::getAsCString() const { uint64_t StrOffset; if (!U->getStringOffsetSectionItem(Offset, StrOffset)) return None; - StrOffset += U->getStringOffsetSectionRelocation(Offset); Offset = StrOffset; } if (const char *Str = U->getStringExtractor().getCStr(&Offset)) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index fd9c7c2b1d46..043bdb874f43 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -32,8 +32,7 @@ using namespace dwarf; void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { parseImpl(C, Section, C.getDebugAbbrev(), &C.getRangeSection(), C.getStringSection(), C.getStringOffsetSection(), - &C.getAddrSection(), C.getLineSection().Data, C.isLittleEndian(), - false); + &C.getAddrSection(), C.getLineSection(), C.isLittleEndian(), false); } void DWARFUnitSectionBase::parseDWO(DWARFContext &C, @@ -41,15 +40,15 @@ void DWARFUnitSectionBase::parseDWO(DWARFContext &C, DWARFUnitIndex *Index) { parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &C.getRangeDWOSection(), C.getStringDWOSection(), C.getStringOffsetDWOSection(), - &C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(), + &C.getAddrSection(), C.getLineDWOSection(), C.isLittleEndian(), true); } DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS, const DWARFSection &SOS, - const DWARFSection *AOS, StringRef LS, bool LE, bool IsDWO, - const DWARFUnitSectionBase &UnitSection, + const DWARFSection *AOS, const DWARFSection &LS, bool LE, + bool IsDWO, const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), LineSection(LS), StringSection(SS), StringOffsetSection(SOS), @@ -65,33 +64,23 @@ bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index, uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize(); if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize()) return false; - DataExtractor DA(AddrOffsetSection->Data, isLittleEndian, - getAddressByteSize()); - Result = getRelocatedValue(DA, getAddressByteSize(), &Offset, - &AddrOffsetSection->Relocs); + DWARFDataExtractor DA(*AddrOffsetSection, isLittleEndian, + getAddressByteSize()); + Result = DA.getRelocatedAddress(&Offset); return true; } bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const { - unsigned ItemSize = getFormat() == DWARF64 ? 8 : 4; + unsigned ItemSize = getDwarfOffsetByteSize(); uint32_t Offset = StringOffsetSectionBase + Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) return false; - DataExtractor DA(StringOffsetSection.Data, isLittleEndian, 0); - Result = ItemSize == 4 ? 
DA.getU32(&Offset) : DA.getU64(&Offset); + DWARFDataExtractor DA(StringOffsetSection, isLittleEndian, 0); + Result = DA.getRelocatedValue(ItemSize, &Offset); return true; } -uint64_t DWARFUnit::getStringOffsetSectionRelocation(uint32_t Index) const { - unsigned ItemSize = getFormat() == DWARF64 ? 8 : 4; - uint64_t ByteOffset = StringOffsetSectionBase + Index * ItemSize; - RelocAddrMap::const_iterator AI = getStringOffsetsRelocMap().find(ByteOffset); - if (AI != getStringOffsetsRelocMap().end()) - return AI->second.Value; - return 0; -} - bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { Length = debug_info.getU32(offset_ptr); // FIXME: Support DWARF64. @@ -149,14 +138,13 @@ bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) { } bool DWARFUnit::extractRangeList(uint32_t RangeListOffset, - DWARFDebugRangeList &RangeList) const { + DWARFDebugRangeList &RangeList) const { // Require that compile unit is extracted. assert(!DieArray.empty()); - DataExtractor RangesData(RangeSection->Data, isLittleEndian, - getAddressByteSize()); + DWARFDataExtractor RangesData(*RangeSection, isLittleEndian, + getAddressByteSize()); uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; - return RangeList.extract(RangesData, &ActualRangeListOffset, - RangeSection->Relocs); + return RangeList.extract(RangesData, &ActualRangeListOffset); } void DWARFUnit::clear() { @@ -190,7 +178,7 @@ void DWARFUnit::extractDIEsToVector( uint32_t DIEOffset = Offset + getHeaderSize(); uint32_t NextCUOffset = getNextUnitOffset(); DWARFDebugInfoEntry DIE; - DataExtractor DebugInfoData = getDebugInfoExtractor(); + DWARFDataExtractor DebugInfoData = getDebugInfoExtractor(); uint32_t Depth = 0; bool IsCUDie = true; diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 41907e570563..0a10e6b78911 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -280,11 +280,10 @@ bool DWARFVerifier::handleDebugLine() { bool DWARFVerifier::handleAppleNames() { NumAppleNamesErrors = 0; - DataExtractor AppleNamesSection(DCtx.getAppleNamesSection().Data, - DCtx.isLittleEndian(), 0); + DWARFDataExtractor AppleNamesSection(DCtx.getAppleNamesSection(), + DCtx.isLittleEndian(), 0); DataExtractor StrData(DCtx.getStringSection(), DCtx.isLittleEndian(), 0); - DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData, - DCtx.getAppleNamesSection().Relocs); + DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData); if (!AppleNames.extract()) { return true; @@ -292,20 +291,80 @@ bool DWARFVerifier::handleAppleNames() { OS << "Verifying .apple_names...\n"; - // Verify that all buckets have a valid hash index or are empty + // Verify that all buckets have a valid hash index or are empty. 
uint32_t NumBuckets = AppleNames.getNumBuckets(); uint32_t NumHashes = AppleNames.getNumHashes(); uint32_t BucketsOffset = AppleNames.getSizeHdr() + AppleNames.getHeaderDataLength(); + uint32_t HashesBase = BucketsOffset + NumBuckets * 4; + uint32_t OffsetsBase = HashesBase + NumHashes * 4; for (uint32_t BucketIdx = 0; BucketIdx < NumBuckets; ++BucketIdx) { uint32_t HashIdx = AppleNamesSection.getU32(&BucketsOffset); if (HashIdx >= NumHashes && HashIdx != UINT32_MAX) { - OS << format("error: Bucket[%d] has invalid hash index: [%d]\n", - BucketIdx, HashIdx); + OS << format("error: Bucket[%d] has invalid hash index: %u\n", BucketIdx, + HashIdx); ++NumAppleNamesErrors; } } + + uint32_t NumAtoms = AppleNames.getAtomsDesc().size(); + if (NumAtoms == 0) { + OS << "error: no atoms; failed to read HashData\n"; + ++NumAppleNamesErrors; + return false; + } + + if (!AppleNames.validateForms()) { + OS << "error: unsupported form; failed to read HashData\n"; + ++NumAppleNamesErrors; + return false; + } + + for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) { + uint32_t HashOffset = HashesBase + 4 * HashIdx; + uint32_t DataOffset = OffsetsBase + 4 * HashIdx; + uint32_t Hash = AppleNamesSection.getU32(&HashOffset); + uint32_t HashDataOffset = AppleNamesSection.getU32(&DataOffset); + if (!AppleNamesSection.isValidOffsetForDataOfSize(HashDataOffset, + sizeof(uint64_t))) { + OS << format("error: Hash[%d] has invalid HashData offset: 0x%08x\n", + HashIdx, HashDataOffset); + ++NumAppleNamesErrors; + } + + uint32_t StrpOffset; + uint32_t StringOffset; + uint32_t StringCount = 0; + uint32_t DieOffset = dwarf::DW_INVALID_OFFSET; + + while ((StrpOffset = AppleNamesSection.getU32(&HashDataOffset)) != 0) { + const uint32_t NumHashDataObjects = + AppleNamesSection.getU32(&HashDataOffset); + for (uint32_t HashDataIdx = 0; HashDataIdx < NumHashDataObjects; + ++HashDataIdx) { + DieOffset = AppleNames.readAtoms(HashDataOffset); + if (!DCtx.getDIEForOffset(DieOffset)) { + const uint32_t BucketIdx = + NumBuckets ? (Hash % NumBuckets) : UINT32_MAX; + StringOffset = StrpOffset; + const char *Name = StrData.getCStr(&StringOffset); + if (!Name) + Name = "<NULL>"; + + OS << format( + "error: .apple_names Bucket[%d] Hash[%d] = 0x%08x " + "Str[%u] = 0x%08x " + "DIE[%d] = 0x%08x is not a valid DIE offset for \"%s\".\n", + BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx, + DieOffset, Name); + + ++NumAppleNamesErrors; + } + } + ++StringCount; + } + } return NumAppleNamesErrors == 0; } diff --git a/lib/DebugInfo/PDB/Native/DbiModuleList.cpp b/lib/DebugInfo/PDB/Native/DbiModuleList.cpp index 434f775097e0..eea70b229c67 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleList.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleList.cpp @@ -1,4 +1,4 @@ -//===- DbiModuleList.cpp - PDB module information list ----------*- C++ -*-===// +//===- DbiModuleList.cpp - PDB module information list --------------------===// // // The LLVM Compiler Infrastructure // @@ -6,10 +6,17 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" +#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/Native/Hash.cpp b/lib/DebugInfo/PDB/Native/Hash.cpp index 2ad3f55dc5c3..61188ece2dcb 100644 --- a/lib/DebugInfo/PDB/Native/Hash.cpp +++ b/lib/DebugInfo/PDB/Native/Hash.cpp @@ -8,10 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/Hash.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/JamCRC.h" +#include <cstdint> using namespace llvm; using namespace llvm::support; diff --git a/lib/DebugInfo/PDB/Native/HashTable.cpp b/lib/DebugInfo/PDB/Native/HashTable.cpp index ebf8c9c04db1..439217f91d04 100644 --- a/lib/DebugInfo/PDB/Native/HashTable.cpp +++ b/lib/DebugInfo/PDB/Native/HashTable.cpp @@ -1,4 +1,4 @@ -//===- HashTable.cpp - PDB Hash Table ---------------------------*- C++ -*-===// +//===- HashTable.cpp - PDB Hash Table -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,16 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/HashTable.h" - #include "llvm/ADT/Optional.h" -#include "llvm/ADT/SparseBitVector.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" - -#include <assert.h> +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> using namespace llvm; using namespace llvm::pdb; @@ -106,9 +110,11 @@ void HashTable::clear() { } uint32_t HashTable::capacity() const { return Buckets.size(); } + uint32_t HashTable::size() const { return Present.count(); } HashTableIterator HashTable::begin() const { return HashTableIterator(*this); } + HashTableIterator HashTable::end() const { return HashTableIterator(*this, 0, true); } diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp index 83c56574a16e..2e1f61c7a25d 100644 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp @@ -9,11 +9,11 @@ #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" @@ -97,7 +97,7 @@ ModuleDebugStreamRef::symbols(bool *HadError) const { return make_range(SymbolArray.begin(HadError), SymbolArray.end()); } -llvm::iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> +iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> 
ModuleDebugStreamRef::subsections() const { return make_range(Subsections.begin(), Subsections.end()); } diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp +++ /dev/null diff --git a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp index 4f90cd9cd8ac..354b8c0e07ff 100644 --- a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp +++ b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp @@ -1,4 +1,4 @@ -//===- NamedStreamMap.cpp - PDB Named Stream Map ----------------*- C++ -*-===// +//===- NamedStreamMap.cpp - PDB Named Stream Map --------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,17 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" - -#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include <algorithm> +#include <cassert> #include <cstdint> +#include <tuple> using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp index c23120041164..a65782e2d4fc 100644 --- a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp +++ b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp @@ -32,9 +32,7 @@ std::unique_ptr<PDBSymbol> NativeEnumModules::getChildAtIndex(uint32_t Index) const { if (Index >= Modules.getModuleCount()) return nullptr; - return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland( - Session, std::unique_ptr<IPDBRawSymbol>(new NativeCompilandSymbol( - Session, 0, Modules.getModuleDescriptor(Index))))); + return Session.createCompilandSymbol(Modules.getModuleDescriptor(Index)); } std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() { diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index ed6db63edbab..b4f5c96ce66b 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -1,4 +1,4 @@ -//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -*- C++ -*-===// +//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/PDBExtras.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" -#include "llvm/Support/ConvertUTF.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::pdb; @@ -49,7 +40,7 @@ NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const { return nullptr; } -void 
NativeRawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const { +void NativeRawSymbol::getDataBytes(SmallVector<uint8_t, 32> &bytes) const { bytes.clear(); } @@ -109,7 +100,7 @@ uint32_t NativeRawSymbol::getClassParentId() const { } std::string NativeRawSymbol::getCompilerName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getCount() const { @@ -136,7 +127,7 @@ uint32_t NativeRawSymbol::getLexicalParentId() const { } std::string NativeRawSymbol::getLibraryName() const { - return ""; + return {}; } uint32_t NativeRawSymbol::getLiveRangeStartAddressOffset() const { @@ -164,7 +155,7 @@ uint32_t NativeRawSymbol::getMemorySpaceKind() const { } std::string NativeRawSymbol::getName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getNumberOfAcceleratorPointerTags() const { @@ -188,7 +179,7 @@ uint32_t NativeRawSymbol::getNumberOfRows() const { } std::string NativeRawSymbol::getObjectFileName() const { - return ""; + return {}; } uint32_t NativeRawSymbol::getOemId() const { @@ -240,7 +231,7 @@ uint32_t NativeRawSymbol::getSlot() const { } std::string NativeRawSymbol::getSourceFileName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getStride() const { @@ -251,7 +242,7 @@ uint32_t NativeRawSymbol::getSubTypeId() const { return 0; } -std::string NativeRawSymbol::getSymbolsFileName() const { return ""; } +std::string NativeRawSymbol::getSymbolsFileName() const { return {}; } uint32_t NativeRawSymbol::getSymIndexId() const { return SymbolId; } @@ -292,7 +283,7 @@ uint32_t NativeRawSymbol::getUavSlot() const { } std::string NativeRawSymbol::getUndecoratedName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getUnmodifiedTypeId() const { @@ -701,5 +692,5 @@ bool NativeRawSymbol::wasInlined() const { } std::string NativeRawSymbol::getUnused() const { - return ""; + return {}; } diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp index 3ab381e76e62..93d43d9ef341 100644 --- a/lib/DebugInfo/PDB/Native/NativeSession.cpp +++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -13,6 +13,7 @@ #include "llvm/DebugInfo/PDB/GenericError.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/IPDBSourceFile.h" +#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" @@ -23,8 +24,10 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" + #include <algorithm> #include <memory> +#include <utility> using namespace llvm; using namespace llvm::msf; @@ -66,12 +69,23 @@ Error NativeSession::createFromExe(StringRef Path, return make_error<RawError>(raw_error_code::feature_unsupported); } +std::unique_ptr<PDBSymbolCompiland> +NativeSession::createCompilandSymbol(DbiModuleDescriptor MI) { + const auto Id = static_cast<uint32_t>(SymbolCache.size()); + SymbolCache.push_back( + llvm::make_unique<NativeCompilandSymbol>(*this, Id, MI)); + return llvm::make_unique<PDBSymbolCompiland>( + *this, std::unique_ptr<IPDBRawSymbol>(SymbolCache[Id]->clone())); +} + uint64_t NativeSession::getLoadAddress() const { return 0; } void NativeSession::setLoadAddress(uint64_t Address) {} std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() { - auto RawSymbol = llvm::make_unique<NativeExeSymbol>(*this, 0); + const auto Id = static_cast<uint32_t>(SymbolCache.size()); + 
SymbolCache.push_back(llvm::make_unique<NativeExeSymbol>(*this, Id)); + auto RawSymbol = SymbolCache[Id]->clone(); auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol))); std::unique_ptr<PDBSymbolExe> ExeSymbol( static_cast<PDBSymbolExe *>(PdbSymbol.release())); @@ -80,7 +94,10 @@ std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() { std::unique_ptr<PDBSymbol> NativeSession::getSymbolById(uint32_t SymbolId) const { - return nullptr; + // If the caller has a SymbolId, it'd better be in our SymbolCache. + return SymbolId < SymbolCache.size() + ? PDBSymbol::create(*this, SymbolCache[SymbolId]->clone()) + : nullptr; } std::unique_ptr<PDBSymbol> diff --git a/lib/DebugInfo/PDB/PDB.cpp b/lib/DebugInfo/PDB/PDB.cpp index 7e3acc1165f3..501d4f5985b7 100644 --- a/lib/DebugInfo/PDB/PDB.cpp +++ b/lib/DebugInfo/PDB/PDB.cpp @@ -1,4 +1,4 @@ -//===- PDB.cpp - base header file for creating a PDB reader -----*- C++ -*-===// +//===- PDB.cpp - base header file for creating a PDB reader ---------------===// // // The LLVM Compiler Infrastructure // @@ -8,18 +8,14 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDB.h" - #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/DebugInfo/PDB/GenericError.h" -#include "llvm/DebugInfo/PDB/IPDBSession.h" -#include "llvm/DebugInfo/PDB/PDB.h" #if LLVM_ENABLE_DIA_SDK #include "llvm/DebugInfo/PDB/DIA/DIASession.h" #endif #include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Error.h" using namespace llvm; using namespace llvm::pdb; @@ -33,7 +29,7 @@ Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path, #if LLVM_ENABLE_DIA_SDK return DIASession::createFromPdb(Path, Session); #else - return llvm::make_error<GenericError>("DIA is not installed on the system"); + return make_error<GenericError>("DIA is not installed on the system"); #endif } @@ -46,6 +42,6 @@ Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path, #if LLVM_ENABLE_DIA_SDK return DIASession::createFromExe(Path, Session); #else - return llvm::make_error<GenericError>("DIA is not installed on the system"); + return make_error<GenericError>("DIA is not installed on the system"); #endif } diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp index dc22a30facab..faf1142ddf17 100644 --- a/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/lib/DebugInfo/PDB/PDBExtras.cpp @@ -1,4 +1,4 @@ -//===- PDBExtras.cpp - helper functions and classes for PDBs -----*- C++-*-===// +//===- PDBExtras.cpp - helper functions and classes for PDBs --------------===// // // The LLVM Compiler Infrastructure // @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDBExtras.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/UDTLayout.cpp b/lib/DebugInfo/PDB/UDTLayout.cpp index da353cb6977c..5f4390bbaf12 100644 --- a/lib/DebugInfo/PDB/UDTLayout.cpp +++ b/lib/DebugInfo/PDB/UDTLayout.cpp @@ -1,4 +1,4 @@ -//===- UDTLayout.cpp --------------------------------------------*- C++ -*-===// +//===- UDTLayout.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,20 +8,25 @@ 
//===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/UDTLayout.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" -#include "llvm/DebugInfo/PDB/PDBSymbolExe.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" - -#include <utility> +#include "llvm/DebugInfo/PDB/PDBTypes.h" +#include "llvm/Support/Casting.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> using namespace llvm; using namespace llvm::pdb; @@ -176,7 +181,6 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { else Bases.push_back(std::move(Base)); } - else if (auto Data = unique_dyn_cast<PDBSymbolData>(Child)) { if (Data->getDataKind() == PDB_DataKind::Member) Members.push_back(std::move(Data)); @@ -296,4 +300,4 @@ void UDTLayoutBase::addChildToLayout(std::unique_ptr<LayoutItemBase> Child) { } ChildStorage.push_back(std::move(Child)); -}
\ No newline at end of file +} diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index b20690c7caaf..690276232a6f 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -193,11 +193,11 @@ public: } auto *MPtr = M.release(); ShouldDelete[MPtr] = true; - auto Deleter = - [this](Module *Mod) { - if (ShouldDelete[Mod]) - delete Mod; - }; + auto Deleter = [this](Module *Mod) { + auto I = ShouldDelete.find(Mod); + if (I != ShouldDelete.end() && I->second) + delete Mod; + }; LocalModules.push_back(std::shared_ptr<Module>(MPtr, std::move(Deleter))); LazyEmitLayer.addModule(LocalModules.back(), &MemMgr, &Resolver); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 3d12eadea4dd..8b6f9bef66df 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -78,11 +78,11 @@ public: void updateSymbolAddress(const SymbolRef &SymRef, uint64_t Addr); // Methods for type inquiry through isa, cast and dyn_cast - static inline bool classof(const Binary *v) { + static bool classof(const Binary *v) { return (isa<ELFObjectFile<ELFT>>(v) && classof(cast<ELFObjectFile<ELFT>>(v))); } - static inline bool classof(const ELFObjectFile<ELFT> *v) { + static bool classof(const ELFObjectFile<ELFT> *v) { return v->isDyldType(); } }; diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index 27150a89d9b2..d387a6f0ecb9 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -716,7 +716,7 @@ bool ConstantFP::isExactlyValue(const APFloat &V) const { /// Remove the constant from the constant table. void ConstantFP::destroyConstantImpl() { - llvm_unreachable("You can't ConstantInt->destroyConstantImpl()!"); + llvm_unreachable("You can't ConstantFP->destroyConstantImpl()!"); } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index 37e735251fdf..9bd0e297f4ef 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -63,15 +63,22 @@ bool BasicBlockEdge::isSingleEdge() const { template class llvm::DomTreeNodeBase<BasicBlock>; template class llvm::DominatorTreeBase<BasicBlock>; -template void llvm::Calculate<Function, BasicBlock *>( +template void llvm::DomTreeBuilder::Calculate<Function, BasicBlock *>( DominatorTreeBase< typename std::remove_pointer<GraphTraits<BasicBlock *>::NodeRef>::type> &DT, Function &F); -template void llvm::Calculate<Function, Inverse<BasicBlock *>>( +template void llvm::DomTreeBuilder::Calculate<Function, Inverse<BasicBlock *>>( DominatorTreeBase<typename std::remove_pointer< GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT, Function &F); +template bool llvm::DomTreeBuilder::Verify<BasicBlock *>( + const DominatorTreeBase< + typename std::remove_pointer<GraphTraits<BasicBlock *>::NodeRef>::type> + &DT); +template bool llvm::DomTreeBuilder::Verify<Inverse<BasicBlock *>>( + const DominatorTreeBase<typename std::remove_pointer< + GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT); bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { @@ -285,6 +292,13 @@ bool DominatorTree::isReachableFromEntry(const Use &U) const { } void DominatorTree::verifyDomTree() const { + // Perform the expensive checks only when VerifyDomInfo is set. 
+ if (VerifyDomInfo && !verify()) { + errs() << "\n~~~~~~~~~~~\n\t\tDomTree verification failed!\n~~~~~~~~~~~\n"; + print(errs()); + abort(); + } + Function &F = *getRoot()->getParent(); DominatorTree OtherDT; diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index ad0d4470c111..2e13f362344d 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -125,11 +125,18 @@ void LLVMContext::setDiagnosticHandler(DiagnosticHandlerTy DiagnosticHandler, pImpl->RespectDiagnosticFilters = RespectFilters; } -void LLVMContext::setDiagnosticHotnessRequested(bool Requested) { - pImpl->DiagnosticHotnessRequested = Requested; +void LLVMContext::setDiagnosticsHotnessRequested(bool Requested) { + pImpl->DiagnosticsHotnessRequested = Requested; } -bool LLVMContext::getDiagnosticHotnessRequested() const { - return pImpl->DiagnosticHotnessRequested; +bool LLVMContext::getDiagnosticsHotnessRequested() const { + return pImpl->DiagnosticsHotnessRequested; +} + +void LLVMContext::setDiagnosticsHotnessThreshold(uint64_t Threshold) { + pImpl->DiagnosticsHotnessThreshold = Threshold; +} +uint64_t LLVMContext::getDiagnosticsHotnessThreshold() const { + return pImpl->DiagnosticsHotnessThreshold; } yaml::Output *LLVMContext::getDiagnosticsOutputFile() { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index 4147f71ad9d2..395beb57fe37 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -1169,7 +1169,8 @@ public: LLVMContext::DiagnosticHandlerTy DiagnosticHandler = nullptr; void *DiagnosticContext = nullptr; bool RespectDiagnosticFilters = false; - bool DiagnosticHotnessRequested = false; + bool DiagnosticsHotnessRequested = false; + uint64_t DiagnosticsHotnessThreshold = 0; std::unique_ptr<yaml::Output> DiagnosticsOutputFile; LLVMContext::YieldCallbackTy YieldCallback = nullptr; diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 35032fdd33e1..68b8c9fcb939 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -472,6 +472,36 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, return Error::success(); } +// Checks whether the given global value is in a non-prevailing comdat +// (comdat containing values the linker indicated were not prevailing, +// which we then dropped to available_externally), and if so, removes +// it from the comdat. This is called for all global values to ensure the +// comdat is empty rather than leaving an incomplete comdat. It is needed for +// regular LTO modules, in case we are in a mixed-LTO mode (both regular +// and thin LTO modules) compilation. Since the regular LTO module will be +// linked first in the final native link, we want to make sure the linker +// doesn't select any of these incomplete comdats that would be left +// in the regular LTO module without this cleanup. +static void +handleNonPrevailingComdat(GlobalValue &GV, + std::set<const Comdat *> &NonPrevailingComdats) { + Comdat *C = GV.getComdat(); + if (!C) + return; + + if (!NonPrevailingComdats.count(C)) + return; + + // Additionally need to drop externally visible global values from the comdat + // to available_externally, so that there aren't multiply defined linker + // errors. + if (!GV.hasLocalLinkage()) + GV.setLinkage(GlobalValue::AvailableExternallyLinkage); + + if (auto GO = dyn_cast<GlobalObject>(&GV)) + GO->setComdat(nullptr); +} + // Add a regular LTO object to the link. // The resulting module needs to be linked into the combined LTO module with // linkRegularLTO. 
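[Editor's note, not part of the patch] The LTO.cpp hunk above introduces handleNonPrevailingComdat(), which strips every member out of a comdat whose prevailing copy lives in another module and downgrades externally visible members to available_externally, so the regular-LTO module cannot end up providing an incomplete comdat at the final native link; the following hunk then invokes it over M.global_values(). A minimal standalone sketch of that pattern is shown below. It is illustrative only and not code from this commit: the main() driver, the demo symbol "foo", and the hand-built NonPrevailing set are assumptions made for the example, while the helper mirrors the logic of the patched function.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <set>

using namespace llvm;

// Drop GV out of its comdat if that comdat did not prevail, mirroring the
// handleNonPrevailingComdat() helper added in the LTO.cpp hunk (simplified).
static void dropFromNonPrevailingComdat(GlobalValue &GV,
                                        const std::set<const Comdat *> &NonPrevailing) {
  Comdat *C = GV.getComdat();
  if (!C || !NonPrevailing.count(C))
    return;
  // Externally visible members become available_externally so they cannot
  // produce multiply-defined symbols in the final link.
  if (!GV.hasLocalLinkage())
    GV.setLinkage(GlobalValue::AvailableExternallyLinkage);
  // Remove the member from the comdat so no incomplete comdat remains.
  if (auto *GO = dyn_cast<GlobalObject>(&GV))
    GO->setComdat(nullptr);
}

int main() {
  LLVMContext Ctx;
  Module M("comdat-demo", Ctx);

  // A weak_odr function placed in a comdat, as an inline function would be.
  Function *F = Function::Create(FunctionType::get(Type::getVoidTy(Ctx), false),
                                 GlobalValue::WeakODRLinkage, "foo", &M);
  Comdat *C = M.getOrInsertComdat("foo");
  C->setSelectionKind(Comdat::Any);
  F->setComdat(C);
  IRBuilder<>(BasicBlock::Create(Ctx, "entry", F)).CreateRetVoid();

  // Pretend the linker reported this comdat as non-prevailing.
  std::set<const Comdat *> NonPrevailing = {C};
  for (GlobalValue &GV : M.global_values())
    dropFromNonPrevailingComdat(GV, NonPrevailing);

  // "foo" prints as available_externally with no comdat attached.
  M.print(outs(), nullptr);
  return 0;
}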
@@ -523,6 +553,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, }; Skip(); + std::set<const Comdat *> NonPrevailingComdats; for (const InputFile::Symbol &Sym : Syms) { assert(ResI != ResE); SymbolResolution Res = *ResI++; @@ -557,6 +588,8 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // module (in linkRegularLTO), based on whether it is undefined. Mod.Keep.push_back(GV); GV->setLinkage(GlobalValue::AvailableExternallyLinkage); + if (GV->hasComdat()) + NonPrevailingComdats.insert(GV->getComdat()); cast<GlobalObject>(GV)->setComdat(nullptr); } } @@ -574,6 +607,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } + if (!M.getComdatSymbolTable().empty()) + for (GlobalValue &GV : M.global_values()) + handleNonPrevailingComdat(GV, NonPrevailingComdats); assert(MsymI == MsymE); return std::move(Mod); } @@ -1087,7 +1123,7 @@ lto::setupOptimizationRemarks(LLVMContext &Context, Context.setDiagnosticsOutputFile( llvm::make_unique<yaml::Output>(DiagnosticFile->os())); if (LTOPassRemarksWithHotness) - Context.setDiagnosticHotnessRequested(true); + Context.setDiagnosticsHotnessRequested(true); DiagnosticFile->keep(); return std::move(DiagnosticFile); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 92c5da0e9fef..0318d916aa49 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -261,9 +261,9 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, Value -= Offset; } - // Let the backend adjust the fixup value if necessary, including whether - // we need a relocation. - Backend.processFixupValue(*this, Fixup, Target, IsResolved); + // Let the backend force a relocation if needed. 
+ if (IsResolved && Backend.shouldForceRelocation(*this, Fixup, Target)) + IsResolved = false; return IsResolved; } diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 45534ba18212..82352cb50c70 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -265,7 +265,8 @@ private: uint32_t NumFuncImports); void writeCodeRelocSection(); void writeDataRelocSection(uint64_t DataSectionHeaderSize); - void writeLinkingMetaDataSection(ArrayRef<StringRef> WeakSymbols, + void writeLinkingMetaDataSection(uint32_t DataSize, uint32_t DataAlignment, + ArrayRef<StringRef> WeakSymbols, bool HasStackPointer, uint32_t StackPointerGlobal); @@ -877,11 +878,8 @@ void WasmObjectWriter::writeDataRelocSection(uint64_t DataSectionHeaderSize) { } void WasmObjectWriter::writeLinkingMetaDataSection( - ArrayRef<StringRef> WeakSymbols, bool HasStackPointer, - uint32_t StackPointerGlobal) { - if (!HasStackPointer && WeakSymbols.empty()) - return; - + uint32_t DataSize, uint32_t DataAlignment, ArrayRef<StringRef> WeakSymbols, + bool HasStackPointer, uint32_t StackPointerGlobal) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); SectionBookkeeping SubSection; @@ -902,6 +900,16 @@ void WasmObjectWriter::writeLinkingMetaDataSection( endSection(SubSection); } + if (DataSize > 0) { + startSection(SubSection, wasm::WASM_DATA_SIZE); + encodeULEB128(DataSize, getStream()); + endSection(SubSection); + + startSection(SubSection, wasm::WASM_DATA_ALIGNMENT); + encodeULEB128(DataAlignment, getStream()); + endSection(SubSection); + } + endSection(Section); } @@ -923,6 +931,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, unsigned NumFuncImports = 0; unsigned NumGlobalImports = 0; SmallVector<char, 0> DataBytes; + uint32_t DataAlignment = 1; uint32_t StackPointerGlobal = 0; bool HasStackPointer = false; @@ -1157,6 +1166,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, report_fatal_error("data sections must contain at most one variable"); DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment())); + DataAlignment = std::max(DataAlignment, DataSection.getAlignment()); DataSection.setSectionOffset(DataBytes.size()); @@ -1272,7 +1282,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, writeNameSection(Functions, Imports, NumFuncImports); writeCodeRelocSection(); writeDataRelocSection(DataSectionHeaderSize); - writeLinkingMetaDataSection(WeakSymbols, HasStackPointer, StackPointerGlobal); + writeLinkingMetaDataSection(DataBytes.size(), DataAlignment, WeakSymbols, HasStackPointer, StackPointerGlobal); // TODO: Translate the .comment section to the output. // TODO: Translate debug sections to the output. diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt index 1d08a9efd8b3..fd5e7707c541 100644 --- a/lib/Object/CMakeLists.txt +++ b/lib/Object/CMakeLists.txt @@ -27,4 +27,5 @@ add_llvm_library(LLVMObject DEPENDS intrinsics_gen + llvm_vcsrevision_h ) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 9a760d86e7e2..1e9b0c5b0454 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -883,7 +883,7 @@ base_reloc_iterator COFFObjectFile::base_reloc_end() const { } uint8_t COFFObjectFile::getBytesInAddress() const { - return getArch() == Triple::x86_64 ? 8 : 4; + return getArch() == Triple::x86_64 || getArch() == Triple::aarch64 ? 
8 : 4; } StringRef COFFObjectFile::getFileFormatName() const { @@ -1216,6 +1216,29 @@ void COFFObjectFile::getRelocationTypeName( Res = "Unknown"; } break; + case COFF::IMAGE_FILE_MACHINE_ARM64: + switch (Reloc->Type) { + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ABSOLUTE); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32NB); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH26); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEBASE_REL21); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_REL21); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12L); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_HIGH12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12L); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_TOKEN); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECTION); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR64); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH19); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH14); + default: + Res = "Unknown"; + } + break; case COFF::IMAGE_FILE_MACHINE_I386: switch (Reloc->Type) { LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE); diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp index 7bca032a7be1..7a6424a76a98 100644 --- a/lib/Object/IRSymtab.cpp +++ b/lib/Object/IRSymtab.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" #include "llvm/Support/StringSaver.h" +#include "llvm/Support/VCSRevision.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <string> @@ -43,6 +44,21 @@ using namespace irsymtab; namespace { +const char *getExpectedProducerName() { + static char DefaultName[] = LLVM_VERSION_STRING +#ifdef LLVM_REVISION + " " LLVM_REVISION +#endif + ; + // Allows for testing of the irsymtab writer and upgrade mechanism. This + // environment variable should not be set by users. + if (char *OverrideName = getenv("LLVM_OVERRIDE_PRODUCER")) + return OverrideName; + return DefaultName; +} + +const char *kExpectedProducerName = getExpectedProducerName(); + /// Stores the temporary state that is required to build an IR symbol table. struct Builder { SmallVector<char, 0> &Symtab; @@ -231,6 +247,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) { storage::Header Hdr; assert(!IRMods.empty()); + Hdr.Version = storage::Header::kCurrentVersion; + setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); TT = Triple(IRMods[0]->getTargetTriple()); @@ -300,7 +318,31 @@ Expected<FileContents> irsymtab::readBitcode(const BitcodeFileContents &BFC) { return make_error<StringError>("Bitcode file does not contain any modules", inconvertibleErrorCode()); - // Right now we have no on-disk representation of symbol tables, so we always - // upgrade. - return upgrade(BFC.Mods); + if (BFC.StrtabForSymtab.empty() || + BFC.Symtab.size() < sizeof(storage::Header)) + return upgrade(BFC.Mods); + + // We cannot use the regular reader to read the version and producer, because + // it will expect the header to be in the current format. 
The only thing we + // can rely on is that the version and producer will be present as the first + // struct elements. + auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data()); + unsigned Version = Hdr->Version; + StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab); + if (Version != storage::Header::kCurrentVersion || + Producer != kExpectedProducerName) + return upgrade(BFC.Mods); + + FileContents FC; + FC.TheReader = {{BFC.Symtab.data(), BFC.Symtab.size()}, + {BFC.StrtabForSymtab.data(), BFC.StrtabForSymtab.size()}}; + + // Finally, make sure that the number of modules in the symbol table matches + // the number of modules in the bitcode file. If they differ, it may mean that + // the bitcode file was created by binary concatenation, so we need to create + // a new symbol table from scratch. + if (FC.TheReader.getNumModules() != BFC.Mods.size()) + return upgrade(std::move(BFC.Mods)); + + return std::move(FC); } diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index d15860674aeb..fff497ba5564 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -193,6 +193,9 @@ static Error readSection(WasmSection &Section, const uint8_t *&Ptr, WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err) : ObjectFile(Binary::ID_Wasm, Buffer) { + LinkingData.DataAlignment = 0; + LinkingData.DataSize = 0; + ErrorAsOutParameter ErrAsOutParam(&Err); Header.Magic = getData().substr(0, 4); if (Header.Magic != StringRef("\0asm", 4)) { @@ -291,6 +294,7 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, const uint8_t *End) { + HasLinkingSection = true; while (Ptr < End) { uint8_t Type = readVarint7(Ptr); uint32_t Size = readVaruint32(Ptr); @@ -305,7 +309,7 @@ Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, auto iter = SymbolMap.find(Symbol); if (iter == SymbolMap.end()) { return make_error<GenericBinaryError>( - "Invalid symbol name in linking section", + "Invalid symbol name in linking section: " + Symbol, object_error::parse_failed); } uint32_t SymIndex = iter->second; @@ -318,6 +322,12 @@ Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, } break; } + case wasm::WASM_DATA_SIZE: + LinkingData.DataSize = readVaruint32(Ptr); + break; + case wasm::WASM_DATA_ALIGNMENT: + LinkingData.DataAlignment = readVaruint32(Ptr); + break; case wasm::WASM_STACK_POINTER: default: Ptr += Size; @@ -941,7 +951,9 @@ SubtargetFeatures WasmObjectFile::getFeatures() const { return SubtargetFeatures(); } -bool WasmObjectFile::isRelocatableObject() const { return false; } +bool WasmObjectFile::isRelocatableObject() const { + return HasLinkingSection; +} const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const { assert(Ref.d.a < Sections.size()); diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index ff9b9ca35eb5..1371eacdf8f2 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -563,7 +563,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize); Symbol->Value = DataOffsets[i]; - Symbol->SectionNumber = 1; + Symbol->SectionNumber = 2; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 0; diff --git a/lib/ObjectYAML/COFFYAML.cpp 
b/lib/ObjectYAML/COFFYAML.cpp index c8cbea1490f6..1103159fc98d 100644 --- a/lib/ObjectYAML/COFFYAML.cpp +++ b/lib/ObjectYAML/COFFYAML.cpp @@ -12,17 +12,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/COFFYAML.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/YAMLTraits.h" +#include <cstdint> +#include <cstring> #define ECase(X) IO.enumCase(Value, #X, COFF::X); + namespace llvm { namespace COFFYAML { + Section::Section() { memset(&Header, 0, sizeof(COFF::section)); } Symbol::Symbol() { memset(&Header, 0, sizeof(COFF::symbol)); } Object::Object() { memset(&Header, 0, sizeof(COFF::header)); } -} + +} // end namespace COFFYAML namespace yaml { + void ScalarEnumerationTraits<COFFYAML::COMDATType>::enumeration( IO &IO, COFFYAML::COMDATType &Value) { IO.enumCase(Value, "0", 0); @@ -172,20 +180,20 @@ void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration( void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration( IO &IO, COFF::WindowsSubsystem &Value) { - ECase(IMAGE_SUBSYSTEM_UNKNOWN); - ECase(IMAGE_SUBSYSTEM_NATIVE); - ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI); - ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI); - ECase(IMAGE_SUBSYSTEM_OS2_CUI); - ECase(IMAGE_SUBSYSTEM_POSIX_CUI); - ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS); - ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI); - ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION); - ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER); - ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER); - ECase(IMAGE_SUBSYSTEM_EFI_ROM); - ECase(IMAGE_SUBSYSTEM_XBOX); - ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION); + ECase(IMAGE_SUBSYSTEM_UNKNOWN); + ECase(IMAGE_SUBSYSTEM_NATIVE); + ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI); + ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI); + ECase(IMAGE_SUBSYSTEM_OS2_CUI); + ECase(IMAGE_SUBSYSTEM_POSIX_CUI); + ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS); + ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI); + ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION); + ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER); + ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER); + ECase(IMAGE_SUBSYSTEM_EFI_ROM); + ECase(IMAGE_SUBSYSTEM_XBOX); + ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION); } #undef ECase @@ -252,12 +260,15 @@ void ScalarBitSetTraits<COFF::DLLCharacteristics>::bitset( #undef BCase namespace { + struct NSectionSelectionType { NSectionSelectionType(IO &) : SelectionType(COFFYAML::COMDATType(0)) {} NSectionSelectionType(IO &, uint8_t C) : SelectionType(COFFYAML::COMDATType(C)) {} + uint8_t denormalize(IO &) { return SelectionType; } + COFFYAML::COMDATType SelectionType; }; @@ -266,7 +277,9 @@ struct NWeakExternalCharacteristics { : Characteristics(COFFYAML::WeakExternalCharacteristics(0)) {} NWeakExternalCharacteristics(IO &, uint32_t C) : Characteristics(COFFYAML::WeakExternalCharacteristics(C)) {} + uint32_t denormalize(IO &) { return Characteristics; } + COFFYAML::WeakExternalCharacteristics Characteristics; }; @@ -275,7 +288,9 @@ struct NSectionCharacteristics { : Characteristics(COFF::SectionCharacteristics(0)) {} NSectionCharacteristics(IO &, uint32_t C) : Characteristics(COFF::SectionCharacteristics(C)) {} + uint32_t denormalize(IO &) { return Characteristics; } + COFF::SectionCharacteristics Characteristics; }; @@ -284,13 +299,16 @@ struct NAuxTokenType { : AuxType(COFFYAML::AuxSymbolType(0)) {} NAuxTokenType(IO &, uint8_t C) : AuxType(COFFYAML::AuxSymbolType(C)) {} + uint32_t denormalize(IO &) { return AuxType; } + COFFYAML::AuxSymbolType AuxType; }; struct NStorageClass { NStorageClass(IO &) : StorageClass(COFF::SymbolStorageClass(0)) {} 
NStorageClass(IO &, uint8_t S) : StorageClass(COFF::SymbolStorageClass(S)) {} + uint8_t denormalize(IO &) { return StorageClass; } COFF::SymbolStorageClass StorageClass; @@ -299,7 +317,9 @@ struct NStorageClass { struct NMachine { NMachine(IO &) : Machine(COFF::MachineTypes(0)) {} NMachine(IO &, uint16_t M) : Machine(COFF::MachineTypes(M)) {} + uint16_t denormalize(IO &) { return Machine; } + COFF::MachineTypes Machine; }; @@ -307,6 +327,7 @@ struct NHeaderCharacteristics { NHeaderCharacteristics(IO &) : Characteristics(COFF::Characteristics(0)) {} NHeaderCharacteristics(IO &, uint16_t C) : Characteristics(COFF::Characteristics(C)) {} + uint16_t denormalize(IO &) { return Characteristics; } COFF::Characteristics Characteristics; @@ -316,13 +337,16 @@ template <typename RelocType> struct NType { NType(IO &) : Type(RelocType(0)) {} NType(IO &, uint16_t T) : Type(RelocType(T)) {} + uint16_t denormalize(IO &) { return Type; } + RelocType Type; }; struct NWindowsSubsystem { NWindowsSubsystem(IO &) : Subsystem(COFF::WindowsSubsystem(0)) {} NWindowsSubsystem(IO &, uint16_t C) : Subsystem(COFF::WindowsSubsystem(C)) {} + uint16_t denormalize(IO &) { return Subsystem; } COFF::WindowsSubsystem Subsystem; @@ -332,12 +356,13 @@ struct NDLLCharacteristics { NDLLCharacteristics(IO &) : Characteristics(COFF::DLLCharacteristics(0)) {} NDLLCharacteristics(IO &, uint16_t C) : Characteristics(COFF::DLLCharacteristics(C)) {} + uint16_t denormalize(IO &) { return Characteristics; } COFF::DLLCharacteristics Characteristics; }; -} +} // end anonymous namespace void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO, COFFYAML::Relocation &Rel) { @@ -509,5 +534,6 @@ void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) { IO.mapRequired("symbols", Obj.Symbols); } -} -} +} // end namespace yaml + +} // end namespace llvm diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp index d194420d5ef4..60b0ea28030a 100644 --- a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp +++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h" - +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" @@ -24,15 +26,29 @@ #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h" #include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h" #include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" -#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Allocator.h" +#include 
"llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> +#include <string> +#include <tuple> +#include <vector> + using namespace llvm; using namespace llvm::codeview; using namespace llvm::CodeViewYAML; @@ -48,9 +64,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeSite) LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(CrossModuleExport) LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLCrossModuleImport) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLFrameData) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false) LLVM_YAML_DECLARE_ENUM_TRAITS(DebugSubsectionKind) @@ -70,21 +84,25 @@ LLVM_YAML_DECLARE_MAPPING_TRAITS(InlineeSite) namespace llvm { namespace CodeViewYAML { namespace detail { + struct YAMLSubsectionBase { explicit YAMLSubsectionBase(DebugSubsectionKind Kind) : Kind(Kind) {} - DebugSubsectionKind Kind; - virtual ~YAMLSubsectionBase() {} + virtual ~YAMLSubsectionBase() = default; virtual void map(IO &IO) = 0; virtual std::shared_ptr<DebugSubsection> toCodeViewSubsection(BumpPtrAllocator &Allocator, const codeview::StringsAndChecksums &SC) const = 0; + + DebugSubsectionKind Kind; }; -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm namespace { + struct YAMLChecksumsSubsection : public YAMLSubsectionBase { YAMLChecksumsSubsection() : YAMLSubsectionBase(DebugSubsectionKind::FileChecksums) {} @@ -215,7 +233,8 @@ struct YAMLCoffSymbolRVASubsection : public YAMLSubsectionBase { std::vector<uint32_t> RVAs; }; -} + +} // end anonymous namespace void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) { io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns); @@ -743,8 +762,9 @@ llvm::CodeViewYAML::toCodeViewSubsectionList( } namespace { + struct SubsectionConversionVisitor : public DebugSubsectionVisitor { - SubsectionConversionVisitor() {} + SubsectionConversionVisitor() = default; Error visitUnknown(DebugUnknownSubsectionRef &Unknown) override; Error visitLines(DebugLinesSubsectionRef &Lines, @@ -769,6 +789,8 @@ struct SubsectionConversionVisitor : public DebugSubsectionVisitor { YAMLDebugSubsection Subsection; }; +} // end anonymous namespace + Error SubsectionConversionVisitor::visitUnknown( DebugUnknownSubsectionRef &Unknown) { return make_error<CodeViewError>(cv_error_code::operation_unsupported); @@ -865,7 +887,6 @@ Error SubsectionConversionVisitor::visitCOFFSymbolRVAs( Subsection.Subsection = *Result; return Error::success(); } -} Expected<YAMLDebugSubsection> YAMLDebugSubsection::fromCodeViewSubection(const StringsAndChecksumsRef &SC, diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 83f3d55b8e55..dbe4e2a6d6fd 100644 --- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -13,13 +13,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include 
"llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/YAMLTraits.h" +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <string> +#include <vector> using namespace llvm; using namespace llvm::codeview; @@ -27,7 +39,6 @@ using namespace llvm::CodeViewYAML; using namespace llvm::CodeViewYAML::detail; using namespace llvm::yaml; -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) // We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp @@ -49,15 +60,16 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(RegisterId) LLVM_YAML_DECLARE_ENUM_TRAITS(TrampolineType) LLVM_YAML_DECLARE_ENUM_TRAITS(ThunkOrdinal) -LLVM_YAML_STRONG_TYPEDEF(llvm::StringRef, TypeName) +LLVM_YAML_STRONG_TYPEDEF(StringRef, TypeName) LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeName, true) StringRef ScalarTraits<TypeName>::input(StringRef S, void *V, TypeName &T) { return ScalarTraits<StringRef>::input(S, V, T.value); } + void ScalarTraits<TypeName>::output(const TypeName &T, void *V, - llvm::raw_ostream &R) { + raw_ostream &R) { ScalarTraits<StringRef>::output(T.value, V, R); } @@ -174,9 +186,10 @@ namespace detail { struct SymbolRecordBase { codeview::SymbolKind Kind; + explicit SymbolRecordBase(codeview::SymbolKind K) : Kind(K) {} + virtual ~SymbolRecordBase() = default; - virtual ~SymbolRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator, @@ -195,6 +208,7 @@ template <typename T> struct SymbolRecordImpl : public SymbolRecordBase { CodeViewContainer Container) const override { return SymbolSerializer::writeOneSymbol(Symbol, Allocator, Container); } + Error fromCodeViewSymbol(codeview::CVSymbol CVS) override { return SymbolDeserializer::deserializeAs<T>(CVS, Symbol); } @@ -218,6 +232,7 @@ struct UnknownSymbolRecord : public SymbolRecordBase { ::memcpy(Buffer + sizeof(RecordPrefix), Data.data(), Data.size()); return CVSymbol(Kind, ArrayRef<uint8_t>(Buffer, TotalLen)); } + Error fromCodeViewSymbol(CVSymbol CVS) override { this->Kind = CVS.kind(); Data = CVS.RecordData.drop_front(sizeof(RecordPrefix)); @@ -497,9 +512,10 @@ template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) { IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("DisplayName", Symbol.Name); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( BumpPtrAllocator &Allocator, CodeViewContainer Container) const { @@ -508,11 +524,13 @@ CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( namespace llvm { namespace yaml { + template <> struct MappingTraits<SymbolRecordBase> { static void mapping(IO &io, SymbolRecordBase &Record) { Record.map(io); } }; -} -} + +} // end namespace yaml +} // end namespace llvm template <typename SymbolType> static inline Expected<CodeViewYAML::SymbolRecord> diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 2d1cb4b1b27b..0b2ea61c5fe0 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -13,14 +13,29 @@ 
//===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLTypes.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <vector> using namespace llvm; using namespace llvm::codeview; @@ -29,7 +44,6 @@ using namespace llvm::CodeViewYAML::detail; using namespace llvm::yaml; LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) @@ -63,9 +77,10 @@ namespace detail { struct LeafRecordBase { TypeLeafKind Kind; + explicit LeafRecordBase(TypeLeafKind K) : Kind(K) {} + virtual ~LeafRecordBase() = default; - virtual ~LeafRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual CVType toCodeViewRecord(TypeTableBuilder &TTB) const = 0; virtual Error fromCodeViewRecord(CVType Type) = 0; @@ -101,9 +116,10 @@ template <> struct LeafRecordImpl<FieldListRecord> : public LeafRecordBase { struct MemberRecordBase { TypeLeafKind Kind; + explicit MemberRecordBase(TypeLeafKind K) : Kind(K) {} + virtual ~MemberRecordBase() = default; - virtual ~MemberRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual void writeTo(FieldListRecordBuilder &FLRB) = 0; }; @@ -111,6 +127,7 @@ struct MemberRecordBase { template <typename T> struct MemberRecordImpl : public MemberRecordBase { explicit MemberRecordImpl(TypeLeafKind K) : MemberRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {} + void map(yaml::IO &io) override; void writeTo(FieldListRecordBuilder &FLRB) override { @@ -119,12 +136,13 @@ template <typename T> struct MemberRecordImpl : public MemberRecordBase { mutable T Record; }; -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm void ScalarTraits<TypeIndex>::output(const TypeIndex &S, void *, - llvm::raw_ostream &OS) { + raw_ostream &OS) { OS << S.getIndex(); } @@ -136,8 +154,7 @@ StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx, return Result; } -void ScalarTraits<APSInt>::output(const APSInt &S, void *, - llvm::raw_ostream &OS) { +void ScalarTraits<APSInt>::output(const APSInt &S, void *, raw_ostream &OS) { S.print(OS, S.isSigned()); } @@ -346,6 +363,7 @@ void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) { namespace llvm { namespace CodeViewYAML { namespace detail { + template <> void LeafRecordImpl<ModifierRecord>::map(IO &IO) { IO.mapRequired("ModifiedType", Record.ModifiedType); IO.mapRequired("Modifiers", Record.Modifiers); @@ -404,11 +422,13 @@ template <> void 
LeafRecordImpl<ArrayRecord>::map(IO &IO) { void LeafRecordImpl<FieldListRecord>::map(IO &IO) { IO.mapRequired("FieldList", Members); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm namespace { + class MemberRecordConversionVisitor : public TypeVisitorCallbacks { public: explicit MemberRecordConversionVisitor(std::vector<MemberRecord> &Records) @@ -433,7 +453,8 @@ private: std::vector<MemberRecord> &Records; }; -} + +} // end anonymous namespace Error LeafRecordImpl<FieldListRecord>::fromCodeViewRecord(CVType Type) { MemberRecordConversionVisitor V(Members); @@ -461,13 +482,13 @@ void MappingTraits<OneMethodRecord>::mapping(IO &io, OneMethodRecord &Record) { namespace llvm { namespace CodeViewYAML { namespace detail { + template <> void LeafRecordImpl<ClassRecord>::map(IO &IO) { IO.mapRequired("MemberCount", Record.MemberCount); IO.mapRequired("Options", Record.Options); IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("DerivationList", Record.DerivationList); IO.mapRequired("VTableShape", Record.VTableShape); IO.mapRequired("Size", Record.Size); @@ -479,7 +500,6 @@ template <> void LeafRecordImpl<UnionRecord>::map(IO &IO) { IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("Size", Record.Size); } @@ -489,7 +509,6 @@ template <> void LeafRecordImpl<EnumRecord>::map(IO &IO) { IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("UnderlyingType", Record.UnderlyingType); } @@ -603,9 +622,10 @@ template <> void MemberRecordImpl<VirtualBaseClassRecord>::map(IO &IO) { template <> void MemberRecordImpl<ListContinuationRecord>::map(IO &IO) { IO.mapRequired("ContinuationIndex", Record.ContinuationIndex); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm template <typename T> static inline Expected<LeafRecord> fromCodeViewRecordImpl(CVType Type) { @@ -628,7 +648,8 @@ Expected<LeafRecord> LeafRecord::fromCodeViewRecord(CVType Type) { #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) switch (Type.kind()) { #include "llvm/DebugInfo/CodeView/CodeViewTypes.def" - default: { llvm_unreachable("Unknown leaf kind!"); } + default: + llvm_unreachable("Unknown leaf kind!"); } return make_error<CodeViewError>(cv_error_code::corrupt_record); } @@ -644,6 +665,7 @@ CVType LeafRecord::toCodeViewRecord(TypeTableBuilder &TTB) const { namespace llvm { namespace yaml { + template <> struct MappingTraits<LeafRecordBase> { static void mapping(IO &io, LeafRecordBase &Record) { Record.map(io); } }; @@ -651,8 +673,9 @@ template <> struct MappingTraits<LeafRecordBase> { template <> struct MappingTraits<MemberRecordBase> { static void mapping(IO &io, MemberRecordBase &Record) { Record.map(io); } }; -} -} + +} // end namespace yaml +} // end namespace llvm template <typename ConcreteType> static void mapLeafRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind, diff --git a/lib/ObjectYAML/DWARFEmitter.cpp b/lib/ObjectYAML/DWARFEmitter.cpp index 91c928771a65..89fc652035ca 100644 --- a/lib/ObjectYAML/DWARFEmitter.cpp +++ b/lib/ObjectYAML/DWARFEmitter.cpp @@ -13,15 +13,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/DWARFEmitter.h" +#include "DWARFVisitor.h" 
+#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ObjectYAML/DWARFYAML.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" - -#include "DWARFVisitor.h" - #include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> using namespace llvm; @@ -127,7 +137,7 @@ class DumpVisitor : public DWARFYAML::ConstVisitor { raw_ostream &OS; protected: - virtual void onStartCompileUnit(const DWARFYAML::Unit &CU) { + void onStartCompileUnit(const DWARFYAML::Unit &CU) override { writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian); writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian); if(CU.Version >= 5) { @@ -141,41 +151,43 @@ protected: } - virtual void onStartDIE(const DWARFYAML::Unit &CU, - const DWARFYAML::Entry &DIE) { + void onStartDIE(const DWARFYAML::Unit &CU, + const DWARFYAML::Entry &DIE) override { encodeULEB128(DIE.AbbrCode, OS); } - virtual void onValue(const uint8_t U) { + void onValue(const uint8_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint16_t U) { + void onValue(const uint16_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint32_t U) { + + void onValue(const uint32_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint64_t U, const bool LEB = false) { + + void onValue(const uint64_t U, const bool LEB = false) override { if (LEB) encodeULEB128(U, OS); else writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const int64_t S, const bool LEB = false) { + void onValue(const int64_t S, const bool LEB = false) override { if (LEB) encodeSLEB128(S, OS); else writeInteger(S, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const StringRef String) { + void onValue(const StringRef String) override { OS.write(String.data(), String.size()); OS.write('\0'); } - virtual void onValue(const MemoryBufferRef MBR) { + void onValue(const MemoryBufferRef MBR) override { OS.write(MBR.getBufferStart(), MBR.getBufferSize()); } @@ -280,7 +292,7 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) { } } -typedef void (*EmitFuncType)(raw_ostream &, const DWARFYAML::Data &); +using EmitFuncType = void (*)(raw_ostream &, const DWARFYAML::Data &); static void EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc, diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp index edb9545f14b1..d6c09e1a35d7 100644 --- a/lib/ObjectYAML/DWARFYAML.cpp +++ b/lib/ObjectYAML/DWARFYAML.cpp @@ -171,6 +171,6 @@ void MappingTraits<DWARFYAML::InitialLength>::mapping( IO.mapRequired("TotalLength64", InitialLength.TotalLength64); } -} // namespace llvm::yaml +} // end namespace yaml -} // namespace llvm +} // end namespace llvm diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index dbd5498e003d..39741dab327a 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -12,12 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/ELFYAML.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/Support/Casting.h" +#include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/MipsABIFlags.h" +#include "llvm/Support/YAMLTraits.h" +#include <cassert> +#include <cstdint> namespace llvm { -ELFYAML::Section::~Section() {} +ELFYAML::Section::~Section() = default; namespace yaml { @@ -542,6 +548,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration( llvm_unreachable("Unsupported architecture"); } #undef ELF_RELOC + IO.enumFallback<Hex32>(Value); } void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_REG>::enumeration( @@ -643,6 +650,7 @@ void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO, } namespace { + struct NormalizedOther { NormalizedOther(IO &) : Visibility(ELFYAML::ELF_STV(0)), Other(ELFYAML::ELF_STO(0)) {} @@ -654,7 +662,8 @@ struct NormalizedOther { ELFYAML::ELF_STV Visibility; ELFYAML::ELF_STO Other; }; -} + +} // end anonymous namespace void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) { IO.mapOptional("Name", Symbol.Name, StringRef()); @@ -777,6 +786,7 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Section>>::validate( } namespace { + struct NormalizedMips64RelType { NormalizedMips64RelType(IO &) : Type(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)), @@ -797,7 +807,8 @@ struct NormalizedMips64RelType { ELFYAML::ELF_REL Type3; ELFYAML::ELF_RSS SpecSym; }; -} + +} // end anonymous namespace void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO, ELFYAML::Relocation &Rel) { @@ -838,4 +849,5 @@ LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_ASE) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_FLAGS1) } // end namespace yaml + } // end namespace llvm diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp index 461684827872..ab452a7bf6ef 100644 --- a/lib/ObjectYAML/MachOYAML.cpp +++ b/lib/ObjectYAML/MachOYAML.cpp @@ -12,16 +12,19 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/MachOYAML.h" +#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MachO.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Format.h" #include "llvm/Support/Host.h" - -#include <string.h> // For memcpy, memset and strnlen. 
+#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <cinttypes> +#include <cstdint> +#include <cstring> namespace llvm { -MachOYAML::LoadCommand::~LoadCommand() {} +MachOYAML::LoadCommand::~LoadCommand() = default; bool MachOYAML::LinkEditData::isEmpty() const { return 0 == @@ -33,7 +36,7 @@ bool MachOYAML::LinkEditData::isEmpty() const { namespace yaml { void ScalarTraits<char_16>::output(const char_16 &Val, void *, - llvm::raw_ostream &Out) { + raw_ostream &Out) { auto Len = strnlen(&Val[0], 16); Out << StringRef(&Val[0], Len); } @@ -51,8 +54,7 @@ StringRef ScalarTraits<char_16>::input(StringRef Scalar, void *, char_16 &Val) { bool ScalarTraits<char_16>::mustQuote(StringRef S) { return needsQuotes(S); } -void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *, - llvm::raw_ostream &Out) { +void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *, raw_ostream &Out) { for (int Idx = 0; Idx < 16; ++Idx) { Out << format("%02" PRIX32, Val[Idx]); if (Idx == 3 || Idx == 5 || Idx == 7 || Idx == 9) @@ -154,7 +156,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping( IO.mapOptional("BindOpcodes", LinkEditData.BindOpcodes); IO.mapOptional("WeakBindOpcodes", LinkEditData.WeakBindOpcodes); IO.mapOptional("LazyBindOpcodes", LinkEditData.LazyBindOpcodes); - if(LinkEditData.ExportTrie.Children.size() > 0 || !IO.outputting()) + if (!LinkEditData.ExportTrie.Children.empty() || !IO.outputting()) IO.mapOptional("ExportTrie", LinkEditData.ExportTrie); IO.mapOptional("NameList", LinkEditData.NameList); IO.mapOptional("StringTable", LinkEditData.StringTable); @@ -308,13 +310,11 @@ void MappingTraits<MachO::dylib_command>::mapping( void MappingTraits<MachO::dylinker_command>::mapping( IO &IO, MachO::dylinker_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); } void MappingTraits<MachO::dysymtab_command>::mapping( IO &IO, MachO::dysymtab_command &LoadCommand) { - IO.mapRequired("ilocalsym", LoadCommand.ilocalsym); IO.mapRequired("nlocalsym", LoadCommand.nlocalsym); IO.mapRequired("iextdefsym", LoadCommand.iextdefsym); @@ -337,7 +337,6 @@ void MappingTraits<MachO::dysymtab_command>::mapping( void MappingTraits<MachO::encryption_info_command>::mapping( IO &IO, MachO::encryption_info_command &LoadCommand) { - IO.mapRequired("cryptoff", LoadCommand.cryptoff); IO.mapRequired("cryptsize", LoadCommand.cryptsize); IO.mapRequired("cryptid", LoadCommand.cryptid); @@ -345,7 +344,6 @@ void MappingTraits<MachO::encryption_info_command>::mapping( void MappingTraits<MachO::encryption_info_command_64>::mapping( IO &IO, MachO::encryption_info_command_64 &LoadCommand) { - IO.mapRequired("cryptoff", LoadCommand.cryptoff); IO.mapRequired("cryptsize", LoadCommand.cryptsize); IO.mapRequired("cryptid", LoadCommand.cryptid); @@ -354,14 +352,12 @@ void MappingTraits<MachO::encryption_info_command_64>::mapping( void MappingTraits<MachO::entry_point_command>::mapping( IO &IO, MachO::entry_point_command &LoadCommand) { - IO.mapRequired("entryoff", LoadCommand.entryoff); IO.mapRequired("stacksize", LoadCommand.stacksize); } void MappingTraits<MachO::fvmfile_command>::mapping( IO &IO, MachO::fvmfile_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); IO.mapRequired("header_addr", LoadCommand.header_addr); } @@ -374,7 +370,6 @@ void MappingTraits<MachO::fvmlib>::mapping(IO &IO, MachO::fvmlib &FVMLib) { void MappingTraits<MachO::fvmlib_command>::mapping( IO &IO, MachO::fvmlib_command &LoadCommand) { - IO.mapRequired("fvmlib", LoadCommand.fvmlib); } @@ -383,20 
+378,17 @@ void MappingTraits<MachO::ident_command>::mapping( void MappingTraits<MachO::linkedit_data_command>::mapping( IO &IO, MachO::linkedit_data_command &LoadCommand) { - IO.mapRequired("dataoff", LoadCommand.dataoff); IO.mapRequired("datasize", LoadCommand.datasize); } void MappingTraits<MachO::linker_option_command>::mapping( IO &IO, MachO::linker_option_command &LoadCommand) { - IO.mapRequired("count", LoadCommand.count); } void MappingTraits<MachO::prebind_cksum_command>::mapping( IO &IO, MachO::prebind_cksum_command &LoadCommand) { - IO.mapRequired("cksum", LoadCommand.cksum); } @@ -405,7 +397,6 @@ void MappingTraits<MachO::load_command>::mapping( void MappingTraits<MachO::prebound_dylib_command>::mapping( IO &IO, MachO::prebound_dylib_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); IO.mapRequired("nmodules", LoadCommand.nmodules); IO.mapRequired("linked_modules", LoadCommand.linked_modules); @@ -413,7 +404,6 @@ void MappingTraits<MachO::prebound_dylib_command>::mapping( void MappingTraits<MachO::routines_command>::mapping( IO &IO, MachO::routines_command &LoadCommand) { - IO.mapRequired("init_address", LoadCommand.init_address); IO.mapRequired("init_module", LoadCommand.init_module); IO.mapRequired("reserved1", LoadCommand.reserved1); @@ -426,7 +416,6 @@ void MappingTraits<MachO::routines_command>::mapping( void MappingTraits<MachO::routines_command_64>::mapping( IO &IO, MachO::routines_command_64 &LoadCommand) { - IO.mapRequired("init_address", LoadCommand.init_address); IO.mapRequired("init_module", LoadCommand.init_module); IO.mapRequired("reserved1", LoadCommand.reserved1); @@ -439,7 +428,6 @@ void MappingTraits<MachO::routines_command_64>::mapping( void MappingTraits<MachO::rpath_command>::mapping( IO &IO, MachO::rpath_command &LoadCommand) { - IO.mapRequired("path", LoadCommand.path); } @@ -475,7 +463,6 @@ void MappingTraits<MachO::section_64>::mapping(IO &IO, void MappingTraits<MachO::segment_command>::mapping( IO &IO, MachO::segment_command &LoadCommand) { - IO.mapRequired("segname", LoadCommand.segname); IO.mapRequired("vmaddr", LoadCommand.vmaddr); IO.mapRequired("vmsize", LoadCommand.vmsize); @@ -489,7 +476,6 @@ void MappingTraits<MachO::segment_command>::mapping( void MappingTraits<MachO::segment_command_64>::mapping( IO &IO, MachO::segment_command_64 &LoadCommand) { - IO.mapRequired("segname", LoadCommand.segname); IO.mapRequired("vmaddr", LoadCommand.vmaddr); IO.mapRequired("vmsize", LoadCommand.vmsize); @@ -503,44 +489,37 @@ void MappingTraits<MachO::segment_command_64>::mapping( void MappingTraits<MachO::source_version_command>::mapping( IO &IO, MachO::source_version_command &LoadCommand) { - IO.mapRequired("version", LoadCommand.version); } void MappingTraits<MachO::sub_client_command>::mapping( IO &IO, MachO::sub_client_command &LoadCommand) { - IO.mapRequired("client", LoadCommand.client); } void MappingTraits<MachO::sub_framework_command>::mapping( IO &IO, MachO::sub_framework_command &LoadCommand) { - IO.mapRequired("umbrella", LoadCommand.umbrella); } void MappingTraits<MachO::sub_library_command>::mapping( IO &IO, MachO::sub_library_command &LoadCommand) { - IO.mapRequired("sub_library", LoadCommand.sub_library); } void MappingTraits<MachO::sub_umbrella_command>::mapping( IO &IO, MachO::sub_umbrella_command &LoadCommand) { - IO.mapRequired("sub_umbrella", LoadCommand.sub_umbrella); } void MappingTraits<MachO::symseg_command>::mapping( IO &IO, MachO::symseg_command &LoadCommand) { - IO.mapRequired("offset", LoadCommand.offset); 
IO.mapRequired("size", LoadCommand.size); } void MappingTraits<MachO::symtab_command>::mapping( IO &IO, MachO::symtab_command &LoadCommand) { - IO.mapRequired("symoff", LoadCommand.symoff); IO.mapRequired("nsyms", LoadCommand.nsyms); IO.mapRequired("stroff", LoadCommand.stroff); @@ -552,27 +531,23 @@ void MappingTraits<MachO::thread_command>::mapping( void MappingTraits<MachO::twolevel_hints_command>::mapping( IO &IO, MachO::twolevel_hints_command &LoadCommand) { - IO.mapRequired("offset", LoadCommand.offset); IO.mapRequired("nhints", LoadCommand.nhints); } void MappingTraits<MachO::uuid_command>::mapping( IO &IO, MachO::uuid_command &LoadCommand) { - IO.mapRequired("uuid", LoadCommand.uuid); } void MappingTraits<MachO::version_min_command>::mapping( IO &IO, MachO::version_min_command &LoadCommand) { - IO.mapRequired("version", LoadCommand.version); IO.mapRequired("sdk", LoadCommand.sdk); } void MappingTraits<MachO::note_command>::mapping( IO &IO, MachO::note_command &LoadCommand) { - IO.mapRequired("data_owner", LoadCommand.data_owner); IO.mapRequired("offset", LoadCommand.offset); IO.mapRequired("size", LoadCommand.size); @@ -580,13 +555,12 @@ void MappingTraits<MachO::note_command>::mapping( void MappingTraits<MachO::build_version_command>::mapping( IO &IO, MachO::build_version_command &LoadCommand) { - IO.mapRequired("platform", LoadCommand.platform); IO.mapRequired("minos", LoadCommand.minos); IO.mapRequired("sdk", LoadCommand.sdk); IO.mapRequired("ntools", LoadCommand.ntools); } -} // namespace llvm::yaml +} // end namespace yaml -} // namespace llvm +} // end namespace llvm diff --git a/lib/ObjectYAML/ObjectYAML.cpp b/lib/ObjectYAML/ObjectYAML.cpp index 4b7154ebb7c1..850c1a5a06c0 100644 --- a/lib/ObjectYAML/ObjectYAML.cpp +++ b/lib/ObjectYAML/ObjectYAML.cpp @@ -12,7 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/ObjectYAML.h" -#include "llvm/ObjectYAML/YAML.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/YAMLTraits.h" +#include <string> using namespace llvm; using namespace yaml; @@ -53,8 +56,8 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO, IO.setError("YAML Object File missing document type tag!"); else IO.setError( - llvm::Twine("YAML Object File unsupported document type tag '") + - llvm::Twine(Tag) + llvm::Twine("'!")); + Twine("YAML Object File unsupported document type tag '") + + Twine(Tag) + Twine("'!")); } } } diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 65703c6cf683..2040efdc9d11 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -12,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/WasmYAML.h" -#include "llvm/Object/Wasm.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/MipsABIFlags.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" namespace llvm { @@ -22,7 +23,7 @@ namespace WasmYAML { // Declared here rather than in the header to comply with: // http://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers -Section::~Section() {} +Section::~Section() = default; } // end namespace WasmYAML @@ -56,6 +57,8 @@ static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) { static void sectionMapping(IO &IO, WasmYAML::LinkingSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Name", 
Section.Name); + IO.mapRequired("DataSize", Section.DataSize); + IO.mapRequired("DataAlignment", Section.DataAlignment); IO.mapRequired("SymbolInfo", Section.SymbolInfos); } @@ -403,4 +406,5 @@ void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration( } } // end namespace yaml + } // end namespace llvm diff --git a/lib/ObjectYAML/YAML.cpp b/lib/ObjectYAML/YAML.cpp index 75cf1fbccc80..67b5764eadaa 100644 --- a/lib/ObjectYAML/YAML.cpp +++ b/lib/ObjectYAML/YAML.cpp @@ -16,11 +16,12 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" #include <cctype> +#include <cstdint> using namespace llvm; void yaml::ScalarTraits<yaml::BinaryRef>::output( - const yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) { + const yaml::BinaryRef &Val, void *, raw_ostream &Out) { Val.writeAsHex(Out); } @@ -34,7 +35,7 @@ StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *, if (!isxdigit(Scalar[I])) return "BinaryRef hex string must contain only hex digits."; Val = yaml::BinaryRef(Scalar); - return StringRef(); + return {}; } void yaml::BinaryRef::writeAsBinary(raw_ostream &OS) const { diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 78d5ea955e64..0380bd991d71 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -161,8 +161,8 @@ static cl::opt<bool> cl::desc("Run NewGVN instead of GVN")); static cl::opt<bool> EnableEarlyCSEMemSSA( - "enable-npm-earlycse-memssa", cl::init(false), cl::Hidden, - cl::desc("Enable the EarlyCSE w/ MemorySSA pass for the new PM (default = off)")); + "enable-npm-earlycse-memssa", cl::init(true), cl::Hidden, + cl::desc("Enable the EarlyCSE w/ MemorySSA pass for the new PM (default = on)")); static cl::opt<bool> EnableGVNHoist( "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, @@ -480,6 +480,14 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, MPM.addPass(PGOInstrumentationUse(ProfileUseFile)); } +static InlineParams +getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) { + auto O3 = PassBuilder::O3; + unsigned OptLevel = Level > O3 ? 2 : Level; + unsigned SizeLevel = Level > O3 ? Level - O3 : 0; + return getInlineParams(OptLevel, SizeLevel); +} + ModulePassManager PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, bool DebugLogging) { @@ -527,13 +535,17 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Add all the requested passes for PGO, if requested. if (PGOOpt) { - assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO || + assert(PGOOpt->RunProfileGen || !PGOOpt->SampleProfileFile.empty() || !PGOOpt->ProfileUseFile.empty()); - addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen, - PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile); + if (PGOOpt->SampleProfileFile.empty()) + addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen, + PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile); + else + MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile)); // Indirect call promotion that promotes intra-module targes only. - MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO)); + MPM.addPass(PGOIndirectCallPromotion( + false, PGOOpt && !PGOOpt->SampleProfileFile.empty())); } // Require the GlobalsAA analysis for the module so we can query it within @@ -558,8 +570,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Run the inliner first. 
The theory is that we are walking bottom-up and so // the callees have already been fully optimized, and we want to inline them // into the callers so that our optimizations can reflect that. - // FIXME; Customize the threshold based on optimization level. - MainCGPipeline.addPass(InlinerPass()); + MainCGPipeline.addPass(InlinerPass(getInlineParamsFromOptLevel(Level))); // Now deduce any function attributes based in the current code. MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); @@ -751,9 +762,6 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level, // Reduce the size of the IR as much as possible. MPM.addPass(GlobalOptPass()); - // Rename anon globals to be able to export them in the summary. - MPM.addPass(NameAnonGlobalPass()); - return MPM; } @@ -772,9 +780,9 @@ PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level, // During the ThinLTO backend phase we perform early indirect call promotion // here, before globalopt. Otherwise imported available_externally functions // look unreferenced and are removed. - MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, - PGOOpt && PGOOpt->SamplePGO && - !PGOOpt->ProfileUseFile.empty())); + MPM.addPass(PGOIndirectCallPromotion( + true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty() && + !PGOOpt->ProfileUseFile.empty())); // Add the core simplification pipeline. MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging)); @@ -814,8 +822,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. - MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, - PGOOpt && PGOOpt->SamplePGO)); + MPM.addPass(PGOIndirectCallPromotion( + true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -868,7 +876,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // valuable as the inliner doesn't currently care whether it is inlining an // invoke or a call. // Run the inliner now. - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(InlinerPass())); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( + InlinerPass(getInlineParamsFromOptLevel(Level)))); // Optimize globals again after we ran the inliner. MPM.addPass(GlobalOptPass()); diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp index 4534e086b39e..8c5f136ea270 100644 --- a/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -54,26 +54,26 @@ Counter CounterExpressionBuilder::get(const CounterExpression &E) { return Counter::getExpression(I); } -void CounterExpressionBuilder::extractTerms( - Counter C, int Sign, SmallVectorImpl<std::pair<unsigned, int>> &Terms) { +void CounterExpressionBuilder::extractTerms(Counter C, int Factor, + SmallVectorImpl<Term> &Terms) { switch (C.getKind()) { case Counter::Zero: break; case Counter::CounterValueReference: - Terms.push_back(std::make_pair(C.getCounterID(), Sign)); + Terms.emplace_back(C.getCounterID(), Factor); break; case Counter::Expression: const auto &E = Expressions[C.getExpressionID()]; - extractTerms(E.LHS, Sign, Terms); - extractTerms(E.RHS, E.Kind == CounterExpression::Subtract ? 
-Sign : Sign, - Terms); + extractTerms(E.LHS, Factor, Terms); + extractTerms( + E.RHS, E.Kind == CounterExpression::Subtract ? -Factor : Factor, Terms); break; } } Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { // Gather constant terms. - SmallVector<std::pair<unsigned, int>, 32> Terms; + SmallVector<Term, 32> Terms; extractTerms(ExpressionTree, +1, Terms); // If there are no terms, this is just a zero. The algorithm below assumes at @@ -82,17 +82,15 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { return Counter::getZero(); // Group the terms by counter ID. - std::sort(Terms.begin(), Terms.end(), - [](const std::pair<unsigned, int> &LHS, - const std::pair<unsigned, int> &RHS) { - return LHS.first < RHS.first; + std::sort(Terms.begin(), Terms.end(), [](const Term &LHS, const Term &RHS) { + return LHS.CounterID < RHS.CounterID; }); // Combine terms by counter ID to eliminate counters that sum to zero. auto Prev = Terms.begin(); for (auto I = Prev + 1, E = Terms.end(); I != E; ++I) { - if (I->first == Prev->first) { - Prev->second += I->second; + if (I->CounterID == Prev->CounterID) { + Prev->Factor += I->Factor; continue; } ++Prev; @@ -103,24 +101,24 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { Counter C; // Create additions. We do this before subtractions to avoid constructs like // ((0 - X) + Y), as opposed to (Y - X). - for (auto Term : Terms) { - if (Term.second <= 0) + for (auto T : Terms) { + if (T.Factor <= 0) continue; - for (int I = 0; I < Term.second; ++I) + for (int I = 0; I < T.Factor; ++I) if (C.isZero()) - C = Counter::getCounter(Term.first); + C = Counter::getCounter(T.CounterID); else C = get(CounterExpression(CounterExpression::Add, C, - Counter::getCounter(Term.first))); + Counter::getCounter(T.CounterID))); } // Create subtractions. 
- for (auto Term : Terms) { - if (Term.second >= 0) + for (auto T : Terms) { + if (T.Factor >= 0) continue; - for (int I = 0; I < -Term.second; ++I) + for (int I = 0; I < -T.Factor; ++I) C = get(CounterExpression(CounterExpression::Subtract, C, - Counter::getCounter(Term.first))); + Counter::getCounter(T.CounterID))); } return C; } @@ -247,18 +245,6 @@ Error CoverageMapping::loadFunctionRecord( return Error::success(); } -Expected<std::unique_ptr<CoverageMapping>> -CoverageMapping::load(CoverageMappingReader &CoverageReader, - IndexedInstrProfReader &ProfileReader) { - auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping()); - - for (const auto &Record : CoverageReader) - if (Error E = Coverage->loadFunctionRecord(Record, ProfileReader)) - return std::move(E); - - return std::move(Coverage); -} - Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load( ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders, IndexedInstrProfReader &ProfileReader) { diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 005061c4f068..a1d18724fcd5 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -504,9 +504,11 @@ void InstrProfRecord::mergeValueProfData(uint32_t ValueKind, SIPE.addError(instrprof_error::value_site_count_mismatch); return; } + if (!ThisNumValueSites) + return; std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = - getValueSitesForKind(ValueKind); - std::vector<InstrProfValueSiteRecord> &OtherSiteRecords = + getOrCreateValueSitesForKind(ValueKind); + MutableArrayRef<InstrProfValueSiteRecord> OtherSiteRecords = Src.getValueSitesForKind(ValueKind); for (uint32_t I = 0; I < ThisNumValueSites; I++) ThisSiteRecords[I].merge(SIPE, OtherSiteRecords[I], Weight); @@ -533,11 +535,8 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight) { } void InstrProfRecord::scaleValueProfData(uint32_t ValueKind, uint64_t Weight) { - uint32_t ThisNumValueSites = getNumValueSites(ValueKind); - std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = - getValueSitesForKind(ValueKind); - for (uint32_t I = 0; I < ThisNumValueSites; I++) - ThisSiteRecords[I].scale(SIPE, Weight); + for (auto &R : getValueSitesForKind(ValueKind)) + R.scale(SIPE, Weight); } void InstrProfRecord::scale(uint64_t Weight) { @@ -583,7 +582,7 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site, VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap); } std::vector<InstrProfValueSiteRecord> &ValueSites = - getValueSitesForKind(ValueKind); + getOrCreateValueSitesForKind(ValueKind); if (N == 0) ValueSites.emplace_back(); else @@ -642,8 +641,9 @@ static ValueProfRecordClosure InstrProfRecordClosure = { // Wrapper implementation using the closure mechanism. uint32_t ValueProfData::getSize(const InstrProfRecord &Record) { - InstrProfRecordClosure.Record = &Record; - return getValueProfDataSize(&InstrProfRecordClosure); + auto Closure = InstrProfRecordClosure; + Closure.Record = &Record; + return getValueProfDataSize(&Closure); } // Wrapper implementation using the closure mechanism. 
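The CoverageMapping hunk above replaces the anonymous (counter, sign) pairs in CounterExpressionBuilder::simplify with named Term entries carrying a CounterID and a signed Factor. The underlying idea -- flatten the expression tree into signed terms, merge terms that refer to the same counter, then materialize additions before subtractions -- can be shown with a small standalone sketch. The Term struct and combineTerms helper below are illustrative names only, not LLVM's API, and the sketch deliberately skips the Counter/CounterExpression machinery.

// Standalone sketch of the term-combining step: merge (CounterID, Factor)
// terms so counters that appear with opposite signs cancel to a zero factor.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Term {
  unsigned CounterID;
  int Factor; // signed multiplicity: +1 per addition, -1 per subtraction
};

// Sort by counter ID, then fold adjacent terms for the same counter together.
static std::vector<Term> combineTerms(std::vector<Term> Terms) {
  std::sort(Terms.begin(), Terms.end(), [](const Term &L, const Term &R) {
    return L.CounterID < R.CounterID;
  });
  std::vector<Term> Out;
  for (const Term &T : Terms) {
    if (!Out.empty() && Out.back().CounterID == T.CounterID)
      Out.back().Factor += T.Factor;
    else
      Out.push_back(T);
  }
  return Out;
}

int main() {
  // (c0 + c1) - c1 + c0  ->  2*c0; c1 cancels entirely.
  std::vector<Term> Combined =
      combineTerms({{0, +1}, {1, +1}, {1, -1}, {0, +1}});
  for (const Term &T : Combined)
    if (T.Factor != 0)
      std::printf("counter %u x %d\n", T.CounterID, T.Factor);
  return 0;
}

After this merge, positive factors become chained additions and negative factors become trailing subtractions, which is why the real code builds all Add expressions before any Subtract ones.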
diff --git a/lib/Support/AMDGPUCodeObjectMetadata.cpp b/lib/Support/AMDGPUCodeObjectMetadata.cpp index a00e371415a3..863093ab7def 100644 --- a/lib/Support/AMDGPUCodeObjectMetadata.cpp +++ b/lib/Support/AMDGPUCodeObjectMetadata.cpp @@ -20,8 +20,6 @@ using namespace llvm::AMDGPU; using namespace llvm::AMDGPU::CodeObject; -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata) LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 234f7439a546..232efe648b03 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -327,6 +327,7 @@ enum ProcessorSubtypes { INTEL_COREI7_SKYLAKE_AVX512, INTEL_ATOM_BONNELL, INTEL_ATOM_SILVERMONT, + INTEL_ATOM_GOLDMONT, INTEL_KNIGHTS_LANDING, AMDPENTIUM_K6, AMDPENTIUM_K62, @@ -707,7 +708,12 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model, *Type = INTEL_ATOM; *Subtype = INTEL_ATOM_SILVERMONT; break; // "silvermont" - + // Goldmont: + case 0x5c: + case 0x5f: + *Type = INTEL_ATOM; + *Subtype = INTEL_ATOM_GOLDMONT; + break; // "goldmont" case 0x57: *Type = INTEL_XEONPHI; // knl *Subtype = INTEL_KNIGHTS_LANDING; @@ -1070,6 +1076,8 @@ StringRef sys::getHostCPUName() { switch (Subtype) { case INTEL_ATOM_BONNELL: return "bonnell"; + case INTEL_ATOM_GOLDMONT: + return "goldmont"; case INTEL_ATOM_SILVERMONT: return "silvermont"; default: diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 227e792d83dc..85e782b2c048 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -240,11 +240,9 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) { // Read into Buffer until we hit EOF. do { Buffer.reserve(Buffer.size() + ChunkSize); - ReadBytes = read(FD, Buffer.end(), ChunkSize); - if (ReadBytes == -1) { - if (errno == EINTR) continue; + ReadBytes = sys::RetryAfterSignal(-1, read, FD, Buffer.end(), ChunkSize); + if (ReadBytes == -1) return std::error_code(errno, std::generic_category()); - } Buffer.set_size(Buffer.size() + ReadBytes); } while (ReadBytes != 0); @@ -391,13 +389,12 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, while (BytesLeft) { #ifdef HAVE_PREAD - ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset); + ssize_t NumRead = sys::RetryAfterSignal(-1, ::pread, FD, BufPtr, BytesLeft, + MapSize - BytesLeft + Offset); #else - ssize_t NumRead = ::read(FD, BufPtr, BytesLeft); + ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, BufPtr, BytesLeft); #endif if (NumRead == -1) { - if (errno == EINTR) - continue; // Error while reading. return std::error_code(errno, std::generic_category()); } diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index b16351906a4c..13bb6f23bc83 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -784,6 +784,42 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) { return 0; } +StringRef llvm::ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { + StringRef ArchName = + CPU.empty() ? TT.getArchName() : ARM::getArchName(ARM::parseCPUArch(CPU)); + + if (TT.isOSBinFormatMachO()) { + if (TT.getEnvironment() == Triple::EABI || + TT.getOS() == Triple::UnknownOS || + llvm::ARM::parseArchProfile(ArchName) == ARM::PK_M) + return "aapcs"; + if (TT.isWatchABI()) + return "aapcs16"; + return "apcs-gnu"; + } else if (TT.isOSWindows()) + // FIXME: this is invalid for WindowsCE. 
+ return "aapcs"; + + // Select the default based on the platform. + switch (TT.getEnvironment()) { + case Triple::Android: + case Triple::GNUEABI: + case Triple::GNUEABIHF: + case Triple::MuslEABI: + case Triple::MuslEABIHF: + return "aapcs-linux"; + case Triple::EABIHF: + case Triple::EABI: + return "aapcs"; + default: + if (TT.isOSNetBSD()) + return "apcs-gnu"; + if (TT.isOSOpenBSD()) + return "aapcs-linux"; + return "aapcs"; + } +} + StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) { return ARM::getCanonicalArchName(Arch); } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index b6774692595b..45097eb918b7 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -737,10 +737,8 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD, #ifdef O_CLOEXEC OpenFlags |= O_CLOEXEC; #endif - while ((ResultFD = open(P.begin(), OpenFlags)) < 0) { - if (errno != EINTR) - return std::error_code(errno, std::generic_category()); - } + if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags)) < 0) + return std::error_code(errno, std::generic_category()); #ifndef O_CLOEXEC int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC); (void)r; @@ -800,10 +798,8 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD, SmallString<128> Storage; StringRef P = Name.toNullTerminatedStringRef(Storage); - while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) { - if (errno != EINTR) - return std::error_code(errno, std::generic_category()); - } + if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags, Mode)) < 0) + return std::error_code(errno, std::generic_category()); #ifndef O_CLOEXEC int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC); (void)r; diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 1d0143c6716e..2d4662094682 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -207,13 +207,10 @@ std::error_code Process::FixupStandardFileDescriptors() { for (int StandardFD : StandardFDs) { struct stat st; errno = 0; - while (fstat(StandardFD, &st) < 0) { + if (RetryAfterSignal(-1, fstat, StandardFD, &st) < 0) { assert(errno && "expected errno to be set if fstat failed!"); // fstat should return EBADF if the file descriptor is closed. - if (errno == EBADF) - break; - // retry fstat if we got EINTR, otherwise bubble up the failure. - if (errno != EINTR) + if (errno != EBADF) return std::error_code(errno, std::generic_category()); } // if fstat succeeds, move on to the next FD. @@ -222,11 +219,8 @@ std::error_code Process::FixupStandardFileDescriptors() { assert(errno == EBADF && "expected errno to have EBADF at this point!"); if (NullFD < 0) { - while ((NullFD = open("/dev/null", O_RDWR)) < 0) { - if (errno == EINTR) - continue; + if ((NullFD = RetryAfterSignal(-1, open, "/dev/null", O_RDWR)) < 0) return std::error_code(errno, std::generic_category()); - } } if (NullFD == StandardFD) diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index f27bc97ec3f3..0a948812ff33 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -22,7 +22,7 @@ /// cbz w8, .LBB1_2 -> b.eq .LBB1_2 /// /// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
-/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2 +/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2 /// //===----------------------------------------------------------------------===// @@ -129,11 +129,11 @@ MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { break; case AArch64::TBZW: case AArch64::TBZX: - CC = AArch64CC::GE; + CC = AArch64CC::PL; break; case AArch64::TBNZW: case AArch64::TBNZX: - CC = AArch64CC::LT; + CC = AArch64CC::MI; break; } return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) @@ -271,6 +271,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, } break; } + (void)NewCmp; (void)NewBr; assert(NewCmp && NewBr && "Expected new instructions."); DEBUG(dbgs() << " with instruction:\n "); diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 00a0111f2bd2..9eda56c825a9 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -139,6 +140,7 @@ class SSACCmpConv { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + const MachineBranchProbabilityInfo *MBPI; public: /// The first block containing a conditional branch, dominating everything @@ -186,8 +188,10 @@ private: public: /// runOnMachineFunction - Initialize per-function data structures. - void runOnMachineFunction(MachineFunction &MF) { + void runOnMachineFunction(MachineFunction &MF, + const MachineBranchProbabilityInfo *MBPI) { this->MF = &MF; + this->MBPI = MBPI; TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. updateTailPHIs(); - Head->removeSuccessor(CmpBB, true); - CmpBB->removeSuccessor(Tail, true); + + // Save successor probabilties before removing CmpBB and Tail from their + // parents. + BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB); + BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail); + + Head->removeSuccessor(CmpBB); + CmpBB->removeSuccessor(Tail); + + // If Head and CmpBB had successor probabilties, udpate the probabilities to + // reflect the ccmp-conversion. + if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) { + + // Head is allowed two successors. We've removed CmpBB, so the remaining + // successor is Tail. We need to increase the successor probability for + // Tail to account for the CmpBB path we removed. + // + // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB). + assert(*Head->succ_begin() == Tail && "Head successor is not Tail"); + BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail); + Head->setSuccProbability(Head->succ_begin(), + Head2Tail + Head2CmpBB * CmpBB2Tail); + + // We will transfer successors of CmpBB to Head in a moment without + // normalizing the successor probabilities. Set the successor probabilites + // before doing so. + // + // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB). 
+ for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) { + BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I); + CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I); + } + } + Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->removeBranch(*Head); @@ -717,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const { namespace { class AArch64ConditionalCompares : public MachineFunctionPass { + const MachineBranchProbabilityInfo *MBPI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MCSchedModel SchedModel; @@ -753,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -763,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -892,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; - CmpConv.runOnMachineFunction(MF); + CmpConv.runOnMachineFunction(MF, MBPI); // Visit blocks in dominator tree pre-order. The pre-order enables multiple // cmp-conversions from the same head block. diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 2965106fd270..aaf32a499bc3 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7561,8 +7561,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. 
if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); - + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SVI->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); SubVecs[SVI].push_back(SubVec); } } diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index ad24612239fa..6cb723d187af 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -735,7 +735,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>; def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add -let AddedComplexity = 7 in { +let AddedComplexity = 5 in { defm MADD : MulAccum<0, "madd", add>; defm MSUB : MulAccum<1, "msub", sub>; @@ -752,7 +752,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 +} // AddedComplexity = 5 let AddedComplexity = 5 in { def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 9bfd570e9a82..07ce0e863c5e 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -947,7 +947,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); if (DstRB.getID() != SrcRB.getID()) { - DEBUG(dbgs() << "G_TRUNC input/output on different banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); return false; } @@ -964,16 +964,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); return false; } if (DstRC == SrcRC) { // Nothing to be done + } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && + SrcTy == LLT::scalar(64)) { + llvm_unreachable("TableGen can import this case"); + return false; } else if (DstRC == &AArch64::GPR32RegClass && SrcRC == &AArch64::GPR64RegClass) { I.getOperand(1).setSubReg(AArch64::sub_32); } else { + DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); return false; } diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 01196817f311..4b568f3fba2b 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -39,6 +39,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); + for (auto Ty : {p0, s1, s8, s16, s32, s64}) + setAction({G_IMPLICIT_DEF, Ty}, Legal); + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { // These operations naturally get the right answer when used on // GPR32, even if the actual type is narrower. @@ -99,6 +102,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap). 
} + for (auto Ty : {s1, s8, s16, s32, s64, p0}) + setAction({G_EXTRACT, Ty}, Legal); + + for (auto Ty : {s32, s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + for (unsigned MemOp : {G_LOAD, G_STORE}) { for (auto Ty : {s8, s16, s32, s64, p0, v2s32}) setAction({MemOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 45083df7ab45..f82b9dbc2c9f 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, return MCOperand::createExpr(Expr); } +MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const { + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + return MCOperand::createExpr(Expr); +} + MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { if (Printer.TM.getTargetTriple().isOSDarwin()) return lowerSymbolOperandDarwin(MO, Sym); + if (Printer.TM.getTargetTriple().isOSBinFormatCOFF()) + return lowerSymbolOperandCOFF(MO, Sym); - assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && - "Expect Darwin or ELF target"); + assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target"); return lowerSymbolOperandELF(MO, Sym); } diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h index 1e29b80c2d62..aa30fe1fa707 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.h +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -42,6 +42,8 @@ public: MCSymbol *Sym) const; MCOperand lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const; + MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index baf15ac540cf..fab92e139dd0 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) return CSR_AArch64_TLS_Darwin_RegMask; - assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS"); + assert(TT.isOSBinFormatELF() && "Invalid target"); return CSR_AArch64_TLS_ELF_RegMask; } diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 3654eeca530a..10df50bcf156 100644 --- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1,4 +1,4 @@ -//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 Scheduling ---*- tablegen -*-=// +//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -79,75 +79,207 @@ def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>; // 60 entry unified scheduler. def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2, - THX2T99P3, THX2T99P4, THX2T99P5]> { - let BufferSize=60; + THX2T99P3, THX2T99P4, THX2T99P5]> { + let BufferSize = 60; } // Define commonly used write types for InstRW specializations. 
// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>. // 3 cycles on I1. -def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; } +def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 1 cycles on I2. +def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} // 4 cycles on I1. -def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; } +def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 23 cycles on I1. +def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. +def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} // 1 cycle on I0, I1, or I2. -def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; } +def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0, I1, or I2. +def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on I0, I1, or I2. +def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// 5 cycles on I0, I1, or I2. +def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} // 5 cycles on F1. -def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; } +def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 5; + let NumMicroOps = 2; +} // 7 cycles on F1. -def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; } +def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 7; + let NumMicroOps = 2; +} // 4 cycles on F0 or F1. -def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; } +def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} // 5 cycles on F0 or F1. -def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; } +def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} // 6 cycles on F0 or F1. -def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; } +def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let NumMicroOps = 3; +} // 7 cycles on F0 or F1. -def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; } +def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} // 8 cycles on F0 or F1. -def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; } +def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0 or F1. +def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 10; + let NumMicroOps = 3; +} // 16 cycles on F0 or F1. def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 16; + let NumMicroOps = 3; let ResourceCycles = [8]; } // 23 cycles on F0 or F1. def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 23; + let NumMicroOps = 3; let ResourceCycles = [11]; } // 1 cycles on LS0 or LS1. 
-def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; } +def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 0; +} + +// 1 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 2; +} + +// 1 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 3; +} + +// 2 cycles on LS0 or LS1. +def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 1; + let NumMicroOps = 2; +} // 4 cycles on LS0 or LS1. -def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; } +def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} // 5 cycles on LS0 or LS1. -def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; } +def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 5; + let NumMicroOps = 3; +} // 6 cycles on LS0 or LS1. -def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; } +def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} // 5 cycles on LS0 or LS1 and I0, I1, or I2. def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { let Latency = 5; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2. -def THX2T99Write_6Cyc_LS01_I012_I012 : +def THX2T99Write_5Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { let Latency = 6; let NumMicroOps = 3; @@ -162,25 +294,25 @@ def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { // 5 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 5; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 6 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 6; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 7 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 7; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 8 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 8; - let NumMicroOps = 2; + let NumMicroOps = 3; } // Define commonly used read types. 
@@ -195,10 +327,8 @@ def : ReadAdvance<ReadID, 0>; def : ReadAdvance<ReadExtrHi, 0>; def : ReadAdvance<ReadAdrBase, 0>; def : ReadAdvance<ReadVLD, 0>; - } - //===----------------------------------------------------------------------===// // 3. Instruction Tables. @@ -211,88 +341,217 @@ let SchedModel = ThunderX2T99Model in { // Branch, immed // Branch and link, immed // Compare and branch -def : WriteRes<WriteBr, [THX2T99I2]> { let Latency = 1; } +def : WriteRes<WriteBr, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} def : WriteRes<WriteSys, []> { let Latency = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { + let Unsupported = 1; + let NumMicroOps = 2; +} -// Branch, register -// Branch and link, register != LR -// Branch and link, register = LR -def : WriteRes<WriteBrReg, [THX2T99I2]> { let Latency = 1; } +//--- +// Branch +//--- +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>; +def : InstRW<[THX2T99Write_1Cyc_I2], + (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; //--- // 3.2 Arithmetic and Logical Instructions // 3.3 Move and Shift Instructions //--- + // ALU, basic // Conditional compare // Conditional select // Address generation -def : WriteRes<WriteI, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteI, [THX2T99I012]> { + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + def : InstRW<[WriteI], (instrs COPY)>; // ALU, extend and/or shift def : WriteRes<WriteISReg, [THX2T99I012]> { let Latency = 2; - let ResourceCycles = [2]; + let ResourceCycles = [2, 3]; + let NumMicroOps = 2; } +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + def : WriteRes<WriteIEReg, [THX2T99I012]> { - let Latency = 2; - let ResourceCycles = [2]; + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; } +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", 
"BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + // Move immed -def : WriteRes<WriteImm, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteImm, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; // Variable shift -def : WriteRes<WriteIS, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteIS, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} //--- // 3.4 Divide and Multiply Instructions //--- // Divide, W-form -// Latency range of 13-23. Take the average. +// Latency range of 13-23/13-39. def : WriteRes<WriteID32, [THX2T99I1]> { - let Latency = 18; - let ResourceCycles = [18]; + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; } // Divide, X-form -// Latency range of 13-39. Take the average. def : WriteRes<WriteID64, [THX2T99I1]> { - let Latency = 26; - let ResourceCycles = [26]; + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; } // Multiply accumulate, W-form -def : WriteRes<WriteIM32, [THX2T99I012]> { let Latency = 5; } +def : WriteRes<WriteIM32, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} // Multiply accumulate, X-form -def : WriteRes<WriteIM64, [THX2T99I012]> { let Latency = 5; } +def : WriteRes<WriteIM64, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX2T99Write_5Cyc_I012], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; // Bitfield extract, two reg -def : WriteRes<WriteExtr, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteExtr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; -// Bitfield move, basic // Bitfield move, insert -// NOTE: Handled by WriteIS. 
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>; +def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>; // Count leading def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", - "^CLZ(W|X)r$")>; + "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; // Reverse bits/bytes // NOTE: Handled by WriteI. //--- -// 3.6 Load Instructions +// 3.6 Load Instructions // 3.10 FP Load Instructions //--- @@ -300,13 +559,29 @@ def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", // Load register, unscaled immed // Load register, immed unprivileged // Load register, unsigned immed -def : WriteRes<WriteLD, [THX2T99LS01]> { let Latency = 4; } +def : WriteRes<WriteLD, [THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} // Load register, immed post-index // NOTE: Handled by WriteLD, WriteI. // Load register, immed pre-index // NOTE: Handled by WriteLD, WriteAdr. -def : WriteRes<WriteAdr, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteAdr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 5; + let NumMicroOps = 5; +} // Load register offset, basic // Load register, register offset, scale by 4/8 @@ -324,23 +599,229 @@ def THX2T99ReadAdrBase : SchedReadVariant<[ SchedVar<NoSchedPred, [ReadDefault]>]>; def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>; -// Load pair, immed offset, normal -// Load pair, immed offset, signed words, base != SP -// Load pair, immed offset signed words, base = SP -// LDP only breaks into *one* LS micro-op. Thus -// the resources are handling by WriteLD. -def : WriteRes<WriteLDHi, []> { - let Latency = 5; -} - // Load pair, immed pre-index, normal // Load pair, immed pre-index, signed words // Load pair, immed post-index, normal // Load pair, immed post-index, signed words // NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. 
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], 
(instrs LDRBBpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>; +def : 
InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : 
InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>; + +//--- +// Prefetch +//--- +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + //-- -// 3.7 Store Instructions +// 3.7 Store Instructions // 3.11 FP Store Instructions //-- @@ -382,6 +863,195 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { // Store pair, immed pre-index, X-form // NOTE: Handled by WriteAdr, WriteSTP. +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPQpre, STPQpost)>; +def : 
InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : 
InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + //--- // 3.8 FP Data Processing Instructions //--- @@ -389,28 +1059,95 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { // FP absolute value // FP min/max // FP negate -def : WriteRes<WriteF, [THX2T99F01]> { let Latency = 5; } +def : WriteRes<WriteF, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} // FP arithmetic def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; // FP compare -def : WriteRes<WriteFCmp, [THX2T99F01]> { let Latency = 5; } +def : WriteRes<WriteFCmp, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} -// FP divide, S-form -// FP 
square root, S-form -def : WriteRes<WriteFDiv, [THX2T99F01]> { +// FP Mul, Div, Sqrt +def : WriteRes<WriteFDiv, [THX2T99F01]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> { let Latency = 16; let ResourceCycles = [8]; + let NumMicroOps = 4; } +def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>; + // FP divide, D-form // FP square root, D-form -def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; +def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>; // FP multiply // FP multiply accumulate -def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; } +def : WriteRes<WriteFMul, [THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX2T99XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; // FP round to integral def : InstRW<[THX2T99Write_7Cyc_F01], @@ -426,15 +1163,25 @@ def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>; // FP convert, from vec to vec reg // FP convert, from gen to vec reg // FP convert, from vec to gen reg -def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; } +def : WriteRes<WriteFCvt, [THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} // FP move, immed // FP move, register -def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; } +def : WriteRes<WriteFImm, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} // FP transfer, from gen to vec reg // FP transfer, from vec to gen reg -def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} + def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; //--- @@ -470,19 +1217,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; // ASIMD shift by register, basic, Q-form // ASIMD shift by register, complex, D-form // ASIMD shift by register, complex, Q-form -def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; } +def : WriteRes<WriteV, [THX2T99F01]> { + let Latency = 7; + let 
NumMicroOps = 4; + let ResourceCycles = [4, 23]; +} // ASIMD arith, reduce, 4H/4S // ASIMD arith, reduce, 8B/8H // ASIMD arith, reduce, 16B -def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; // ASIMD logical (MOV, MVN, ORN, ORR) -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv", + "^ORRv", "^ORNv", "^NOTv")>; +// ASIMD arith, reduce +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; // ASIMD polynomial (8x8) multiply long -def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex 
"^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; //--- // 3.13 ASIMD Floating-point Instructions @@ -493,7 +1356,8 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>; // ASIMD FP arith, normal, D-form // ASIMD FP arith, normal, Q-form -def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; // ASIMD FP arith,pairwise, D-form // ASIMD FP arith, pairwise, Q-form @@ -503,8 +1367,15 @@ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>; // ASIMD FP compare, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", - "^FCMGTv", "^FCMLEv", - "^FCMLTv")>; + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // ASIMD FP convert, long // ASIMD FP convert, narrow @@ -512,14 +1383,26 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", // ASIMD FP convert, other, Q-form // NOTE: Handled by WriteV. 
+// ASIMD FP convert, long and narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + // ASIMD FP divide, D-form, F32 def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>; // ASIMD FP divide, Q-form, F32 def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>; // ASIMD FP divide, Q-form, F64 def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>; // ASIMD FP max/min, normal, D-form // ASIMD FP max/min, normal, Q-form @@ -540,20 +1423,24 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", // ASIMD FP multiply, Q-form, FZ // ASIMD FP multiply, Q-form, no FZ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply accumulate, Dform, FZ // ASIMD FP multiply accumulate, Dform, no FZ // ASIMD FP multiply accumulate, Qform, FZ // ASIMD FP multiply accumulate, Qform, no FZ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP negate def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>; -// ASIMD FP round, D-form -// ASIMD FP round, Q-form -// NOTE: Handled by WriteV. - //-- // 3.14 ASIMD Miscellaneous Instructions //-- @@ -563,37 +1450,66 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^BIFv", "^BITv", "^BSLv")>; // ASIMD count, D-form // ASIMD count, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^CLSv", "^CLZv", "^CNTv")>; // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>; // ASIMD extract def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>; // ASIMD extract narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>; + // ASIMD extract narrow, saturating -// NOTE: Handled by WriteV. 
+def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; // ASIMD insert, element to element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; +// ASIMD transfer, element to gen reg +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; + // ASIMD move, integer immed def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; // ASIMD move, FP immed def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>; +// ASIMD table lookup, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>; + +// ASIMD table lookup, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>; + +// ASIMD transpose +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + // ASIMD reciprocal estimate, D-form // ASIMD reciprocal estimate, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", - "^FRSQRTEv", "^URSQRTEv")>; + "^FRSQRTEv", "^URSQRTEv")>; // ASIMD reciprocal step, D-form, FZ // ASIMD reciprocal step, D-form, no FZ @@ -602,7 +1518,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; // ASIMD reverse -def : InstRW<[THX2T99Write_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^REV16v", "^REV32v", "^REV64v")>; // ASIMD table lookup, D-form @@ -610,135 +1526,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01], def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; // ASIMD transfer, element to word or word -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; // ASIMD transfer, element to gen reg -def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>; // ASIMD transfer gen reg to element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; // ASIMD transpose def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", - "^UZP1v", "^UZP2v")>; + "^UZP1v", "^UZP2v")>; // ASIMD unzip/zip def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; //-- -// 3.15 ASIMD Load Instructions +// 3.15 ASIMD Load Instructions //-- // ASIMD load, 1 element, multiple, 1 reg, D-form // ASIMD load, 1 element, multiple, 1 reg, Q-form -def : InstRW<[THX2T99Write_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 2 reg, D-form // ASIMD load, 1 element, multiple, 2 reg, Q-form -def : InstRW<[THX2T99Write_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, 
WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 3 reg, D-form // ASIMD load, 1 element, multiple, 3 reg, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01], +def : InstRW<[THX2T99Write_5Cyc_LS01], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 4 reg, D-form // ASIMD load, 1 element, multiple, 4 reg, Q-form -def : InstRW<[THX2T99Write_6Cyc_LS01], +def : InstRW<[THX2T99Write_6Cyc_LS01], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, one lane, B/H/S // ASIMD load, 1 element, one lane, D def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1i(8|16|32|64)_POST$")>; // ASIMD load, 1 element, all lanes, D-form, B/H/S // ASIMD load, 1 element, all lanes, D-form, D // ASIMD load, 1 element, all lanes, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, multiple, D-form, B/H/S // ASIMD load, 2 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, one lane, B/H // ASIMD load, 2 element, one lane, S // ASIMD load, 2 element, one lane, D def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2i(8|16|32|64)_POST$")>; // ASIMD load, 2 element, all lanes, D-form, B/H/S // ASIMD load, 2 element, all lanes, D-form, D // ASIMD load, 2 element, all lanes, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, multiple, D-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, one lone, B/H // ASIMD load, 3 element, one lane, S // ASIMD load, 3 element, one lane, D def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex 
"^LD3i(8|16|32|64)_POST$")>; // ASIMD load, 3 element, all lanes, D-form, B/H/S // ASIMD load, 3 element, all lanes, D-form, D // ASIMD load, 3 element, all lanes, Q-form, B/H/S // ASIMD load, 3 element, all lanes, Q-form, D -def : InstRW<[THX2T99Write_7Cyc_LS01_F01], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, multiple, D-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, one lane, B/H // ASIMD load, 4 element, one lane, S // ASIMD load, 4 element, one lane, D def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4i(8|16|32|64)_POST$")>; // ASIMD load, 4 element, all lanes, D-form, B/H/S // ASIMD load, 4 element, all lanes, D-form, D // ASIMD load, 4 element, all lanes, Q-form, B/H/S // ASIMD load, 4 element, all lanes, Q-form, D -def : InstRW<[THX2T99Write_6Cyc_LS01_F01], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; //-- @@ -747,106 +1663,83 @@ def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], // ASIMD store, 1 element, multiple, 1 reg, D-form // ASIMD store, 1 element, multiple, 1 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 2 reg, D-form // ASIMD store, 1 element, multiple, 2 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 3 reg, D-form // ASIMD store, 1 element, multiple, 3 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 4 reg, D-form // ASIMD store, 1 element, multiple, 4 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, one lane, B/H/S // ASIMD store, 1 element, one lane, D -def : 
InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST1i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST1i(8|16|32|64)_POST$")>; // ASIMD store, 2 element, multiple, D-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 2 element, one lane, B/H/S // ASIMD store, 2 element, one lane, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2i(8|16|32|64)_POST$")>; // ASIMD store, 3 element, multiple, D-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 3 element, one lane, B/H // ASIMD store, 3 element, one lane, S // ASIMD store, 3 element, one lane, D def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3i(8|16|32|64)_POST$")>; // ASIMD store, 4 element, multiple, D-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 4 element, one lane, B/H // ASIMD store, 4 element, one lane, S // ASIMD store, 4 element, one lane, D def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4i(8|16|32|64)_POST$")>; -//-- -// 3.17 Cryptography Extensions -//-- - -// Crypto AES ops -def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>; - -// Crypto polynomial (64x64) multiply long -def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; - -// Crypto SHA1 xor ops -// Crypto SHA1 schedule acceleration ops -// Crypto SHA256 schedule acceleration op (1 u-op) -// Crypto SHA256 schedule acceleration op (2 u-ops) -// Crypto SHA256 hash acceleration ops -def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>; - -//-- -// 3.18 CRC -//-- - -// CRC checksum ops -def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>; - } // SchedModel = ThunderX2T99Model + diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6660f0babb8a..1252f9403812 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ 
b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -167,6 +167,8 @@ extern "C" void LLVMInitializeAArch64Target() { static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) return llvm::make_unique<AArch64_MachoTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return llvm::make_unique<AArch64_COFFTargetObjectFile>(); return llvm::make_unique<AArch64_ELFTargetObjectFile>(); } @@ -179,6 +181,8 @@ static std::string computeDataLayout(const Triple &TT, return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; + if (TT.isOSBinFormatCOFF()) + return "e-m:w-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 2c75a3258c1c..fefa7e26b79f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,7 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 47e3bce43f6e..9077eb7902fd 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -45,6 +45,9 @@ public: const TargetMachine &TM) const override; }; +/// This implementation is used for AArch64 COFF targets. +class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {}; + } // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4328682b93c..a76f080530bb 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -20,6 +20,23 @@ using namespace llvm; #define DEBUG_TYPE "aarch64tti" +static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", + cl::init(true), cl::Hidden); + +bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Inline a callee if its target-features are a subset of the callers + // target-features. + return (CallerBits & CalleeBits) == CalleeBits; +} + /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. @@ -631,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { return ST->getMaxInterleaveFactor(); } -void AArch64TTIImpl::getUnrollingPreferences(Loop *L, +// For Falkor, we want to avoid having too many strided loads in a loop since +// that can exhaust the HW prefetcher resources. We adjust the unroller +// MaxCount preference below to attempt to ensure unrolling doesn't create too +// many strided loads. 
+static void +getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TargetTransformInfo::UnrollingPreferences &UP) { + enum { MaxStridedLoads = 7 }; + auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { + int StridedLoads = 0; + // FIXME? We could make this more precise by looking at the CFG and + // e.g. not counting loads in each side of an if-then-else diamond. + for (const auto BB : L->blocks()) { + for (auto &I : *BB) { + LoadInst *LMemI = dyn_cast<LoadInst>(&I); + if (!LMemI) + continue; + + Value *PtrValue = LMemI->getPointerOperand(); + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + // FIXME? We could take pairing of unrolled load copies into account + // by looking at the AddRec, but we would probably have to limit this + // to loops with no stores or other memory optimization barriers. + ++StridedLoads; + // We've seen enough strided loads that seeing more won't make a + // difference. + if (StridedLoads > MaxStridedLoads / 2) + return StridedLoads; + } + } + return StridedLoads; + }; + + int StridedLoads = countStridedLoads(L, SE); + DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads + << " strided loads\n"); + // Pick the largest power of 2 unroll count that won't result in too many + // strided loads. + if (StridedLoads) { + UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); + DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount + << '\n'); + } +} + +void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Enable partial unrolling and runtime unrolling. - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); // For inner loop, it is more likely to be a hot one, and the runtime check // can be promoted out from LICM pass, so the overhead is less, let's try @@ -644,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, // Disable partial & runtime unrolling on -Os. 
UP.PartialOptSizeThreshold = 0; + + if (ST->getProcFamily() == AArch64Subtarget::Falkor && + EnableFalkorHWPFUnrollFix) + getFalkorUnrollingPreferences(L, SE, UP); } Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 290a1ca1f24b..31c037354925 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -51,6 +51,9 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + /// \name Scalar TTI Implementations /// @{ @@ -119,7 +122,8 @@ public: int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 3d075018904c..475f91016840 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -541,14 +541,13 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; -void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCFixup &Fixup, - const MCValue &Target, - bool &IsResolved) { +bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. For example: @@ -562,11 +561,24 @@ void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! 
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) - IsResolved = false; + return true; + return false; } } +namespace { +class COFFAArch64AsmBackend : public AArch64AsmBackend { +public: + COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple) + : AArch64AsmBackend(T, /*IsLittleEndian*/true) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAArch64WinCOFFObjectWriter(OS); + } +}; +} + MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, @@ -575,7 +587,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, if (TheTriple.isOSBinFormatMachO()) return new DarwinAArch64AsmBackend(T, MRI); - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + if (TheTriple.isOSBinFormatCOFF()) + return new COFFAArch64AsmBackend(T, TheTriple); + + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); bool IsILP32 = Options.getABIName() == "ilp32"; return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index f7dda92fb551..89c3e5b4c76e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -#define R_CLS(rtype) \ - IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype -#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ - "supported (LP64 eqv: " #lp64rtype ")" +#define R_CLS(rtype) \ + IsILP32 ? 
ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype +#define BAD_ILP32_MOV(lp64rtype) \ + "ILP32 absolute MOV relocation not " \ + "supported (LP64 eqv: " #lp64rtype ")" // assumes IsILP32 is true static bool isNonILP32reloc(const MCFixup &Fixup, @@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup, MCContext &Ctx) { if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) return false; - switch(RefKind) { - case AArch64MCExpr::VK_ABS_G3: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); - return true; - case AArch64MCExpr::VK_ABS_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); - return true; - case AArch64MCExpr::VK_ABS_G2_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); - return true; - case AArch64MCExpr::VK_ABS_G2_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); - return true; - case AArch64MCExpr::VK_ABS_G1_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); - return true; - case AArch64MCExpr::VK_ABS_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); - return true; - case AArch64MCExpr::VK_DTPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); - return true; - case AArch64MCExpr::VK_DTPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); - return true; - case AArch64MCExpr::VK_TPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); - return true; - case AArch64MCExpr::VK_TPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); - return true; - case AArch64MCExpr::VK_GOTTPREL_G1: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); - return true; - case AArch64MCExpr::VK_GOTTPREL_G0_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); - return true; - default: return false; + switch (RefKind) { + case AArch64MCExpr::VK_ABS_G3: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); + return true; + case AArch64MCExpr::VK_ABS_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); + return true; + case AArch64MCExpr::VK_ABS_G1_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); + return true; + case AArch64MCExpr::VK_ABS_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); + return true; + case AArch64MCExpr::VK_DTPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); + return true; + case AArch64MCExpr::VK_DTPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_TPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); + return true; + case AArch64MCExpr::VK_TPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G1: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G0_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); + return true; + default: + return false; } return false; } @@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(PREL32); case FK_Data_8: if (IsILP32) { - 
Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data " + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte PC relative data " "relocation not supported (LP64 eqv: PREL64)"); return ELF::R_AARCH64_NONE; } else @@ -178,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) - return ELF::R_AARCH64_NONE; + return ELF::R_AARCH64_NONE; switch ((unsigned)Fixup.getKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); @@ -189,8 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(ABS32); case FK_Data_8: if (IsILP32) { - Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte absolute data " - "relocation not supported (LP64 eqv: ABS64)"); + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte absolute data " + "relocation not supported (LP64 eqv: ABS64)"); return ELF::R_AARCH64_NONE; } else return ELF::R_AARCH64_ABS64; @@ -262,7 +266,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte unchecked GOT load/store relocation " - "not supported (ILP32 eqv: LD32_GOT_LO12_NC"); + "not supported (ILP32 eqv: LD32_GOT_LO12_NC"); return ELF::R_AARCH64_NONE; } } @@ -270,12 +274,12 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32) { Ctx.reportError(Fixup.getLoc(), "ILP32 4 byte checked GOT load/store relocation " - "not supported (unchecked eqv: LD32_GOT_LO12_NC)"); + "not supported (unchecked eqv: LD32_GOT_LO12_NC)"); } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte checked GOT load/store relocation " - "not supported (unchecked/ILP32 eqv: " - "LD32_GOT_LO12_NC)"); + "not supported (unchecked/ILP32 eqv: " + "LD32_GOT_LO12_NC)"); } return ELF::R_AARCH64_NONE; } @@ -283,7 +287,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32) { return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC; } else { - Ctx.reportError(Fixup.getLoc(), "LP64 32-bit load/store " + Ctx.reportError(Fixup.getLoc(), + "LP64 32-bit load/store " "relocation not supported (ILP32 eqv: " "TLSIE_LD32_GOTTPREL_LO12_NC)"); return ELF::R_AARCH64_NONE; @@ -295,14 +300,14 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte TLSDESC load/store relocation " - "not supported (ILP32 eqv: TLSDESC_LD64_LO12)"); + "not supported (ILP32 eqv: TLSDESC_LD64_LO12)"); return ELF::R_AARCH64_NONE; } } Ctx.reportError(Fixup.getLoc(), "invalid fixup for 32-bit load/store instruction " - "fixup_aarch64_ldst_imm12_scale4"); + "fixup_aarch64_ldst_imm12_scale4"); return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) @@ -312,8 +317,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_LD64_GOT_LO12_NC; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "LD64_GOT_LO12_NC)"); + "relocation not supported (LP64 eqv: " + "LD64_GOT_LO12_NC)"); return ELF::R_AARCH64_NONE; } } @@ -330,8 +335,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "TLSIE_LD64_GOTTPREL_LO12_NC)"); + "relocation not supported (LP64 eqv: " + "TLSIE_LD64_GOTTPREL_LO12_NC)"); return ELF::R_AARCH64_NONE; } } @@ -340,8 +345,8 @@ unsigned 
AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_TLSDESC_LD64_LO12; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "TLSDESC_LD64_LO12)"); + "relocation not supported (LP64 eqv: " + "TLSDESC_LD64_LO12)"); return ELF::R_AARCH64_NONE; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 031aa8b81e35..a0de3c39562b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" +#include "AArch64WinCOFFStreamer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -30,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -210,6 +212,8 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSBinFormatELF()) return new AArch64TargetELFStreamer(S); + if (TT.isOSBinFormatCOFF()) + return new AArch64TargetWinCOFFStreamer(S); return nullptr; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 0f5b765c7697..4293dcba955e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -16,53 +16,47 @@ namespace llvm { namespace AArch64 { enum Fixups { - // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. + // A 21-bit pc-relative immediate inserted into an ADR instruction. fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, - // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. + // A 21-bit pc-relative immediate inserted into an ADRP instruction. fixup_aarch64_pcrel_adrp_imm21, - // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. + // 12-bit fixup for add/sub instructions. No alignment adjustment. All value + // bits are encoded. fixup_aarch64_add_imm12, - // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. + // unsigned 12-bit fixups for load and store instructions. fixup_aarch64_ldst_imm12_scale1, fixup_aarch64_ldst_imm12_scale2, fixup_aarch64_ldst_imm12_scale4, fixup_aarch64_ldst_imm12_scale8, fixup_aarch64_ldst_imm12_scale16, - // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and + // generates relocations directly when necessary. fixup_aarch64_ldr_pcrel_imm19, // FIXME: comment fixup_aarch64_movw, - // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. + // The high 14 bits of a 21-bit pc-relative immediate. 
fixup_aarch64_pcrel_branch14, - // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates + // relocations directly when necessary. fixup_aarch64_pcrel_branch19, - // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. + // The high 26 bits of a 28-bit pc-relative immediate. fixup_aarch64_pcrel_branch26, - // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. + // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from + // branch26 only on ELF. fixup_aarch64_pcrel_call26, - // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. + // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation. fixup_aarch64_tlsdesc_call, // Marker diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 1b28df963b40..fc808ee0cdd6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -100,3 +100,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { HasIdentDirective = true; } + +AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { + CommentString = ";"; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 253cd30f26ee..2d7107a37244 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H +#include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF { explicit AArch64MCAsmInfoELF(const Triple &T); }; +struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF { + explicit AArch64MCAsmInfoCOFF(); +}; + } // namespace llvm #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f710065d9bc7..a2555496cdb9 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -14,6 +14,7 @@ #include "AArch64MCTargetDesc.h" #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" +#include "AArch64WinCOFFStreamer.h" #include "InstPrinter/AArch64InstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) MAI = new AArch64MCAsmInfoDarwin(); + else if (TheTriple.isOSBinFormatCOFF()) + MAI = new AArch64MCAsmInfoCOFF(); else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); MAI = new AArch64MCAsmInfoELF(TheTriple); } @@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, CodeModel::Model &CM) { - 
assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); + assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() || + TT.isOSBinFormatCOFF()) && "Invalid target"); if (CM == CodeModel::Default) CM = CodeModel::Small; @@ -122,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, /*LabelSections*/ true); } +static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + IncrementalLinkerCompatible); +} + static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } @@ -154,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { // Register the obj streamers. TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer); // Register the obj target streamer. TargetRegistry::RegisterObjectTargetStreamer( diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 615d7dab2c51..1404926b8124 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType, uint32_t CPUSubtype); +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS); + MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp new file mode 100644 index 000000000000..7862a03e771c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -0,0 +1,65 @@ +//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===---------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> + +using namespace llvm; + +namespace { + +class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + AArch64WinCOFFObjectWriter() + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { + } + + ~AArch64WinCOFFObjectWriter() override = default; + + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, + const MCAsmBackend &MAB) const override; + + bool recordRelocation(const MCFixup &) const override; +}; + +} // end anonymous namespace + +unsigned +AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +} + +bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return true; +} + +namespace llvm { + +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) { + MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter(); + return createWinCOFFObjectWriter(MOTW, OS); +} + +} // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp new file mode 100644 index 000000000000..6c8da27e398f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -0,0 +1,37 @@ +//===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64WinCOFFStreamer.h" + +using namespace llvm; + +namespace { + +class AArch64WinCOFFStreamer : public MCWinCOFFStreamer { +public: + friend class AArch64TargetWinCOFFStreamer; + + AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, CE, OS) {} +}; +} // end anonymous namespace + +namespace llvm { +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; +} + +} // end llvm namespace diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h new file mode 100644 index 000000000000..1b4fcd6804e2 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h @@ -0,0 +1,43 @@ +//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements WinCOFF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H + +#include "AArch64TargetStreamer.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +namespace { +class AArch64WinCOFFStreamer; + +class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer { +private: + AArch64WinCOFFStreamer &getStreamer(); + +public: + AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S) + : AArch64TargetStreamer(S) {} +}; + +} // end anonymous namespace + +namespace llvm { + +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); +} // end llvm namespace + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt index 6d8be5e63fbb..56eeba8a1d4b 100644 --- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -8,6 +8,8 @@ add_llvm_library(LLVMAArch64Desc AArch64MCTargetDesc.cpp AArch64MachObjectWriter.cpp AArch64TargetStreamer.cpp + AArch64WinCOFFObjectWriter.cpp + AArch64WinCOFFStreamer.cpp ) add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 55d18c3f3646..5a799b2d88d0 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -36,7 +36,6 @@ FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes -FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 7494e5decd6f..f1d899c4d003 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ 
b/lib/Target/AMDGPU/AMDGPU.td @@ -262,8 +262,8 @@ def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" >; -def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", - "HasSDWAClampVOPC", +def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", + "HasSDWAOutModsVOPC", "true", "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" >; @@ -452,7 +452,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP ] >; diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 2071b6f157cd..9a391d06c9ea 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1776,7 +1776,7 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, E = EndMBB->succ_end(); PI != E; ++PI) { // Either we have a back-edge to the entry block, or a back-edge to the - // succesor of the entry block since the block may be split. + // successor of the entry block since the block may be split. if ((*PI) != StartMBB && !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { Succs.insert( @@ -1831,7 +1831,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( IfBB->addSuccessor(CodeBBStart); DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); - // Ensure that the MergeBB is a succesor of the CodeEndBB. + // Ensure that the MergeBB is a successor of the CodeEndBB. 
if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ab5abf2039a5..be47b900c6f0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -128,7 +128,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), - HasSDWAClampVOPC(false), + HasSDWAOutModsVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 2b16289c723e..22cede59086a 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -153,7 +153,7 @@ protected: bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; - bool HasSDWAClampVOPC; + bool HasSDWAOutModsVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -452,8 +452,8 @@ public: return HasSDWAMac; } - bool hasSDWAClampVOPC() const { - return HasSDWAClampVOPC; + bool hasSDWAOutModsVOPC() const { + return HasSDWAOutModsVOPC; } /// \brief Returns the offset in bytes from the start of the input buffer diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 04fe9f689806..425fd35d47de 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -720,7 +720,6 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions } addPass(createSinkingPass()); - addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 88245b01683a..89a03902dc69 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -63,7 +63,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, return false; } -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. 
UP.MaxCount = UINT_MAX; diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 485e20411ab4..9a320bdfcc3d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -68,7 +68,8 @@ public: bool hasBranchDivergence() { return true; } - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index e30844f082cd..917d9cfa6905 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -96,7 +96,6 @@ add_llvm_target(AMDGPUCodeGen SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp - SITypeRewriter.cpp SIWholeQuadMode.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 04308fb3aaf6..f26e49295e69 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -626,7 +626,9 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, using namespace AMDGPU::SDWA; if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + // XXX: static_cast<int> is needed to avoid stupid warning: + // compare with unsigned is always true + if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) && Val <= SDWA9EncValues::SRC_VGPR_MAX) { return createRegOperand(getVgprClassId(Width), Val - SDWA9EncValues::SRC_VGPR_MIN); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index d0f4e00994de..d39b345bdf03 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4314,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cmdmask_b32 to be deserialized. 
+static bool isBoolSGPR(SDValue V) { + if (V.getValueType() != MVT::i1) + return false; + switch (V.getOpcode()) { + default: break; + case ISD::SETCC: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case AMDGPUISD::FP_CLASS: + return true; + } + return false; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -4402,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } + if (VT == MVT::i32 && + (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { + // and x, (sext cc from i1) => select cc, x, 0 + if (RHS.getOpcode() != ISD::SIGN_EXTEND) + std::swap(LHS, RHS); + if (isBoolSGPR(RHS.getOperand(0))) + return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), + LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); + } + return SDValue(); } @@ -4941,8 +4968,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { auto Cond = RHS.getOperand(0); - if (Cond.getOpcode() != ISD::SETCC && - Cond.getOpcode() != AMDGPUISD::FP_CLASS) + if (!isBoolSGPR(Cond)) break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; @@ -5109,6 +5135,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + + auto CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) { + CRHS = dyn_cast<ConstantSDNode>(LHS); + if (CRHS) { + std::swap(LHS, RHS); + CC = getSetCCSwappedOperands(CC); + } + } + + if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && VT != MVT::f16)) @@ -5116,7 +5171,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, // Match isinf pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); if (!CRHS) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index c9b48fea7225..b6784ec14e9f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -770,7 +770,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -871,7 +871,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -2444,8 +2444,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - if ( DstIdx == -1) - DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; @@ -2488,14 +2486,20 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; return false; } - } else if (!ST.hasSDWAClampVOPC()) { + } else if (!ST.hasSDWAOutModsVOPC()) { // No clamp allowed on GFX9 for VOPC const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); - if (Clamp != nullptr && - (!Clamp->isImm() || Clamp->getImm() != 0)) { + if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; return false; } + + // No omod allowed on GFX9 for VOPC + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; + return false; + } } } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 3b4a8b5d1e81..4a81fb3b463a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -336,6 +336,10 @@ def NegSubInlineConst16 : ImmLeaf<i16, [{ return Imm < -16 && Imm >= -64; }], NegateImm>; +def ShiftAmt32Imm : PatLeaf <(imm), [{ + return N->getZExtValue() < 32; +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 3b4bdc864253..bcc685015cf5 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -929,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; +def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + +def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 4ac23ef03cb3..e2ac6631d2f3 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -627,10 +627,13 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, return false; } - if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + if (!ST.hasSDWAOutModsVOPC() && + (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) return 
false; - } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || + !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { return false; } @@ -649,25 +652,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode())); assert(SDWAOpcode != -1); - // Copy dst, if it is present in original then should also be present in SDWA - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (!Dst && !TII->isVOPC(MI)) - return false; - const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); // Create SDWA version of instruction MI and initialize its operands MachineInstrBuilder SDWAInst = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + // Copy dst, if it is present in original then should also be present in SDWA + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); if (Dst) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); SDWAInst.add(*Dst); - } else { - Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { assert(Dst && AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); SDWAInst.add(*Dst); + } else { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.addReg(AMDGPU::VCC, RegState::Define); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -714,20 +716,22 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } // Copy omod if present, initialize otherwise if needed - MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); - if (OMod) { - assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); - SDWAInst.add(*OMod); - } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { - SDWAInst.addImm(0); + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + SDWAInst.add(*OMod); + } else { + SDWAInst.addImm(0); + } } - // Initialize dst_sel and dst_unused if present - if (Dst) { - assert( - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 && - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1); + // Initialize dst_sel if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Initialize dst_unused if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); } diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp deleted file mode 100644 index aad68537f779..000000000000 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass removes performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. 
This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor<SITypeRewriter> { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return "SI Type Rewriter"; } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - if (!AMDGPU::isShader(F.getCallingConv())) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector <Value*, 8> Args; - SmallVector <Type*, 8> Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - if (!F) - return; - - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast<InsertElementInst>(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - 
NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - } - - if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 6f67183df6a1..c40b4450a5b5 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", "Has return address stack">; +// Some processors have no branch predictor, which changes the expected cost of +// taking a branch which affects the choice of whether to use predicated +// instructions. +def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", + "HasBranchPredictor", "false", + "Has no branch predictor">; + /// DSP extension. def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Supports DSP instructions in ARM and/or Thumb2">; @@ -262,6 +269,10 @@ def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; +def FeatureExecuteOnly + : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", + "Enable the generation of execute only code.">; + def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", "Reserve R9, making it unavailable as " "GPR">; @@ -540,7 +551,7 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; // // Dummy CPU, used to target architectures -def : ProcNoItin<"generic", []>; +def : ProcessorModel<"generic", CortexA8Model, []>; def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; @@ -756,13 +767,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>; -def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>; +def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; + +def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; -def : ProcNoItin<"cortex-m4", [ARMv7em, +def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, - FeatureD16]>; + FeatureD16, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureFPARMv8, @@ -771,11 +788,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcNoItin<"cortex-m33", [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, - FeatureVFPOnlySP]>; + FeatureVFPOnlySP, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e0810c358f2d..1ec6b24b2ed6 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &, +isProfitableToIfCvt(MachineBasicBlock &TBB, unsigned TCycles, unsigned TExtra, - MachineBasicBlock 
&, + MachineBasicBlock &FBB, unsigned FCycles, unsigned FExtra, BranchProbability Probability) const { if (!TCycles) @@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &, // Here we scale up each component of UnpredCost to avoid precision issue when // scaling TCycles/FCycles by Probability. const unsigned ScalingUpFactor = 1024; - unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); - unsigned FUnpredCost = + + unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor; + unsigned UnpredCost; + if (!Subtarget.hasBranchPredictor()) { + // When we don't have a branch predictor it's always cheaper to not take a + // branch than take it, so we have to take that into account. + unsigned NotTakenBranchCost = 1; + unsigned TakenBranchCost = Subtarget.getMispredictionPenalty(); + unsigned TUnpredCycles, FUnpredCycles; + if (!FCycles) { + // Triangle: TBB is the fallthrough + TUnpredCycles = TCycles + NotTakenBranchCost; + FUnpredCycles = TakenBranchCost; + } else { + // Diamond: TBB is the block that is branched to, FBB is the fallthrough + TUnpredCycles = TCycles + TakenBranchCost; + FUnpredCycles = FCycles + NotTakenBranchCost; + } + // The total cost is the cost of each path scaled by their probabilites + unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor); + UnpredCost = TUnpredCost + FUnpredCost; + // When predicating assume that the first IT can be folded away but later + // ones cost one cycle each + if (Subtarget.isThumb2() && TCycles + FCycles > 4) { + PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor; + } + } else { + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FCycles * ScalingUpFactor); - unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1 * ScalingUpFactor; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + UnpredCost = TUnpredCost + FUnpredCost; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + } - return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; + return PredCost <= UnpredCost; } bool diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 2bcc707e9fc3..e42514acd76f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7580,6 +7580,9 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), dl, MVT::i32); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + if (isBigEndian) + std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); @@ -7607,10 +7610,14 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_1 : ARM::gsub_0, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 423f97ccacd6..891a8f482f0a 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1416,12 +1416,12 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def tTBB_JT : tPseudoInst<(outs), - (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, - Sched<[WriteBr]>; + (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, + IIC_Br, []>, Sched<[WriteBr]>; def tTBH_JT : tPseudoInst<(outs), - (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, - Sched<[WriteBr]>; + (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, + IIC_Br, []>, Sched<[WriteBr]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 4cb0eca5ee5f..374176d1d737 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -46,6 +46,10 @@ private: MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) const; + bool selectSelect(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const; + const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; const ARMBaseTargetMachine &TM; @@ -346,6 +350,50 @@ bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, return true; } +bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, + const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + auto &MBB = *MIB->getParent(); + auto InsertBefore = std::next(MIB->getIterator()); + auto &DebugLoc = MIB->getDebugLoc(); + + // Compare the condition to 0. + auto CondReg = MIB->getOperand(1).getReg(); + assert(MRI.getType(CondReg).getSizeInBits() == 1 && + RBI.getRegBank(CondReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + "Unsupported types for select operation"); + auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPri)) + .addUse(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Move a value into the result register based on the result of the + // comparison. 
+ auto ResReg = MIB->getOperand(0).getReg(); + auto TrueReg = MIB->getOperand(2).getReg(); + auto FalseReg = MIB->getOperand(3).getReg(); + assert(MRI.getType(ResReg) == MRI.getType(TrueReg) && + MRI.getType(TrueReg) == MRI.getType(FalseReg) && + MRI.getType(FalseReg).getSizeInBits() == 32 && + RBI.getRegBank(TrueReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + RBI.getRegBank(FalseReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + "Unsupported types for select operation"); + auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCr)) + .addDef(ResReg) + .addUse(TrueReg) + .addUse(FalseReg) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; +} + bool ARMInstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -448,6 +496,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } case G_ICMP: return selectICmp(MIB, TII, MRI, TRI, RBI); + case G_SELECT: + return selectSelect(MIB, TII, MRI, TRI, RBI); case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 5873c7fb3872..f3e62d09cc30 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -55,10 +55,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (unsigned Op : {G_SDIV, G_UDIV}) { for (auto Ty : {s8, s16}) - // FIXME: We need WidenScalar here, but in the case of targets with - // software division we'll also need Libcall afterwards. Treat as Custom - // until we have better support for chaining legalization actions. - setAction({Op, Ty}, Custom); + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Legal); else @@ -84,6 +81,10 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); + setAction({G_SELECT, s32}, Legal); + setAction({G_SELECT, p0}, Legal); + setAction({G_SELECT, 1, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); setAction({G_ICMP, s1}, Legal); @@ -118,40 +119,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, switch (MI.getOpcode()) { default: return false; - case G_SDIV: - case G_UDIV: { - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8)) - return false; - - // We need to widen to 32 bits and then maybe, if the target requires, - // transform into a libcall. - LegalizerHelper Helper(MIRBuilder.getMF()); - - MachineInstr *NewMI = nullptr; - Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { - // Store the new, 32-bit div instruction. 
- if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV) - NewMI = MI; - }); - - auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32)); - Helper.MIRBuilder.stopRecordingInsertions(); - if (Result == LegalizerHelper::UnableToLegalize) { - return false; - } - assert(NewMI && "Couldn't find widened instruction"); - assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) && - "Unexpected widened instruction"); - assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 && - "Unexpected type for the widened instruction"); - - Result = Helper.legalizeInstrStep(*NewMI); - if (Result == LegalizerHelper::UnableToLegalize) { - return false; - } - return true; - } case G_SREM: case G_UREM: { unsigned OriginalResult = MI.getOperand(0).getReg(); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 2350d0c6ef69..11fb81a4f9fe 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -255,6 +255,18 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; + case G_SELECT: { + LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); + (void)Ty2; + assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); + assert(Ty2.getSizeInBits() == 1 && "Unsupported size for G_SELECT"); + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx]}); + break; + } case G_ICMP: { LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); (void)Ty2; diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 02cbfb1fa9f1..b10583bc7983 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -245,6 +245,10 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { // the general GPR register class above (MOV, e.g.) def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>; +// Thumb registers R0-R7 and the PC. Some instructions like TBB or THH allow +// the PC to be used as a destination operand as well. +def tGPRwithpc : RegisterClass<"ARM", [i32], 32, (add tGPR, PC)>; + // The high registers in thumb mode, R8-R15. def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>; diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 1c7902520f2d..53e012f13ee2 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -424,3 +424,4 @@ include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" include "ARMScheduleA57.td" +include "ARMScheduleM3.td" diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td new file mode 100644 index 000000000000..93f8299f9bd0 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleM3.td @@ -0,0 +1,21 @@ +//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex-M3 processor. 
+// +//===----------------------------------------------------------------------===// + +def CortexM3Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + + let CompleteModel = 0; +} diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index d9d0c27c6304..2c42a1336166 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "ARM.h" + +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "ARMCallLowering.h" +#include "ARMLegalizerInfo.h" +#include "ARMRegisterBankInfo.h" +#endif #include "ARMSubtarget.h" #include "ARMFrameLowering.h" #include "ARMInstrInfo.h" @@ -23,6 +30,13 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#endif #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -78,11 +92,6 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -/// EnableExecuteOnly - Enables the generation of execute-only code on supported -/// targets -static cl::opt<bool> -EnableExecuteOnly("arm-execute-only"); - ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, StringRef FS) { ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS); @@ -92,13 +101,41 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, return new ARMFrameLowering(STI); } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct ARMGISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps), - GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle), - TargetTriple(TT), Options(TM.Options), TM(TM), - FrameLowering(initializeFrameLowering(CPU, FS)), + CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), + TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. InstrInfo(isThumb1Only() @@ -106,7 +143,29 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, : !isThumb() ? 
(ARMBaseInstrInfo *)new ARMInstrInfo(*this) : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), - TLInfo(TM, *this) {} + TLInfo(TM, *this) { + assert((isThumb() || hasARMOps()) && + "Target must either be thumb or support ARM operations!"); + +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor(); + GISel->CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new ARMLegalizerInfo(*this)); + + auto *RBI = new ARMRegisterBankInfo(*getRegisterInfo()); + + // FIXME: At this point, we can't rely on Subtarget having RBI. + // It's awkward to mix passing RBI and the Subtarget; should we pass + // TII/TRI as well? + GISel->InstSelector.reset(createARMInstructionSelector( + *static_cast<const ARMBaseTargetMachine *>(&TM), *this, *RBI)); + + GISel->RegBankInfo.reset(RBI); +#endif + setGISelAccessor(*GISel); +} const CallLowering *ARMSubtarget::getCallLowering() const { assert(GISel && "Access to GlobalISel APIs not set"); diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index d890d0fa777e..e15b17512c96 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -246,6 +246,11 @@ protected: /// avoid issue "normal" call instructions to callees which do not return. bool HasRetAddrStack = false; + /// HasBranchPredictor - True if the subtarget has a branch predictor. Having + /// a branch predictor or not changes the expected cost of taking a branch + /// which affects the choice of whether to use predicated instructions. + bool HasBranchPredictor = true; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). bool HasMPExtension = false; @@ -554,6 +559,7 @@ public: bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } + bool hasBranchPredictor() const { return HasBranchPredictor; } bool hasMPExtension() const { return HasMPExtension; } bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index eb71e557ec91..c323a1d368de 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -11,11 +11,6 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMCallLowering.h" -#include "ARMLegalizerInfo.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL -#include "ARMRegisterBankInfo.h" -#endif #include "ARMSubtarget.h" #include "ARMMacroFusion.h" #include "ARMTargetMachine.h" @@ -29,7 +24,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDepsFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" @@ -110,60 +104,20 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static ARMBaseTargetMachine::ARMABI computeTargetABI(const Triple &TT, StringRef CPU, const TargetOptions &Options) { - if (Options.MCOptions.getABIName() == "aapcs16") + StringRef ABIName = Options.MCOptions.getABIName(); + + if (ABIName.empty()) + ABIName = ARM::computeDefaultTargetABI(TT, CPU); + + if (ABIName == 
"aapcs16") return ARMBaseTargetMachine::ARM_ABI_AAPCS16; - else if (Options.MCOptions.getABIName().startswith("aapcs")) + else if (ABIName.startswith("aapcs")) return ARMBaseTargetMachine::ARM_ABI_AAPCS; - else if (Options.MCOptions.getABIName().startswith("apcs")) + else if (ABIName.startswith("apcs")) return ARMBaseTargetMachine::ARM_ABI_APCS; - assert(Options.MCOptions.getABIName().empty() && - "Unknown target-abi option!"); - - ARMBaseTargetMachine::ARMABI TargetABI = - ARMBaseTargetMachine::ARM_ABI_UNKNOWN; - - unsigned ArchKind = ARM::parseCPUArch(CPU); - StringRef ArchName = ARM::getArchName(ArchKind); - // FIXME: This is duplicated code from the front end and should be unified. - if (TT.isOSBinFormatMachO()) { - if (TT.getEnvironment() == Triple::EABI || - (TT.getOS() == Triple::UnknownOS && TT.isOSBinFormatMachO()) || - ARM::parseArchProfile(ArchName) == ARM::PK_M) { - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - } else if (TT.isWatchABI()) { - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; - } else { - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - } - } else if (TT.isOSWindows()) { - // FIXME: this is invalid for WindowsCE - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - } else { - // Select the default based on the platform. - switch (TT.getEnvironment()) { - case Triple::Android: - case Triple::GNUEABI: - case Triple::GNUEABIHF: - case Triple::MuslEABI: - case Triple::MuslEABIHF: - case Triple::EABIHF: - case Triple::EABI: - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - break; - case Triple::GNU: - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - break; - default: - if (TT.isOSNetBSD()) - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - else - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - break; - } - } - - return TargetABI; + llvm_unreachable("Unhandled/unknown ABI Name!"); + return ARMBaseTargetMachine::ARM_ABI_UNKNOWN; } static std::string computeDataLayout(const Triple &TT, StringRef CPU, @@ -248,61 +202,39 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM, OL), TargetABI(computeTargetABI(TT, CPU, Options)), - TLOF(createTLOF(getTargetTriple())), - Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) { + TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { // Default to triple-appropriate float ABI - if (Options.FloatABIType == FloatABI::Default) - this->Options.FloatABIType = - Subtarget.isTargetHardFloat() ? 
FloatABI::Hard : FloatABI::Soft; + if (Options.FloatABIType == FloatABI::Default) { + if (TargetTriple.getEnvironment() == Triple::GNUEABIHF || + TargetTriple.getEnvironment() == Triple::MuslEABIHF || + TargetTriple.getEnvironment() == Triple::EABIHF || + TargetTriple.isOSWindows() || + TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) + this->Options.FloatABIType = FloatABI::Hard; + else + this->Options.FloatABIType = FloatABI::Soft; + } // Default to triple-appropriate EABI if (Options.EABIVersion == EABI::Default || Options.EABIVersion == EABI::Unknown) { // musl is compatible with glibc with regard to EABI version - if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI()) + if ((TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF || + TargetTriple.getEnvironment() == Triple::MuslEABI || + TargetTriple.getEnvironment() == Triple::MuslEABIHF) && + !(TargetTriple.isOSWindows() || TargetTriple.isOSDarwin())) this->Options.EABIVersion = EABI::GNU; else this->Options.EABIVersion = EABI::EABI5; } initAsmInfo(); - if (!Subtarget.isThumb() && !Subtarget.hasARMOps()) - report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " - "support ARM mode execution!"); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct ARMGISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - const ARMSubtarget * ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -334,24 +266,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. resetTargetOptions(F); I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle); - -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor(); - GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new ARMLegalizerInfo(*I)); - - auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo()); - - // FIXME: At this point, we can't rely on Subtarget having RBI. - // It's awkward to mix passing RBI and the Subtarget; should we pass - // TII/TRI as well? 
- GISel->InstSelector.reset(createARMInstructionSelector(*this, *I, *RBI)); - - GISel->RegBankInfo.reset(RBI); -#endif - I->setGISelAccessor(*GISel); } return I.get(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 2fcee73228fe..f41da3e8e223 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -36,7 +36,6 @@ public: protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - ARMSubtarget Subtarget; bool isLittle; mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap; @@ -47,8 +46,8 @@ public: CodeGenOpt::Level OL, bool isLittle); ~ARMBaseTargetMachine() override; - const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } /// \brief Get the TargetIRAnalysis for this target. diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index a5b27abeb27f..88bab64ffaf2 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -32,7 +32,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM); bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS; - genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); + // genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(isAAPCS_ABI); @@ -43,16 +43,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, AttributesSection = getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0); - - // Make code section unreadable when in execute-only mode - if (genExecuteOnly) { - unsigned Type = ELF::SHT_PROGBITS; - unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE; - // Since we cannot modify flags for an existing section, we create a new - // section with the right flags, and use 0 as the unique ID for - // execute-only text - TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U); - } } const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference( @@ -74,21 +64,27 @@ getDebugThreadLocalSymbol(const MCSymbol *Sym) const { getContext()); } -MCSection * -ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO, - SectionKind SK, const TargetMachine &TM) const { +static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK, + const TargetMachine &TM) { + if (const Function *F = dyn_cast<Function>(GO)) + if (TM.getSubtarget<ARMSubtarget>(*F).genExecuteOnly() && SK.isText()) + return true; + return false; +} + +MCSection *ARMElfTargetObjectFile::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { // Set execute-only access for the explicit section - if (genExecuteOnly && SK.isText()) + if (isExecuteOnlyFunction(GO, SK, TM)) SK = SectionKind::getExecuteOnly(); return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM); } -MCSection * -ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO, - SectionKind SK, const TargetMachine &TM) const { +MCSection *ARMElfTargetObjectFile::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { // Place the global in the execute-only text section - if (genExecuteOnly 
&& SK.isText()) + if (isExecuteOnlyFunction(GO, SK, TM)) SK = SectionKind::getExecuteOnly(); return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM); diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h index dbb8128269dc..bd7aa1cfe02b 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.h +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -16,8 +16,6 @@ namespace llvm { class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { - mutable bool genExecuteOnly = false; - protected: const MCSection *AttributesSection = nullptr; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 585726208a8d..5ab236b7fd4c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -486,7 +486,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - Size = 0; + Size = 4; return MCDisassembler::Fail; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 81760f03940a..22de728fe06e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -738,13 +738,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, } } -void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) { +bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + if ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); // If the symbol is external the linker will handle it. @@ -753,7 +753,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // If the symbol is out of range, produce a relocation and hope the // linker can handle it. GNU AS produces an error in this case. if (Sym->isExternal()) - IsResolved = false; + return true; } // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. @@ -761,12 +761,12 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType(); if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) { if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch)) - IsResolved = false; + return true; if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br || FixupKind == ARM::fixup_arm_thumb_bl || FixupKind == ARM::fixup_t2_condbranch || FixupKind == ARM::fixup_t2_uncondbranch)) - IsResolved = false; + return true; } } // We must always generate a relocation for BL/BLX instructions if we have @@ -776,7 +776,8 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, FixupKind == ARM::fixup_arm_blx || FixupKind == ARM::fixup_arm_uncondbl || FixupKind == ARM::fixup_arm_condbl)) - IsResolved = false; + return true; + return false; } /// getFixupKindNumBytes - The number of bytes the fixup may change. 
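The ARMAsmBackend.cpp hunk above (and the AVR hunk further down) replaces the processFixupValue hook, which worked by clearing a mutable IsResolved flag, with shouldForceRelocation: the backend now only answers whether a given fixup must always be emitted as a relocation, and the decision to clear the resolved flag stays with the generic MC layer. A rough sketch of the new contract (illustrative only; MyAsmBackend and the fixup kind are made-up names, not part of this commit):

bool MyAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
                                         const MCFixup &Fixup,
                                         const MCValue &Target) {
  // Return true for fixup kinds that must be left to the linker even when
  // the assembler could resolve them locally; return false to let the
  // generic layer decide from the symbol and layout as usual.
  switch ((unsigned)Fixup.getKind()) {
  default:
    return false;
  case MyTarget::fixup_call:   // hypothetical always-relocated fixup kind
    return true;
  }
}

Unlike the old hook, there is no Value or Layout parameter, so any adjustment of the fixup value now has to happen in adjustFixupValue/applyFixup, which is exactly the move the AVR hunk below makes.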
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 6a0ba2ed41c1..84b54bbb9a49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -38,10 +38,8 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - /// processFixupValue - Target hook to process the literal value of a fixup - /// if necessary. - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t Value, bool IsPCRel, diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h index 9f6c5d7bf920..831589ba0581 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h +++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -15,55 +15,47 @@ namespace llvm { namespace ARM { enum Fixups { - // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol - // addresses + // 12-bit PC relative relocation for symbol addresses fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind, - // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with - // the 16-bit halfwords reordered. + // Equivalent to fixup_arm_ldst_pcrel_12, with the 16-bit halfwords reordered. fixup_t2_ldst_pcrel_12, - // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol - // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded. + // 10-bit PC relative relocation for symbol addresses used in + // LDRD/LDRH/LDRB/etc. instructions. All bits are encoded. fixup_arm_pcrel_10_unscaled, - // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses - // used in VFP instructions where the lower 2 bits are not encoded - // (so it's encoded as an 8-bit immediate). + // 10-bit PC relative relocation for symbol addresses used in VFP instructions + // where the lower 2 bits are not encoded (so it's encoded as an 8-bit + // immediate). fixup_arm_pcrel_10, - // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for - // the short-swapped encoding of Thumb2 instructions. + // Equivalent to fixup_arm_pcrel_10, accounting for the short-swapped encoding + // of Thumb2 instructions. fixup_t2_pcrel_10, - // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses - // used in VFP instructions where bit 0 not encoded (so it's encoded as an - // 8-bit immediate). + // 9-bit PC relative relocation for symbol addresses used in VFP instructions + // where bit 0 not encoded (so it's encoded as an 8-bit immediate). fixup_arm_pcrel_9, - // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for - // the short-swapped encoding of Thumb2 instructions. + // Equivalent to fixup_arm_pcrel_9, accounting for the short-swapped encoding + // of Thumb2 instructions. fixup_t2_pcrel_9, - // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol - // addresses where the lower 2 bits are not encoded (so it's encoded as an - // 8-bit immediate). + // 10-bit PC relative relocation for symbol addresses where the lower 2 bits + // are not encoded (so it's encoded as an 8-bit immediate). fixup_thumb_adr_pcrel_10, - // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR - // instruction. 
+ // 12-bit PC relative relocation for the ADR instruction. fixup_arm_adr_pcrel_12, - // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR - // instruction. + // 12-bit PC relative relocation for the ADR instruction. fixup_t2_adr_pcrel_12, - // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch - // instructions. + // 24-bit PC relative relocation for conditional branch instructions. fixup_arm_condbranch, - // fixup_arm_uncondbranch - 24-bit PC relative relocation for - // branch instructions. (unconditional) + // 24-bit PC relative relocation for branch instructions. (unconditional) fixup_arm_uncondbranch, - // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct - // uconditional branch instructions. + // 20-bit PC relative relocation for Thumb2 direct uconditional branch + // instructions. fixup_t2_condbranch, - // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct - // branch unconditional branch instructions. + // 20-bit PC relative relocation for Thumb2 direct branch unconditional branch + // instructions. fixup_t2_uncondbranch, - // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. + // 12-bit fixup for Thumb B instructions. fixup_arm_thumb_br, // The following fixups handle the ARM BL instructions. These can be @@ -75,42 +67,41 @@ enum Fixups { // MachO does not draw a distinction between the two cases, so it will treat // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups. - // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions. + // Fixup for unconditional ARM BL instructions. fixup_arm_uncondbl, - // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial - // conditionalisation. + // Fixup for ARM BL instructions with nontrivial conditionalisation. fixup_arm_condbl, - // fixup_arm_blx - Fixup for ARM BLX instructions. + // Fixup for ARM BLX instructions. fixup_arm_blx, - // fixup_arm_thumb_bl - Fixup for Thumb BL instructions. + // Fixup for Thumb BL instructions. fixup_arm_thumb_bl, - // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. + // Fixup for Thumb BLX instructions. fixup_arm_thumb_blx, - // fixup_arm_thumb_cb - Fixup for Thumb branch instructions. + // Fixup for Thumb branch instructions. fixup_arm_thumb_cb, - // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs. + // Fixup for Thumb load/store from constant pool instrs. fixup_arm_thumb_cp, - // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions. + // Fixup for Thumb conditional branching instructions. 
fixup_arm_thumb_bcc, // The next two are for the movt/movw pair // the 16bit imm field are split into imm{15-12} and imm{11-0} fixup_arm_movt_hi16, // :upper16: fixup_arm_movw_lo16, // :lower16: - fixup_t2_movt_hi16, // :upper16: - fixup_t2_movw_lo16, // :lower16: + fixup_t2_movt_hi16, // :upper16: + fixup_t2_movw_lo16, // :lower16: - // fixup_arm_mod_imm - Fixup for mod_imm + // Fixup for mod_imm fixup_arm_mod_imm, - // fixup_t2_so_imm - Fixup for Thumb2 8-bit rotated operand + // Fixup for Thumb2 8-bit rotated operand fixup_t2_so_imm, // Marker @@ -118,6 +109,6 @@ enum Fixups { NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; } -} +} // namespace llvm #endif diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 5c3b45ac2328..d18298385adf 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -230,13 +230,25 @@ void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, namespace llvm { // Prepare value for the target space for it -void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value, +void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, + const MCValue &Target, + uint64_t &Value, MCContext *Ctx) const { // The size of the fixup in bits. uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize; unsigned Kind = Fixup.getKind(); + // Parsed LLVM-generated temporary labels are already + // adjusted for instruction size, but normal labels aren't. + // + // To handle both cases, we simply un-adjust the temporary label + // case so it acts like all other labels. + if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (A->getSymbol().isTemporary()) + Value += 2; + } + switch (Kind) { default: llvm_unreachable("unhandled fixup"); @@ -333,9 +345,10 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { MCELFObjectTargetWriter::getOSABI(OSType)); } -void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { + adjustFixupValue(Fixup, Target, Value, &Asm.getContext()); if (Value == 0) return; // Doesn't change encoding. @@ -349,7 +362,7 @@ void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. @@ -436,30 +449,16 @@ bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void AVRAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { switch ((unsigned) Fixup.getKind()) { + default: return false; // Fixups which should always be recorded as relocations. 
case AVR::fixup_7_pcrel: case AVR::fixup_13_pcrel: case AVR::fixup_call: - IsResolved = false; - break; - default: - // Parsed LLVM-generated temporary labels are already - // adjusted for instruction size, but normal labels aren't. - // - // To handle both cases, we simply un-adjust the temporary label - // case so it acts like all other labels. - if (Target.getSymA()->getSymbol().isTemporary()) - Value += 2; - - adjustFixupValue(Fixup, Value, &Asm.getContext()); - break; + return true; } } diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index f2be2494684a..4a75e3b0d22d 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -35,13 +35,14 @@ public: AVRAsmBackend(Triple::OSType OSType) : MCAsmBackend(), OSType(OSType) {} - void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) const; + void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, + uint64_t &Value, MCContext *Ctx = nullptr) const; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; @@ -63,10 +64,8 @@ public: bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; private: Triple::OSType OSType; diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index c6ddd6bdad5e..f48429ee57b0 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "BPFRegisterInfo.h" #include "BPFSubtarget.h" #include "BPFTargetMachine.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -57,6 +58,11 @@ private: bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset); + // Node preprocessing cases + void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I); + void PreprocessCopyToReg(SDNode *Node); + void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I); + // Find constants from a constant structure typedef std::vector<unsigned char> val_vec_type; bool fillGenericConstant(const DataLayout &DL, const Constant *CV, @@ -69,9 +75,12 @@ private: val_vec_type &Vals, int Offset); bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset, uint64_t Size, unsigned char *ByteSeq); + bool checkLoadDef(unsigned DefReg, unsigned match_load_op); // Mapping from ConstantStruct global value to corresponding byte-list values std::map<const void *, val_vec_type> cs_vals_; + // Mapping from vreg to load memory opcode + std::map<unsigned, unsigned> load_to_vreg_; }; } // namespace @@ -203,89 +212,110 @@ void BPFDAGToDAGISel::Select(SDNode *Node) { SelectCode(Node); } +void 
BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, + SelectionDAG::allnodes_iterator I) { + union { + uint8_t c[8]; + uint16_t s; + uint32_t i; + uint64_t d; + } new_val; // hold up the constant values replacing loads. + bool to_replace = false; + SDLoc DL(Node); + const LoadSDNode *LD = cast<LoadSDNode>(Node); + uint64_t size = LD->getMemOperand()->getSize(); + + if (!size || size > 8 || (size & (size - 1))) + return; + + SDNode *LDAddrNode = LD->getOperand(1).getNode(); + // Match LDAddr against either global_addr or (global_addr + offset) + unsigned opcode = LDAddrNode->getOpcode(); + if (opcode == ISD::ADD) { + SDValue OP1 = LDAddrNode->getOperand(0); + SDValue OP2 = LDAddrNode->getOperand(1); + + // We want to find the pattern global_addr + offset + SDNode *OP1N = OP1.getNode(); + if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0) + return; + + DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); + + const GlobalAddressSDNode *GADN = + dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode()); + const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); + if (GADN && CDN) + to_replace = + getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); + } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && + LDAddrNode->getNumOperands() > 0) { + DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); + + SDValue OP1 = LDAddrNode->getOperand(0); + if (const GlobalAddressSDNode *GADN = + dyn_cast<GlobalAddressSDNode>(OP1.getNode())) + to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); + } + + if (!to_replace) + return; + + // replacing the old with a new value + uint64_t val; + if (size == 1) + val = new_val.c[0]; + else if (size == 2) + val = new_val.s; + else if (size == 4) + val = new_val.i; + else { + val = new_val.d; + } + + DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val + << '\n'); + SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64); + + // After replacement, the current node is dead, we need to + // go backward one step to make iterator still work + I--; + SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)}; + SDValue To[] = {NVal, NVal}; + CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2); + I++; + // It is safe to delete node now + CurDAG->DeleteNode(Node); +} + void BPFDAGToDAGISel::PreprocessISelDAG() { - // Iterate through all nodes, only interested in loads from ConstantStruct - // ConstantArray should have converted by IR->DAG processing + // Iterate through all nodes, interested in the following cases: + // + // . loads from ConstantStruct or ConstantArray of constructs + // which can be turns into constant itself, with this we can + // avoid reading from read-only section at runtime. + // + // . reg truncating is often the result of 8/16/32bit->64bit or + // 8/16bit->32bit conversion. If the reg value is loaded with + // masked byte width, the AND operation can be removed since + // BPF LOAD already has zero extension. + // + // This also solved a correctness issue. + // In BPF socket-related program, e.g., __sk_buff->{data, data_end} + // are 32-bit registers, but later on, kernel verifier will rewrite + // it with 64-bit value. Therefore, truncating the value after the + // load will result in incorrect code. 
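Before the node loop that follows, here is a rough sketch of the socket-filter idiom the comment above refers to; the __sk_buff declaration is abbreviated for illustration and is not taken from this commit:

struct __sk_buff { unsigned int data; unsigned int data_end; };  // trimmed

int filter(struct __sk_buff *skb)
{
  // Both fields are 32-bit in the program's view, so widening them to
  // pointers introduces a zero-extension. The kernel verifier rewrites the
  // loads to yield full 64-bit pointers, so an AND that masks off the upper
  // half (the truncation removed by PreprocessTrunc) would corrupt the
  // pointer; and since BPF loads already zero-extend, the AND is redundant
  // in the ordinary case as well.
  void *data = (void *)(long)skb->data;
  void *data_end = (void *)(long)skb->data_end;

  if ((char *)data + 14 > (char *)data_end)   // packet bounds check
    return 0;
  return 1;
}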
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) { SDNode *Node = &*I++; unsigned Opcode = Node->getOpcode(); - if (Opcode != ISD::LOAD) - continue; - - union { - uint8_t c[8]; - uint16_t s; - uint32_t i; - uint64_t d; - } new_val; // hold up the constant values replacing loads. - bool to_replace = false; - SDLoc DL(Node); - const LoadSDNode *LD = cast<LoadSDNode>(Node); - uint64_t size = LD->getMemOperand()->getSize(); - if (!size || size > 8 || (size & (size - 1))) - continue; - - SDNode *LDAddrNode = LD->getOperand(1).getNode(); - // Match LDAddr against either global_addr or (global_addr + offset) - unsigned opcode = LDAddrNode->getOpcode(); - if (opcode == ISD::ADD) { - SDValue OP1 = LDAddrNode->getOperand(0); - SDValue OP2 = LDAddrNode->getOperand(1); - - // We want to find the pattern global_addr + offset - SDNode *OP1N = OP1.getNode(); - if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || - OP1N->getNumOperands() == 0) - continue; - - DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); - - const GlobalAddressSDNode *GADN = - dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode()); - const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); - if (GADN && CDN) - to_replace = - getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); - } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && - LDAddrNode->getNumOperands() > 0) { - DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); - - SDValue OP1 = LDAddrNode->getOperand(0); - if (const GlobalAddressSDNode *GADN = - dyn_cast<GlobalAddressSDNode>(OP1.getNode())) - to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); - } - - if (!to_replace) - continue; - - // replacing the old with a new value - uint64_t val; - if (size == 1) - val = new_val.c[0]; - else if (size == 2) - val = new_val.s; - else if (size == 4) - val = new_val.i; - else { - val = new_val.d; - } - - DEBUG(dbgs() << "Replacing load of size " << size << " with constant " - << val << '\n'); - SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64); - - // After replacement, the current node is dead, we need to - // go backward one step to make iterator still work - I--; - SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)}; - SDValue To[] = {NVal, NVal}; - CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2); - I++; - // It is safe to delete node now - CurDAG->DeleteNode(Node); + if (Opcode == ISD::LOAD) + PreprocessLoad(Node, I); + else if (Opcode == ISD::CopyToReg) + PreprocessCopyToReg(Node); + else if (Opcode == ISD::AND) + PreprocessTrunc(Node, I); } } @@ -415,6 +445,134 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL, return true; } +void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) { + const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1)); + if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) + return; + + const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2)); + if (!LD) + return; + + // Assign a load value to a virtual register. 
record its load width + unsigned mem_load_op = 0; + switch (LD->getMemOperand()->getSize()) { + default: + return; + case 4: + mem_load_op = BPF::LDW; + break; + case 2: + mem_load_op = BPF::LDH; + break; + case 1: + mem_load_op = BPF::LDB; + break; + } + + DEBUG(dbgs() << "Find Load Value to VReg " + << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n'); + load_to_vreg_[RegN->getReg()] = mem_load_op; +} + +void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, + SelectionDAG::allnodes_iterator I) { + ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + if (!MaskN) + return; + + unsigned match_load_op = 0; + switch (MaskN->getZExtValue()) { + default: + return; + case 0xFFFFFFFF: + match_load_op = BPF::LDW; + break; + case 0xFFFF: + match_load_op = BPF::LDH; + break; + case 0xFF: + match_load_op = BPF::LDB; + break; + } + + // The Reg operand should be a virtual register, which is defined + // outside the current basic block. DAG combiner has done a pretty + // good job in removing truncating inside a single basic block. + SDValue BaseV = Node->getOperand(0); + if (BaseV.getOpcode() != ISD::CopyFromReg) + return; + + const RegisterSDNode *RegN = + dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1)); + if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) + return; + unsigned AndOpReg = RegN->getReg(); + DEBUG(dbgs() << "Examine %vreg" << TargetRegisterInfo::virtReg2Index(AndOpReg) + << '\n'); + + // Examine the PHI insns in the MachineBasicBlock to found out the + // definitions of this virtual register. At this stage (DAG2DAG + // transformation), only PHI machine insns are available in the machine basic + // block. + MachineBasicBlock *MBB = FuncInfo->MBB; + MachineInstr *MII = nullptr; + for (auto &MI : *MBB) { + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand &MOP = MI.getOperand(i); + if (!MOP.isReg() || !MOP.isDef()) + continue; + unsigned Reg = MOP.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) { + MII = &MI; + break; + } + } + } + + if (MII == nullptr) { + // No phi definition in this block. + if (!checkLoadDef(AndOpReg, match_load_op)) + return; + } else { + // The PHI node looks like: + // %vreg2<def> = PHI %vreg0, <BB#1>, %vreg1, <BB#3> + // Trace each incoming definition, e.g., (%vreg0, BB#1) and (%vreg1, BB#3) + // The AND operation can be removed if both %vreg0 in BB#1 and %vreg1 in + // BB#3 are defined with with a load matching the MaskN. + DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n'); + unsigned PrevReg = -1; + for (unsigned i = 0; i < MII->getNumOperands(); ++i) { + const MachineOperand &MOP = MII->getOperand(i); + if (MOP.isReg()) { + if (MOP.isDef()) + continue; + PrevReg = MOP.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(PrevReg)) + return; + if (!checkLoadDef(PrevReg, match_load_op)) + return; + } + } + } + + DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump(); + dbgs() << '\n'); + + I--; + CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); + I++; + CurDAG->DeleteNode(Node); +} + +bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) { + auto it = load_to_vreg_.find(DefReg); + if (it == load_to_vreg_.end()) + return false; // The definition of register is not exported yet. 
+ + return it->second == match_load_op; +} + FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) { return new BPFDAGToDAGISel(TM); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 2b0ceaa66258..97a53dcbaed7 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -178,8 +178,8 @@ static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long", cl::Hidden, cl::desc("Enable long calls for save-restore stubs."), cl::init(false), cl::ZeroOrMore); -static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true), - cl::Hidden, cl::desc("Use allocframe more conservatively")); +static cl::opt<bool> EliminateFramePointer("hexagon-fp-elim", cl::init(true), + cl::Hidden, cl::desc("Refrain from using FP whenever possible")); static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden, cl::init(true), cl::desc("Optimize spill slots")); @@ -550,7 +550,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); - DebugLoc dl; unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment()); @@ -584,77 +583,56 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, MI->eraseFromParent(); } - if (!hasFP(MF)) - return; - - // Check for overflow. - // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used? - const unsigned int ALLOCFRAME_MAX = 16384; + DebugLoc dl = MBB.findDebugLoc(InsertPt); - // Create a dummy memory operand to avoid allocframe from being treated as - // a volatile memory reference. - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, - 4, 4); - - if (NumBytes >= ALLOCFRAME_MAX) { - // Emit allocframe(#0). - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) - .addImm(0) - .addMemOperand(MMO); - - // Subtract offset from frame pointer. - // We use a caller-saved non-parameter register for that. - unsigned CallerSavedReg = HRI.getFirstCallerSavedNonParamReg(); - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32), - CallerSavedReg).addImm(NumBytes); - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP) + if (hasFP(MF)) { + insertAllocframe(MBB, InsertPt, NumBytes); + if (AlignStack) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP) + .addReg(SP) + .addImm(-int64_t(MaxAlign)); + } + // If the stack-checking is enabled, and we spilled the callee-saved + // registers inline (i.e. did not use a spill function), then call + // the stack checker directly. + if (EnableStackOVFSanitizer && !PrologueStubs) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk)) + .addExternalSymbol("__runtime_stack_check"); + } else if (NumBytes > 0) { + assert(alignTo(NumBytes, 8) == NumBytes); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) .addReg(SP) - .addReg(CallerSavedReg); - } else { - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) - .addImm(NumBytes) - .addMemOperand(MMO); + .addImm(-int(NumBytes)); } - - if (AlignStack) { - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP) - .addReg(SP) - .addImm(-int64_t(MaxAlign)); - } - - // If the stack-checking is enabled, and we spilled the callee-saved - // registers inline (i.e. did not use a spill function), then call - // the stack checker directly. 
- if (EnableStackOVFSanitizer && !PrologueStubs) - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk)) - .addExternalSymbol("__runtime_stack_check"); } void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); - if (!hasFP(MF)) - return; - auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); unsigned SP = HRI.getStackRegister(); + MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator(); + DebugLoc dl = MBB.findDebugLoc(InsertPt); + + if (!hasFP(MF)) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (unsigned NumBytes = MFI.getStackSize()) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(NumBytes); + } + return; + } + MachineInstr *RetI = getReturn(MBB); unsigned RetOpc = RetI ? RetI->getOpcode() : 0; - MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator(); - DebugLoc DL; - if (InsertPt != MBB.end()) - DL = InsertPt->getDebugLoc(); - else if (!MBB.empty()) - DL = std::prev(MBB.end())->getDebugLoc(); - // Handle EH_RETURN. if (RetOpc == Hexagon::EH_RETURN_JMPR) { - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe)); - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_add), SP) .addReg(SP) .addReg(Hexagon::R28); return; @@ -699,16 +677,52 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { // otherwise just add deallocframe. The function could be returning via a // tail call. if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) { - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe)); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)); return; } unsigned NewOpc = Hexagon::L4_return; - MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc)); + MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc)); // Transfer the function live-out registers. NewI->copyImplicitOps(MF, *RetI); MBB.erase(RetI); } +void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const { + MachineFunction &MF = *MBB.getParent(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HII = *HST.getInstrInfo(); + auto &HRI = *HST.getRegisterInfo(); + + // Check for overflow. + // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used? + const unsigned int ALLOCFRAME_MAX = 16384; + + // Create a dummy memory operand to avoid allocframe from being treated as + // a volatile memory reference. + auto *MMO = MF.getMachineMemOperand(MachinePointerInfo::getStack(MF, 0), + MachineMemOperand::MOStore, 4, 4); + + DebugLoc dl = MBB.findDebugLoc(InsertPt); + + if (NumBytes >= ALLOCFRAME_MAX) { + // Emit allocframe(#0). + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) + .addImm(0) + .addMemOperand(MMO); + + // Subtract the size from the stack pointer. 
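The new insertAllocframe helper keeps the old overflow handling: allocframe's immediate cannot encode frames of 16384 bytes or more, so large frames are allocated as allocframe(#0) followed by an explicit stack-pointer adjustment. A small standalone sketch of that decision; the printed mnemonics are illustrative only:

#include <cstdio>

// Sketch of the size check in insertAllocframe above.
constexpr unsigned AllocframeMax = 16384; // hardcoded limit from the patch

void emitAllocframe(unsigned NumBytes) {
  if (NumBytes >= AllocframeMax) {
    std::printf("allocframe(#0)\n");
    std::printf("r29 = add(r29, #-%u)\n", NumBytes); // explicit SP adjustment
  } else {
    std::printf("allocframe(#%u)\n", NumBytes);
  }
}

int main() {
  emitAllocframe(256);   // small frame: single allocframe
  emitAllocframe(20000); // large frame: allocframe(#0) + SP add
}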
+ unsigned SP = HRI.getStackRegister(); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(-int(NumBytes)); + } else { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) + .addImm(NumBytes) + .addMemOperand(MMO); + } +} + void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF, MachineBasicBlock &SaveB) const { SetVector<unsigned> Worklist; @@ -928,12 +942,11 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, } bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + return false; + auto &MFI = MF.getFrameInfo(); auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - - bool HasFixed = MFI.getNumFixedObjects(); - bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI) - .getLocalFrameObjectCount(); bool HasExtraAlign = HRI.needsStackRealignment(MF); bool HasAlloca = MFI.hasVarSizedObjects(); @@ -947,18 +960,35 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { // By default we want to use SP (since it's always there). FP requires // some setup (i.e. ALLOCFRAME). - // Fixed and preallocated objects need FP if the distance from them to - // the SP is unknown (as is with alloca or aligna). - if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign)) + // Both, alloca and stack alignment modify the stack pointer by an + // undetermined value, so we need to save it at the entry to the function + // (i.e. use allocframe). + if (HasAlloca || HasExtraAlign) return true; if (MFI.getStackSize() > 0) { - if (EnableStackOVFSanitizer || UseAllocframe) + // If FP-elimination is disabled, we have to use FP at this point. + const TargetMachine &TM = MF.getTarget(); + if (TM.Options.DisableFramePointerElim(MF) || !EliminateFramePointer) + return true; + if (EnableStackOVFSanitizer) return true; } - if (MFI.hasCalls() || - MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR()) + const auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + if (MFI.hasCalls() || HMFI.hasClobberLR()) + return true; + + // Frame pointer elimination is a possiblility at this point, but + // to know if FP is necessary we need to know if spill/restore + // functions will be used (they require FP to be valid). + // This means that hasFP shouldn't really be called before CSI is + // calculated, and some measures are taken to make sure of that + // (e.g. default implementations of virtual functions that call it + // are overridden apropriately). + assert(MFI.isCalleeSavedInfoValid() && "Need to know CSI"); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (useSpillFunction(MF, CSI) || useRestoreFunction(MF, CSI)) return true; return false; @@ -1051,9 +1081,10 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, bool HasExtraAlign = HRI.needsStackRealignment(MF); bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; - unsigned FrameSize = MFI.getStackSize(); - unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + unsigned FrameSize = MFI.getStackSize(); + unsigned SP = HRI.getStackRegister(); + unsigned FP = HRI.getFrameRegister(); unsigned AP = HMFI.getStackAlignBasePhysReg(); // It may happen that AP will be absent even HasAlloca && HasExtraAlign // is true. 
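Condensed model of the frame-pointer policy that the reworked hasFP above implements, with plain booleans standing in for the MachineFunction and MachineFrameInfo queries; this is a sketch for orientation, not the real interface:

// Returns true when the Hexagon prologue must set up a frame pointer
// (i.e. emit allocframe) under the policy introduced in this patch.
bool needsFramePointer(bool IsNaked, bool HasAlloca, bool NeedsRealign,
                       unsigned StackSize, bool FPElimDisabled,
                       bool StackCheck, bool HasCallsOrClobbersLR,
                       bool UsesSpillOrRestoreFunctions) {
  if (IsNaked)
    return false;                     // naked functions get no frame setup
  if (HasAlloca || NeedsRealign)
    return true;                      // SP moves by an unknown amount
  if (StackSize > 0 && (FPElimDisabled || StackCheck))
    return true;                      // FP elimination disabled, or stack checker
  if (HasCallsOrClobbersLR)
    return true;                      // LR must be saved via allocframe
  return UsesSpillOrRestoreFunctions; // spill/restore routines require a valid FP
}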
HasExtraAlign may be set because of vector spills, without @@ -1135,7 +1166,7 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, // there will be no SP -= FrameSize), so the frame size should not be // added to the calculated offset. int RealOffset = Offset; - if (!UseFP && !UseAP && HasFP) + if (!UseFP && !UseAP) RealOffset = FrameSize+Offset; return RealOffset; } @@ -2402,7 +2433,7 @@ void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, /// be generated via inline code. If this function returns "true", inline /// code will be generated. If this function returns "false", additional /// checks are performed, which may still lead to the inline code. -bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF, +bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const { if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn()) return true; @@ -2432,7 +2463,7 @@ bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF, return false; } -bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF, +bool HexagonFrameLowering::useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const { if (shouldInlineCSR(MF, CSI)) return false; @@ -2445,7 +2476,7 @@ bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF, return Threshold < NumCSI; } -bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF, +bool HexagonFrameLowering::useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const { if (shouldInlineCSR(MF, CSI)) return false; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 529a61d4a5b5..f4d4e1b61a26 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -48,6 +48,15 @@ public: return true; } + bool hasReservedCallFrame(const MachineFunction &MF) const override { + // We always reserve call frame as a part of the initial stack allocation. + return true; + } + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override { + // Override this function to avoid calling hasFP before CSI is set + // (the default implementation calls hasFP). 
+ return true; + } MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -94,6 +103,8 @@ private: unsigned SP, unsigned CF) const; void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const; void insertEpilogueInBlock(MachineBasicBlock &MBB) const; + void insertAllocframe(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const; bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, const HexagonRegisterInfo &HRI, bool &PrologueStubs) const; bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, @@ -148,9 +159,9 @@ private: void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI, bool IsDef, bool IsKill) const; - bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const; - bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const; - bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; + bool shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const; + bool useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const; + bool useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const; bool mayOverflowFrameOffset(MachineFunction &MF) const; }; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index afed894cfb9a..2daacf795555 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1002,51 +1002,46 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); MachineFunction &MF = DAG.getMachineFunction(); - auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>(); - switch (Node->getOpcode()) { - case ISD::INLINEASM: { - unsigned NumOps = Node->getNumOperands(); - if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue) - --NumOps; // Ignore the flag operand. - - for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - if (FuncInfo.hasClobberLR()) - break; - unsigned Flags = - cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue(); - unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); - ++i; // Skip the ID value. - - switch (InlineAsm::getKind(Flags)) { - default: llvm_unreachable("Bad flags!"); - case InlineAsm::Kind_RegDef: - case InlineAsm::Kind_RegUse: - case InlineAsm::Kind_Imm: - case InlineAsm::Kind_Clobber: - case InlineAsm::Kind_Mem: { - for (; NumVals; --NumVals, ++i) {} - break; - } - case InlineAsm::Kind_RegDefEarlyClobber: { - for (; NumVals; --NumVals, ++i) { - unsigned Reg = - cast<RegisterSDNode>(Node->getOperand(i))->getReg(); - - // Check it to be lr - const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo(); - if (Reg == QRI->getRARegister()) { - FuncInfo.setHasClobberLR(true); - break; - } - } - break; - } + auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); + unsigned LR = HRI.getRARegister(); + + if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR()) + return Op; + + unsigned NumOps = Op.getNumOperands(); + if (Op.getOperand(NumOps-1).getValueType() == MVT::Glue) + --NumOps; // Ignore the flag operand. 
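With hasReservedCallFrame returning true, the ADJCALLSTACKDOWN/ADJCALLSTACKUP markers carry no stack adjustment of their own. A minimal sketch of what eliminateCallFramePseudoInstr commonly reduces to in that situation; this assumes the usual LLVM CodeGen headers and a hypothetical MyFrameLowering class, and is not copied from the Hexagon sources:

MachineBasicBlock::iterator
MyFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  // The call frame was reserved as part of the initial stack allocation,
  // so the pseudo marker needs no code and is simply removed.
  return MBB.erase(I);
}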
+ + for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { + unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + ++i; // Skip the ID value. + + switch (InlineAsm::getKind(Flags)) { + default: + llvm_unreachable("Bad flags!"); + case InlineAsm::Kind_RegUse: + case InlineAsm::Kind_Imm: + case InlineAsm::Kind_Mem: + i += NumVals; + break; + case InlineAsm::Kind_Clobber: + case InlineAsm::Kind_RegDef: + case InlineAsm::Kind_RegDefEarlyClobber: { + for (; NumVals; --NumVals, ++i) { + unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg(); + if (Reg != LR) + continue; + HMFI.setHasClobberLR(true); + return Op; } + break; } } - } // Node->getOpcode + } + return Op; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index fec2dc5ce306..1eac2d3dd8e2 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1253,10 +1253,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); + unsigned PReg = Op1.getReg(); + assert(Op1.getSubReg() == 0); + unsigned PState = getRegState(Op1); + if (Op0.getReg() != Op2.getReg()) { + unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill + : PState; auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov)) .add(Op0) - .add(Op1) + .addReg(PReg, S) .add(Op2); if (IsDestLive) T.addReg(Op0.getReg(), RegState::Implicit); @@ -1265,7 +1271,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (Op0.getReg() != Op3.getReg()) { auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov)) .add(Op0) - .add(Op1) + .addReg(PReg, PState) .add(Op3); if (IsDestLive) T.addReg(Op0.getReg(), RegState::Implicit); @@ -1282,12 +1288,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); + unsigned PReg = Op1.getReg(); + assert(Op1.getSubReg() == 0); + unsigned PState = getRegState(Op1); if (Op0.getReg() != Op2.getReg()) { + unsigned S = Op0.getReg() != Op3.getReg() ? 
PState & ~RegState::Kill + : PState; unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine)) .add(Op0) + .addReg(PReg, S) .add(Op1) .addReg(SrcHi) .addReg(SrcLo); @@ -1300,7 +1312,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine)) .add(Op0) - .add(Op1) + .addReg(PReg, PState) .addReg(SrcHi) .addReg(SrcLo); if (IsDestLive) diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index de6b203015d8..e93f075f4ccd 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -69,9 +69,7 @@ namespace { public: static char ID; - HexagonNewValueJump() : MachineFunctionPass(ID) { - initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry()); - } + HexagonNewValueJump() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); @@ -445,8 +443,6 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { unsigned predReg = 0; // predicate reg of the jump. unsigned cmpReg1 = 0; int cmpOp2 = 0; - bool MO1IsKill = false; - bool MO2IsKill = false; MachineBasicBlock::iterator jmpPos; MachineBasicBlock::iterator cmpPos; MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr; @@ -548,14 +544,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // We need cmpReg1 and cmpOp2(imm or reg) while building // new value jump instruction. cmpReg1 = MI.getOperand(1).getReg(); - if (MI.getOperand(1).isKill()) - MO1IsKill = true; - if (isSecondOpReg) { + if (isSecondOpReg) cmpOp2 = MI.getOperand(2).getReg(); - if (MI.getOperand(2).isKill()) - MO2IsKill = true; - } else + else cmpOp2 = MI.getOperand(2).getImm(); continue; } @@ -605,11 +597,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) && (feederReg == (unsigned) cmpOp2)) { unsigned tmp = cmpReg1; - bool tmpIsKill = MO1IsKill; cmpReg1 = cmpOp2; - MO1IsKill = MO2IsKill; cmpOp2 = tmp; - MO2IsKill = tmpIsKill; } // Now we have swapped the operands, all we need to check is, @@ -623,31 +612,33 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // make sure we are respecting the kill values of // the operands of the feeder. 
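The rewritten LowerINLINEASM walks the INLINEASM operand list group by group: each group starts with a flag word giving its kind and register count, and only def, early-clobber def, and clobber groups can mark LR as clobbered. A plain-data model of that scan; the GroupKind values here are simplified stand-ins for the real InlineAsm flag-word encoding:

#include <vector>

enum GroupKind { Use, Imm, Mem, Def, EarlyClobberDef, Clobber };

struct Group {
  GroupKind Kind;
  std::vector<unsigned> Regs; // registers belonging to this operand group
};

// Returns true if any group that writes or clobbers registers mentions LR.
static bool asmClobbersLinkReg(const std::vector<Group> &Groups, unsigned LR) {
  for (const Group &G : Groups) {
    if (G.Kind != Def && G.Kind != EarlyClobberDef && G.Kind != Clobber)
      continue; // uses, immediates and memory operands cannot clobber LR
    for (unsigned Reg : G.Regs)
      if (Reg == LR)
        return true;
  }
  return false;
}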
- bool updatedIsKill = false; - for (unsigned i = 0; i < MI.getNumOperands(); i++) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isUse()) { - unsigned feederReg = MO.getReg(); - for (MachineBasicBlock::iterator localII = feederPos, - end = cmpInstr->getIterator(); localII != end; localII++) { - MachineInstr &localMI = *localII; - for (unsigned j = 0; j < localMI.getNumOperands(); j++) { - MachineOperand &localMO = localMI.getOperand(j); - if (localMO.isReg() && localMO.isUse() && - localMO.isKill() && feederReg == localMO.getReg()) { - // We found that there is kill of a use register - // Set up a kill flag on the register - localMO.setIsKill(false); - MO.setIsKill(); - updatedIsKill = true; - break; - } + auto TransferKills = [jmpPos,cmpPos] (MachineInstr &MI) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned UseR = MO.getReg(); + for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) { + if (I == cmpPos) + continue; + for (MachineOperand &Op : I->operands()) { + if (!Op.isReg() || !Op.isUse() || !Op.isKill()) + continue; + if (Op.getReg() != UseR) + continue; + // We found that there is kill of a use register + // Set up a kill flag on the register + Op.setIsKill(false); + MO.setIsKill(true); + return; } - if (updatedIsKill) break; } } - if (updatedIsKill) break; - } + }; + + TransferKills(*feederPos); + TransferKills(*cmpPos); + bool MO1IsKill = cmpPos->killsRegister(cmpReg1, QRI); + bool MO2IsKill = isSecondOpReg && cmpPos->killsRegister(cmpOp2, QRI); MBB->splice(jmpPos, MI.getParent(), MI); MBB->splice(jmpPos, MI.getParent(), cmpInstr); diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 27b40f134b1f..a331c978f59d 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -535,9 +535,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) { !MI->getOperand(1).isGlobal()) continue; - DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n"); - DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG) - << "\n"); + DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: " + << *MI << "\n\t[InstrNode]: " + << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n'); NodeList UNodeList; getAllRealUses(SA, UNodeList); @@ -605,7 +605,9 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { const TargetOperandInfo TOI(*HII); DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI); - G.build(); + // Need to keep dead phis because we can propagate uses of registers into + // nodes dominated by those would-be phis. 
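Both the predicate-state handling in expandPostRAPseudo and the TransferKills lambda above apply the same rule: when a single reader of a register becomes several readers, only the last one may keep the kill flag. A one-line helper expressing that rule, assuming RegState from llvm/CodeGen/MachineInstrBuilder.h:

// Strip the kill flag from a register-use state if another use of the same
// register follows later; only the final use may kill the register.
static unsigned stateForUse(unsigned OrigState, bool AnotherUseFollows) {
  return AnotherUseFollows ? (OrigState & ~RegState::Kill) : OrigState;
}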
+ G.build(BuildOptions::KeepDeadPhis); DFG = &G; Liveness L(MRI, *DFG); diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 031a1bdefafb..76d9b31b005f 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -113,6 +113,7 @@ namespace llvm { void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); + void initializeHexagonNewValueJumpPass(PassRegistry&); Pass *createHexagonLoopIdiomPass(); FunctionPass *createHexagonBitSimplify(); @@ -158,6 +159,7 @@ extern "C" void LLVMInitializeHexagonTarget() { initializeHexagonLoopIdiomRecognizePass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonOptAddrModePass(PR); + initializeHexagonNewValueJumpPass(PR); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 4dacb1501392..34df2ebcc520 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -49,6 +49,10 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement", cl::Hidden, cl::init(false), cl::desc("Trace global value placement")); +static cl::opt<bool> + EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon jump tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -256,6 +260,11 @@ unsigned HexagonTargetObjectFile::getSmallDataSize() const { return SmallDataThreshold; } +bool HexagonTargetObjectFile::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + return EmitJtInText; +} + /// Descends any type down to "elementary" components, /// discovering the smallest addressable one. /// If zero is returned, declaration will not be modified. diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 58dff2b95e19..373d850b53be 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -33,6 +33,9 @@ namespace llvm { unsigned getSmallDataSize() const; + bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, + const Function &F) const override; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index d578bfab3658..aac810e29fe9 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -21,6 +21,10 @@ using namespace llvm; #define DEBUG_TYPE "hexagontti" +static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", + cl::init(true), cl::Hidden, + cl::desc("Control lookup table emission on Hexagon target")); + TargetTransformInfo::PopcntSupportKind HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { // Return Fast Hardware support as every input < 64 bits will be promoted @@ -29,7 +33,7 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { } // The Hexagon target can unroll loops with run-time trip counts. 
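Both new Hexagon options are ordinary cl::opt flags, so they can be toggled from the llc command line without rebuilding; as an illustration (the flag names come from this patch, the rest of the invocation is only an example):

  llc -march=hexagon -hexagon-emit-jt-text foo.ll -o foo.s
  llc -march=hexagon -hexagon-emit-lookup-tables=false foo.ll -o foo.s

The first keeps switch jump tables in the function's text section (the default stays off); the second suppresses switch lookup-table formation for Hexagon (the default stays on).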
-void HexagonTTIImpl::getUnrollingPreferences(Loop *L, +void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Runtime = UP.Partial = true; } @@ -46,8 +50,9 @@ unsigned HexagonTTIImpl::getCacheLineSize() const { return getST()->getL1CacheLineSize(); } -int HexagonTTIImpl::getUserCost(const User *U) { - auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool { +int HexagonTTIImpl::getUserCost(const User *U, + ArrayRef<const Value *> Operands) { + auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool { if (!CI->isIntegerCast()) return false; const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); @@ -67,5 +72,9 @@ int HexagonTTIImpl::getUserCost(const User *U) { if (const CastInst *CI = dyn_cast<const CastInst>(U)) if (isCastFoldedIntoLoad(CI)) return TargetTransformInfo::TCC_Free; - return BaseT::getUserCost(U); + return BaseT::getUserCost(U, Operands); +} + +bool HexagonTTIImpl::shouldBuildLookupTables() const { + return EmitLookupTables; } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 8414bfc4e197..ab5a6e07d873 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -46,7 +46,8 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; // The Hexagon target can unroll loops with run-time trip counts. - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); // L1 cache prefetch. unsigned getPrefetchDistance() const; @@ -61,7 +62,10 @@ public: /// @} - int getUserCost(const User *U); + int getUserCost(const User *U, ArrayRef<const Value *> Operands); + + // Hexagon specific decision to generate a lookup table. + bool shouldBuildLookupTables() const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 093ce80bc2e3..34d0b55aa22a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -199,11 +199,8 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - /// processFixupValue - Target hook to adjust the literal value of a fixup - /// if necessary. IsResolved signals whether the caller believes a relocation - /// is needed; the target can modify the value. The default does nothing. - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { MCFixupKind Kind = Fixup.getKind(); switch((unsigned)Kind) { @@ -299,8 +296,7 @@ public: case fixup_Hexagon_LD_PLT_B22_PCREL_X: case fixup_Hexagon_LD_PLT_B32_PCREL_X: // These relocations should always have a relocation recorded - IsResolved = false; - return; + return true; case fixup_Hexagon_B22_PCREL: //IsResolved = false; @@ -317,7 +313,7 @@ public: case fixup_Hexagon_B7_PCREL: case fixup_Hexagon_B7_PCREL_X: if (DisableFixup) - IsResolved = false; + return true; break; case FK_Data_1: @@ -326,8 +322,9 @@ public: case FK_PCRel_4: case fixup_Hexagon_32: // Leave these relocations alone as they are used for EH. - return; + return false; } + return false; } /// getFixupKindNumBytes - The number of bytes the fixup may change. 
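shouldForceRelocation replaces the old processFixupValue hook: instead of clearing IsResolved, the backend returns true when a relocation must be recorded even though the assembler could resolve the fixup. A sketch of an override on a made-up backend; MyAsmBackend and MyTarget::fixup_plt_call are hypothetical names:

bool MyAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
                                         const MCFixup &Fixup,
                                         const MCValue &Target) {
  switch ((unsigned)Fixup.getKind()) {
  case MyTarget::fixup_plt_call: // PLT/GOT-style fixups always need a reloc
    return true;
  default:
    return false;                // let the assembler resolve it if it can
  }
}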
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 9d5c179a0fd9..69b1ba1528d0 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2789,6 +2789,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { + // FIXME: These expansions do not respect -mxgot. MipsTargetStreamer &TOut = getTargetStreamer(); bool UseSrcReg = SrcReg != Mips::NoRegister; warnIfNoMacro(IDLoc); @@ -2808,8 +2809,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // symbol in the final relocation is external and not modified with a // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() && - !Res.getSymA()->getSymbol().isTemporary()) { + Res.getConstant() == 0 && + !(Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(), @@ -2865,6 +2870,85 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, return false; } + if (inPicMode() && ABI.ArePtrs64bit()) { + MCValue Res; + if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { + Error(IDLoc, "expected relocatable expression"); + return true; + } + if (Res.getSymB() != nullptr) { + Error(IDLoc, "expected relocatable expression with only one symbol"); + return true; + } + + // The case where the result register is $25 is somewhat special. If the + // symbol in the final relocation is external and not modified with a + // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP. + if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && + Res.getConstant() == 0 && + !(Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL))) { + const MCExpr *CallExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); + TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(), + MCOperand::createExpr(CallExpr), IDLoc, STI); + return false; + } + + // The remaining cases are: + // Small offset: ld $tmp, %got_disp(symbol)($gp) + // >daddiu $tmp, $tmp, offset + // >daddu $rd, $tmp, $rs + // The daddiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, + Res.getSymA(), + getContext()); + const MCExpr *LoExpr = nullptr; + if (Res.getConstant() != 0) { + // Symbols fully resolve with just the %got_disp(symbol) but we + // must still account for any offset to the symbol for + // expressions like symbol+8. + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); + + // FIXME: Offsets greater than 16 bits are not yet implemented. + // FIXME: The correct range is a 32-bit sign-extended number. 
+ if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { + Error(IDLoc, "macro instruction uses large offset, which is not " + "currently supported"); + return true; + } + } + + unsigned TmpReg = DstReg; + if (UseSrcReg && + getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, + SrcReg)) { + // If $rs is the same as $rd, we need to use AT. + // If it is not available we exit. + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + TmpReg = ATReg; + } + + TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(), + MCOperand::createExpr(GotExpr), IDLoc, STI); + + if (LoExpr) + TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), + IDLoc, STI); + + if (UseSrcReg) + TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); + + return false; + } + const MipsMCExpr *HiExpr = MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext()); const MipsMCExpr *LoExpr = diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td index 6b7f39e9dd79..38b09d105ddd 100644 --- a/lib/Target/Mips/MicroMips64r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -548,3 +548,15 @@ def : MipsInstAlias<"dnegu $rt, $rs", def : MipsInstAlias<"dnegu $rt", (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsll $rd, $rt, $rs", + (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt, + GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsrl $rd, $rt, $rs", + (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt, + GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsrl $rd, $rt", + (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd, + GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsll $rd, $rt", + (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd, + GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 99025fe1341d..3dba7ce30cad 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -748,9 +748,6 @@ let AdditionalPredicates = [NotInMicroMips] in { defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>, GPR_64; } -def : MipsInstAlias<"dsll $rd, $rt, $rs", - (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, - ISA_MIPS3; let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"dneg $rt, $rs", (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>, @@ -793,9 +790,18 @@ def : MipsInstAlias<"dsra $rd, $rt, $rs", (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"dsll $rd, $rt, $rs", + (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, + ISA_MIPS3; def : MipsInstAlias<"dsrl $rd, $rt, $rs", (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; + def : MipsInstAlias<"dsrl $rd, $rt", + (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>, + ISA_MIPS3; + def : MipsInstAlias<"dsll $rd, $rt", + (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>, + ISA_MIPS3; // Two operand (implicit 0 selector) versions: def : MipsInstAlias<"dmtc0 $rt, $rd", diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index 5d82571ff94f..4a34e3101cb8 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -564,7 +564,7 @@ Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch, // For given opcode returns opcode of corresponding instruction with 
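For N64 PIC, the expansion added above turns a la/dla macro into a %got_disp load plus optional offset and index additions. A standalone sketch that just prints the shape of that expansion for dla $4, sym+8($5); it ignores the $25/%call16 special case and the $at fallback, and the register names are only an example:

#include <cstdio>

void expandDla(const char *Rd, const char *Sym, int Offset, const char *Rs) {
  std::printf("ld     %s, %%got_disp(%s)($gp)\n", Rd, Sym);
  if (Offset != 0)                            // offset must fit in 16 bits
    std::printf("daddiu %s, %s, %d\n", Rd, Rd, Offset);
  if (Rs)                                     // base-register variant
    std::printf("daddu  %s, %s, %s\n", Rd, Rd, Rs);
}

int main() { expandDla("$4", "sym", 8, "$5"); }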
short // delay slot. -// For the pseudo TAILCALL*_MM instrunctions return the short delay slot +// For the pseudo TAILCALL*_MM instructions return the short delay slot // form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range // that is too short to make use of for tail calls. static int getEquivalentCallShort(int Opcode) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 02102d6b22f4..a6ec9fb2e598 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,18 +364,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); - if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { - setOperationAction(ISD::ADDC, MVT::i32, Expand); - setOperationAction(ISD::ADDE, MVT::i32, Expand); - } - - setOperationAction(ISD::ADDC, MVT::i64, Expand); - setOperationAction(ISD::ADDE, MVT::i64, Expand); - setOperationAction(ISD::SUBC, MVT::i32, Expand); - setOperationAction(ISD::SUBE, MVT::i32, Expand); - setOperationAction(ISD::SUBC, MVT::i64, Expand); - setOperationAction(ISD::SUBE, MVT::i64, Expand); - // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -481,7 +469,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -936,130 +923,14 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } -static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, - const MipsSubtarget &Subtarget) { - // ROOTNode must have a multiplication as an operand for the match to be - // successful. - if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL && - ROOTNode->getOperand(1).getOpcode() != ISD::MUL) - return SDValue(); - - // We don't handle vector types here. - if (ROOTNode->getValueType(0).isVector()) - return SDValue(); - - // For MIPS64, madd / msub instructions are inefficent to use with 64 bit - // arithmetic. E.g. - // (add (mul a b) c) => - // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in - // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32) - // or - // MIPS64R2: (dins (mflo res) (mfhi res) 32 32) - // - // The overhead of setting up the Hi/Lo registers and reassembling the - // result makes this a dubious optimzation for MIPS64. The core of the - // problem is that Hi/Lo contain the upper and lower 32 bits of the - // operand and result. - // - // It requires a chain of 4 add/mul for MIPS64R2 to get better code - // density than doing it naively, 5 for MIPS64. Additionally, using - // madd/msub on MIPS64 requires the operands actually be 32 bit sign - // extended operands, not true 64 bit values. - // - // FIXME: For the moment, disable this completely for MIPS64. - if (Subtarget.hasMips64()) - return SDValue(); - - SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL - ? ROOTNode->getOperand(0) - : ROOTNode->getOperand(1); - - SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL - ? ROOTNode->getOperand(1) - : ROOTNode->getOperand(0); - - // Transform this to a MADD only if the user of this node is the add. - // If there are other users of the mul, this function returns here. 
- if (!Mult.hasOneUse()) - return SDValue(); - - // maddu and madd are unusual instructions in that on MIPS64 bits 63..31 - // must be in canonical form, i.e. sign extended. For MIPS32, the operands - // of the multiply must have 32 or more sign bits, otherwise we cannot - // perform this optimization. We have to check this here as we're performing - // this optimization pre-legalization. - SDValue MultLHS = Mult->getOperand(0); - SDValue MultRHS = Mult->getOperand(1); - unsigned LHSSB = CurDAG.ComputeNumSignBits(MultLHS); - unsigned RHSSB = CurDAG.ComputeNumSignBits(MultRHS); - - if (LHSSB < 32 || RHSSB < 32) - return SDValue(); - - APInt HighMask = - APInt::getHighBitsSet(Mult->getValueType(0).getScalarSizeInBits(), 32); - bool IsUnsigned = CurDAG.MaskedValueIsZero(Mult->getOperand(0), HighMask) && - CurDAG.MaskedValueIsZero(Mult->getOperand(1), HighMask) && - CurDAG.MaskedValueIsZero(AddOperand, HighMask); - - // Initialize accumulator. - SDLoc DL(ROOTNode); - SDValue TopHalf; - SDValue BottomHalf; - BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, - CurDAG.getIntPtrConstant(0, DL)); - - TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, - CurDAG.getIntPtrConstant(1, DL)); - SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - BottomHalf, - TopHalf); - - // Create MipsMAdd(u) / MipsMSub(u) node. - bool IsAdd = ROOTNode->getOpcode() == ISD::ADD; - unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd) - : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub); - SDValue MAddOps[3] = { - CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), - CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; - EVT VTs[2] = {MVT::i32, MVT::i32}; - SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); - - SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - SDValue Combined = - CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); - return Combined; -} - -static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) - if (DCI.isBeforeLegalizeOps()) { - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) - return performMADD_MSUBCombine(N, DAG, Subtarget); - - return SDValue(); - } - - return SDValue(); -} - static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0 (mul v1, v2)) => (madd v1, v2, v0) - if (DCI.isBeforeLegalizeOps()) { - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) - return performMADD_MSUBCombine(N, DAG, Subtarget); + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + if (DCI.isBeforeLegalizeOps()) return SDValue(); - } - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1187,8 +1058,6 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); - case ISD::SUB: - return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp 
b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 4be26dd25dc0..49ae6dd4cd39 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,64 +245,46 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { - SDValue InFlag = Node->getOperand(2); - unsigned Opc = InFlag.getOpcode(); +void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, + SDValue CmpLHS, const SDLoc &DL, + SDNode *Node) const { + unsigned Opc = InFlag.getOpcode(); (void)Opc; + + assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || + (Opc == ISD::SUBC || Opc == ISD::SUBE)) && + "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); + + unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; + if (Subtarget->isGP64bit()) { + SLTuOp = Mips::SLTu64; + ADDuOp = Mips::DADDu; + } + + SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - // In the base case, we can rely on the carry bit from the addsc - // instruction. - if (Opc == ISD::ADDC) { - SDValue Ops[3] = {LHS, RHS, InFlag}; - CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); - return; + SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); + + if (Subtarget->isGP64bit()) { + // On 64-bit targets, sltu produces an i64 but our backend currently says + // that SLTu64 produces an i32. We need to fix this in the long run but for + // now, just make the DAG type-correct by asserting the upper bits are zero. + Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, + CurDAG->getTargetConstant(0, DL, VT), + SDValue(Carry, 0), + CurDAG->getTargetConstant(Mips::sub_32, DL, + VT)); } - assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!"); - - // The more complex case is when there is a chain of ISD::ADDE nodes like: - // (adde (adde (adde (addc a b) c) d) e). - // - // The addwc instruction does not write to the carry bit, instead it writes - // to bit 20 of the dsp control register. To match this series of nodes, each - // intermediate adde node must be expanded to write the carry bit before the - // addition. - - // Start by reading the overflow field for addsc and moving the value to the - // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP - // corresponds to reading/writing the entire control register to/from a GPR. - - SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32); - - SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32); - - SDNode *DSPCtrlField = - CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag); - - SDNode *Carry = CurDAG->getMachineNode( - Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne); + // Generate a second addition only if we know that RHS is not a + // constant-zero node. + SDNode *AddCarry = Carry; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS); + if (!C || C->getZExtValue()) + AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); - SDValue Ops[4] = {SDValue(DSPCtrlField, 0), - CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne, - SDValue(Carry, 0)}; - SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops); - - // My reading of the the MIPS DSP 3.01 specification isn't as clear as I - // would like about whether bit 20 always gets overwritten by addwc. - // Hence take an extremely conservative view and presume it's sticky. 
We - // therefore need to clear it. - - SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32); - - SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)}; - SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps); - - SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue, - SDValue(DSPCtrlFinal, 0), CstOne); - - SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)}; - CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands); + CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); } /// Match frameindex @@ -783,8 +765,19 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { switch(Opcode) { default: break; + case ISD::SUBE: { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu; + selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); + return true; + } + case ISD::ADDE: { - selectAddE(Node, DL); + if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. + break; + SDValue InFlag = Node->getOperand(2); + unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu; + selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index 6f38289c5a45..f89a350cab04 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,7 +41,8 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddE(SDNode *Node, const SDLoc &DL) const; + void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, + const SDLoc &DL, SDNode *Node) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index b57bceb3c837..06a97b9d123e 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,6 +179,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setTargetDAGCombine(ISD::ADDE); + setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -419,6 +421,163 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } +// selectMADD - +// Transforms a subgraph in CurDAG if the following pattern is found: +// (addc multLo, Lo0), (adde multHi, Hi0), +// where, +// multHi/Lo: product of multiplication +// Lo0: initial value of Lo register +// Hi0: initial value of Hi register +// Return true if pattern matching was successful. +static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { + // ADDENode's second operand must be a flag output of an ADDC node in order + // for the matching to be successful. + SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); + + if (ADDCNode->getOpcode() != ISD::ADDC) + return false; + + SDValue MultHi = ADDENode->getOperand(0); + SDValue MultLo = ADDCNode->getOperand(0); + SDNode *MultNode = MultHi.getNode(); + unsigned MultOpc = MultHi.getOpcode(); + + // MultHi and MultLo must be generated by the same node, + if (MultLo.getNode() != MultNode) + return false; + + // and it must be a multiplication. 
+ if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) + return false; + + // MultLo amd MultHi must be the first and second output of MultNode + // respectively. + if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) + return false; + + // Transform this to a MADD only if ADDENode and ADDCNode are the only users + // of the values of MultNode, in which case MultNode will be removed in later + // phases. + // If there exist users other than ADDENode or ADDCNode, this function returns + // here, which will result in MultNode being mapped to a single MULT + // instruction node rather than a pair of MULT and MADD instructions being + // produced. + if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) + return false; + + SDLoc DL(ADDENode); + + // Initialize accumulator. + SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + ADDCNode->getOperand(1), + ADDENode->getOperand(1)); + + // create MipsMAdd(u) node + MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd; + + SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, + MultNode->getOperand(0),// Factor 0 + MultNode->getOperand(1),// Factor 1 + ACCIn); + + // replace uses of adde and addc here + if (!SDValue(ADDCNode, 0).use_empty()) { + SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); + } + if (!SDValue(ADDENode, 0).use_empty()) { + SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); + } + + return true; +} + +// selectMSUB - +// Transforms a subgraph in CurDAG if the following pattern is found: +// (addc Lo0, multLo), (sube Hi0, multHi), +// where, +// multHi/Lo: product of multiplication +// Lo0: initial value of Lo register +// Hi0: initial value of Hi register +// Return true if pattern matching was successful. +static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { + // SUBENode's second operand must be a flag output of an SUBC node in order + // for the matching to be successful. + SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); + + if (SUBCNode->getOpcode() != ISD::SUBC) + return false; + + SDValue MultHi = SUBENode->getOperand(1); + SDValue MultLo = SUBCNode->getOperand(1); + SDNode *MultNode = MultHi.getNode(); + unsigned MultOpc = MultHi.getOpcode(); + + // MultHi and MultLo must be generated by the same node, + if (MultLo.getNode() != MultNode) + return false; + + // and it must be a multiplication. + if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) + return false; + + // MultLo amd MultHi must be the first and second output of MultNode + // respectively. + if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) + return false; + + // Transform this to a MSUB only if SUBENode and SUBCNode are the only users + // of the values of MultNode, in which case MultNode will be removed in later + // phases. + // If there exist users other than SUBENode or SUBCNode, this function returns + // here, which will result in MultNode being mapped to a single MULT + // instruction node rather than a pair of MULT and MSUB instructions being + // produced. + if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) + return false; + + SDLoc DL(SUBENode); + + // Initialize accumulator. + SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + SUBCNode->getOperand(0), + SUBENode->getOperand(0)); + + // create MipsSub(u) node + MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MSubu : MipsISD::MSub; + + SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, + MultNode->getOperand(0),// Factor 0 + MultNode->getOperand(1),// Factor 1 + ACCIn); + + // replace uses of sube and subc here + if (!SDValue(SUBCNode, 0).use_empty()) { + SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); + } + if (!SDValue(SUBENode, 0).use_empty()) { + SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); + } + + return true; +} + +static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) + return SDValue(N, 0); + + return SDValue(); +} + // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -661,6 +820,19 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && + selectMSUB(N, &DAG)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -938,12 +1110,16 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { + case ISD::ADDE: + return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; + case ISD::SUBE: + return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index dd7707084948..a64d95512a4a 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -141,9 +141,9 @@ int NVPTXTTIImpl::getArithmeticInstrCost( } } -void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, +void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); // Enable partial unrolling and runtime unrolling, but reduce the // threshold. 
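With the DSP path removed, ADDE is now selected as a compare-and-add sequence: the carry is recomputed with sltu from the low-word sum and then added into the high word. In plain integer terms, the selected pattern for a 64-bit add built from 32-bit halves is the following standalone model (not the SelectionDAG code itself):

#include <cstdint>

void add64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi,
           uint32_t &Lo, uint32_t &Hi) {
  Lo = ALo + BLo;            // ADDu (the ADDC half)
  uint32_t Carry = Lo < ALo; // SLTu recovers the carry from the low word
  Hi = AHi + BHi + Carry;    // ADDu + ADDu (the ADDE half)
}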
This partially unrolls small loops which are often diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 03075b550429..f987892ba675 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -61,7 +61,8 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); }; } // end namespace llvm diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 6d7eb786a683..7393f3d7a08a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -131,10 +131,11 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { switch ((PPC::Fixups)Fixup.getKind()) { - default: break; + default: + return false; case PPC::fixup_ppc_br24: case PPC::fixup_ppc_br24abs: // If the target symbol has a local entry point we must not attempt @@ -147,10 +148,10 @@ public: // and thus the shift to pack it. unsigned Other = S->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) - IsResolved = false; + return true; } } - break; + return false; } } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index ae43e59d3cb1..dce443997ea5 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -17,35 +17,31 @@ namespace llvm { namespace PPC { enum Fixups { - // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b' - // and 'bl'. + // 24-bit PC relative relocation for direct branches like 'b' and 'bl'. fixup_ppc_br24 = FirstTargetFixupKind, - - /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional - /// branches. + + /// 14-bit PC relative relocation for conditional branches. fixup_ppc_brcond14, - - /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches - /// like 'ba' and 'bla'. + + /// 24-bit absolute relocation for direct branches like 'ba' and 'bla'. fixup_ppc_br24abs, - /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional - /// branches. + /// 14-bit absolute relocation for conditional branches. fixup_ppc_brcond14abs, - /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo) - /// or ha16(_foo) for instrs like 'li' or 'addis'. + /// A 16-bit fixup corresponding to lo16(_foo) or ha16(_foo) for instrs like + /// 'li' or 'addis'. fixup_ppc_half16, - - /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with - /// implied 2 zero bits for instrs like 'std'. + + /// A 14-bit fixup corresponding to lo16(_foo) with implied 2 zero bits for + /// instrs like 'std'. fixup_ppc_half16ds, - /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call - /// to __tls_get_addr for the TLS general and local dynamic models, - /// or inserts the thread-pointer register number. + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the + /// TLS general and local dynamic models, or inserts the thread-pointer + /// register number. 
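getUnrollingPreferences now also receives ScalarEvolution, as the Hexagon and NVPTX overrides above show. A sketch of a target override using the new signature; MyTTIImpl is a hypothetical class, LLVM headers are assumed, and the policy shown is only an example:

void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP) {
  UP.Partial = true;
  // Only bother with runtime unrolling when the trip count is not a
  // compile-time constant.
  if (SE.getSmallConstantTripCount(L) == 0)
    UP.Runtime = true;
}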
fixup_ppc_nofixup, - + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 6d591ca964a6..d5506277ca88 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -219,11 +219,11 @@ bool PPCMachObjectWriter::recordScatteredRelocation( const MCSymbol *SB = &B->getSymbol(); if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + + report_fatal_error("symbol '" + SB->getName() + "' can not be undefined in a subtraction expression"); // FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h - Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + Value2 = Writer->getSymbolAddress(*SB, Layout); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } // FIXME: does FixedValue get used?? diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 07c9c1f9f84c..ad92ac8ce120 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPC_H #define LLVM_LIB_TARGET_POWERPC_PPC_H +#include "llvm/Support/CodeGen.h" #include "MCTargetDesc/PPCMCTargetDesc.h" // GCC #defines PPC on Linux but we use it as our namespace name @@ -28,7 +29,7 @@ namespace llvm { class AsmPrinter; class MCInst; - FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM); + FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -41,7 +42,7 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCQPXLoadSplatPass(); - FunctionPass *createPPCISelDag(PPCTargetMachine &TM); + FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); @@ -51,6 +52,7 @@ namespace llvm { void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); + void initializePPCTLSDynamicCallPass(PassRegistry &); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 24bc027f8106..094d3e6a61b5 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -24,12 +24,14 @@ //===----------------------------------------------------------------------===// #include "PPC.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -81,10 +83,7 @@ namespace { public: static char ID; - PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + PPCCTRLoops() : FunctionPass(ID) { initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); } @@ -99,16 +98,18 @@ namespace { } private: - bool mightUseCTR(const Triple &TT, BasicBlock *BB); + bool mightUseCTR(BasicBlock *BB); bool 
convertToCTRLoop(Loop *L); private: - PPCTargetMachine *TM; + const PPCTargetMachine *TM; + const PPCSubtarget *STI; + const PPCTargetLowering *TLI; + const DataLayout *DL; + const TargetLibraryInfo *LibInfo; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; DominatorTree *DT; - const TargetLibraryInfo *LibInfo; bool PreserveLCSSA; }; @@ -149,9 +150,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) -FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) { - return new PPCCTRLoops(TM); -} +FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", @@ -169,6 +168,14 @@ bool PPCCTRLoops::runOnFunction(Function &F) { if (skipFunction(F)) return false; + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + TM = &TPC->getTM<PPCTargetMachine>(); + STI = TM->getSubtargetImpl(F); + TLI = STI->getTargetLowering(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -198,8 +205,7 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { // Determining the address of a TLS variable results in a function call in // certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine *TM, - const Value *MemAddr) { +static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { const auto *GV = dyn_cast<GlobalValue>(MemAddr); if (!GV) { // Recurse to check for constants that refer to TLS global variables. @@ -213,35 +219,35 @@ static bool memAddrUsesCTR(const PPCTargetMachine *TM, if (!GV->isThreadLocal()) return false; - if (!TM) - return true; - TLSModel::Model Model = TM->getTLSModel(GV); + TLSModel::Model Model = TM.getTLSModel(GV); return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; } -bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { +// Loop through the inline asm constraints and look for something that clobbers +// ctr. +static bool asmClobbersCTR(InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; +} + +bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) { if (CallInst *CI = dyn_cast<CallInst>(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { - // Inline ASM is okay, unless it clobbers the ctr register. - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - + if (asmClobbersCTR(IA)) + return true; continue; } - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (Function *F = CI->getCalledFunction()) { // Most intrinsics don't become function calls, but some might. 
// sin, cos, exp and log are always calls. @@ -380,9 +386,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { } if (Opcode) { - auto &DL = CI->getModule()->getDataLayout(); - MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), - true); + MVT VTy = TLI->getSimpleValueType( + *DL, CI->getArgOperand(0)->getType(), true); if (VTy == MVT::Other) return true; @@ -406,17 +411,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { CastInst *CI = cast<CastInst>(J); if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType())) + isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || + isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) return true; - } else if (isLargeIntegerTy(TT.isArch32Bit(), + } else if (isLargeIntegerTy(!TM->isPPC64(), J->getType()->getScalarType()) && (J->getOpcode() == Instruction::UDiv || J->getOpcode() == Instruction::SDiv || J->getOpcode() == Instruction::URem || J->getOpcode() == Instruction::SRem)) { return true; - } else if (TT.isArch32Bit() && + } else if (!TM->isPPC64() && isLargeIntegerTy(false, J->getType()->getScalarType()) && (J->getOpcode() == Instruction::Shl || J->getOpcode() == Instruction::AShr || @@ -428,16 +433,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // On PowerPC, indirect jumps use the counter register. return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) { - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) return true; } - if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) { + if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: @@ -456,7 +456,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { } for (Value *Operand : J->operands()) - if (memAddrUsesCTR(TM, Operand)) + if (memAddrUsesCTR(*TM, Operand)) return true; } @@ -466,11 +466,6 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; - const Triple TT = - Triple(L->getHeader()->getParent()->getParent()->getTargetTriple()); - if (!TT.isArch32Bit() && !TT.isArch64Bit()) - return MadeChange; // Unknown arch. type. - // Process nested loops first. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); @@ -495,7 +490,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // want to use the counter register if the loop contains calls. for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) - if (mightUseCTR(TT, *I)) + if (mightUseCTR(*I)) return MadeChange; SmallVector<BasicBlock*, 4> ExitingBlocks; @@ -517,7 +512,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { } else if (!SE->isLoopInvariant(EC, L)) continue; - if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32)) + if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 
64 : 32)) continue; // We now have a loop-invariant count of loop iterations (which is not the @@ -571,7 +566,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // preheader, then we can use it (except if the preheader contains a use of // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). - if (!Preheader || mightUseCTR(TT, Preheader)) + if (!Preheader || mightUseCTR(Preheader)) Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -582,10 +577,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // selected branch. MadeChange = true; - SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt"); + SCEVExpander SCEVE(*SE, *DL, "loopcnt"); LLVMContext &C = SE->getContext(); - Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) : - Type::getInt32Ty(C); + Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index afd2e87078a9..535b9deaefac 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -114,8 +114,8 @@ namespace { unsigned GlobalBaseReg; public: - explicit PPCDAGToDAGISel(PPCTargetMachine &tm) - : SelectionDAGISel(tm), TM(tm) {} + explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary @@ -5116,6 +5116,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { /// createPPCISelDag - This pass converts a legalized DAG into a /// PowerPC-specific DAG, ready for instruction scheduling. /// -FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { - return new PPCDAGToDAGISel(TM); +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new PPCDAGToDAGISel(TM, OptLevel); } diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 31c50785c2ee..5f8085f4626e 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -52,6 +52,7 @@ namespace { protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; + bool NeedFence = true; bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); @@ -62,6 +63,16 @@ protected: MI.getOpcode() != PPC::ADDItlsldLADDR && MI.getOpcode() != PPC::ADDItlsgdLADDR32 && MI.getOpcode() != PPC::ADDItlsldLADDR32) { + + // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP + // as scheduling fences, we skip creating fences if we already + // have existing ADJCALLSTACKDOWN/UP to avoid nesting, + // which causes verification error with -verify-machineinstrs. + if (MI.getOpcode() == PPC::ADJCALLSTACKDOWN) + NeedFence = false; + else if (MI.getOpcode() == PPC::ADJCALLSTACKUP) + NeedFence = true; + ++I; continue; } @@ -96,11 +107,15 @@ protected: break; } - // Don't really need to save data to the stack - the clobbered + // We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr + // as schduling fence to avoid it is scheduled before + // mflr in the prologue and the address in LR is clobbered (PR25839). 
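The NeedFence flag introduced in the PPCTLSDynamicCall hunk above skips emitting ADJCALLSTACKDOWN/UP around the expanded __tls_get_addr call when an enclosing pair is already present, so the scheduling fences never nest. A minimal standalone sketch of that bookkeeping, with a made-up opcode enum standing in for the real PPC pseudo-instructions:

#include <cassert>
#include <vector>

// Hypothetical stand-ins for the PPC opcodes; only the fence-tracking
// pattern from the hunk above is illustrated.
enum Opcode { AdjCallStackDown, AdjCallStackUp, TlsPseudo, Other };

// Returns how many DOWN/UP fence pairs would be emitted for the block.
static int countEmittedFences(const std::vector<Opcode> &Block) {
  bool NeedFence = true; // no enclosing ADJCALLSTACKDOWN/UP seen yet
  int Emitted = 0;
  for (Opcode Op : Block) {
    if (Op == AdjCallStackDown)
      NeedFence = false;          // already inside a fenced region
    else if (Op == AdjCallStackUp)
      NeedFence = true;           // left the fenced region
    else if (Op == TlsPseudo && NeedFence)
      ++Emitted;                  // would wrap the expanded call in DOWN/UP
  }
  return Emitted;
}

int main() {
  // A TLS pseudo inside an existing DOWN/UP pair gets no extra fence...
  assert(countEmittedFences({AdjCallStackDown, TlsPseudo, AdjCallStackUp}) == 0);
  // ...while a bare TLS pseudo does.
  assert(countEmittedFences({Other, TlsPseudo, Other}) == 1);
  return 0;
}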
+ // We don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) - .addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) @@ -116,7 +131,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI.getOperand(3)); - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index a88a6541e8d0..fe092cc3b858 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -93,6 +93,7 @@ extern "C" void LLVMInitializePowerPCTarget() { PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); + initializePPCTLSDynamicCallPass(PR); } /// Return the datalayout string of a subtarget. @@ -336,7 +337,7 @@ bool PPCPassConfig::addPreISel() { addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) - addPass(createPPCCTRLoops(getPPCTargetMachine())); + addPass(createPPCCTRLoops()); return false; } @@ -352,7 +353,7 @@ bool PPCPassConfig::addILPOpts() { bool PPCPassConfig::addInstSelector() { // Install an instruction selector. - addPass(createPPCISelDag(getPPCTargetMachine())); + addPass(createPPCISelDag(getPPCTargetMachine(), getOptLevel())); #ifndef NDEBUG if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 5eb6ba785d1b..2dc3828334ac 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,7 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 3dbd5f5b9a92..6110706b01b9 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -189,7 +189,7 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -void PPCTTIImpl::getUnrollingPreferences(Loop *L, +void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { if (ST->getDarwinDirective() == PPC::DIR_A2) { // The A2 is in-order with a deep pipeline, and concatenation unrolling @@ -201,7 +201,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, UP.AllowExpensiveTripCount = true; } - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 758c335def08..99ca6394d1be 100644 --- 
a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -52,7 +52,8 @@ public: Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); /// @} diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index c72b47b09085..d4454c271f5a 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -203,13 +203,14 @@ namespace { return InfosBE[Kind - FirstTargetFixupKind]; } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { switch ((Sparc::Fixups)Fixup.getKind()) { - default: break; + default: + return false; case Sparc::fixup_sparc_wplt30: if (Target.getSymA()->getSymbol().isTemporary()) - return; + return false; case Sparc::fixup_sparc_tls_gd_hi22: case Sparc::fixup_sparc_tls_gd_lo10: case Sparc::fixup_sparc_tls_gd_add: @@ -227,7 +228,8 @@ namespace { case Sparc::fixup_sparc_tls_ie_ldx: case Sparc::fixup_sparc_tls_ie_add: case Sparc::fixup_sparc_tls_le_hix22: - case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break; + case Sparc::fixup_sparc_tls_le_lox10: + return true; } } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ad05779a9f64..ee23692ad1db 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -61,6 +61,7 @@ enum RegisterKind { VR64Reg, VR128Reg, AR32Reg, + CR64Reg, }; enum MemoryKind { @@ -343,6 +344,7 @@ public: bool isVF128() const { return false; } bool isVR128() const { return isReg(VR128Reg); } bool isAR32() const { return isReg(AR32Reg); } + bool isCR64() const { return isReg(CR64Reg); } bool isAnyReg() const { return (isReg() || isImm(0, 15)); } bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); } bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); } @@ -379,7 +381,8 @@ private: RegGR, RegFP, RegV, - RegAR + RegAR, + RegCR }; struct Register { RegisterGroup Group; @@ -487,6 +490,9 @@ public: OperandMatchResultTy parseAR32(OperandVector &Operands) { return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg); } + OperandMatchResultTy parseCR64(OperandVector &Operands) { + return parseRegister(Operands, RegCR, SystemZMC::CR64Regs, CR64Reg); + } OperandMatchResultTy parseAnyReg(OperandVector &Operands) { return parseAnyRegister(Operands); } @@ -648,6 +654,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg) { Reg.Group = RegV; else if (Prefix == 'a' && Reg.Num < 16) Reg.Group = RegAR; + else if (Prefix == 'c' && Reg.Num < 16) + Reg.Group = RegCR; else return Error(Reg.StartLoc, "invalid register"); @@ -741,6 +749,10 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) { Kind = AR32Reg; RegNo = SystemZMC::AR32Regs[Reg.Num]; } + else if (Reg.Group == RegCR) { + Kind = CR64Reg; + RegNo = SystemZMC::CR64Regs[Reg.Num]; + } else { return MatchOperand_ParseFail; } @@ -1056,6 +1068,8 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = SystemZMC::VR128Regs[Reg.Num]; else if (Reg.Group == RegAR) RegNo = SystemZMC::AR32Regs[Reg.Num]; + else 
if (Reg.Group == RegCR) + RegNo = SystemZMC::CR64Regs[Reg.Num]; StartLoc = Reg.StartLoc; EndLoc = Reg.EndLoc; return false; diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 27fd70bc6092..8903b57ffd0b 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -162,6 +162,12 @@ static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16); } +static DecodeStatus DecodeCR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, SystemZMC::CR64Regs, 16); +} + template<unsigned N> static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) { if (!isUInt<N>(Imm)) diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index dfea7e33fa15..727ab921daf9 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -116,6 +116,13 @@ const unsigned SystemZMC::AR32Regs[16] = { SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15 }; +const unsigned SystemZMC::CR64Regs[16] = { + SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, + SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, + SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, + SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15 +}; + unsigned SystemZMC::getFirstReg(unsigned Reg) { static unsigned Map[SystemZ::NUM_TARGET_REGS]; static bool Initialized = false; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index d9926c7e4986..dbca3485290a 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -55,6 +55,7 @@ extern const unsigned VR32Regs[32]; extern const unsigned VR64Regs[32]; extern const unsigned VR128Regs[32]; extern const unsigned AR32Regs[16]; +extern const unsigned CR64Regs[16]; // Return the 0-based number of the first architectural register that // contains the given LLVM register. E.g. R1D -> 1. diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 74cf653b9d95..9b714157550d 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -67,6 +67,11 @@ We don't use ICM, STCM, or CLM. -- +We don't use ADD (LOGICAL) HIGH, SUBTRACT (LOGICAL) HIGH, +or COMPARE (LOGICAL) HIGH yet. + +-- + DAGCombiner doesn't yet fold truncations of extended loads. 
Functions like: unsigned long f (unsigned long x, unsigned short *y) diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td index c5f324418da5..41300a1b6295 100644 --- a/lib/Target/SystemZ/SystemZ.td +++ b/lib/Target/SystemZ/SystemZ.td @@ -56,6 +56,7 @@ include "SystemZInstrVector.td" include "SystemZInstrFP.td" include "SystemZInstrHFP.td" include "SystemZInstrDFP.td" +include "SystemZInstrSystem.td" def SystemZInstrInfo : InstrInfo {} diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index ffb0b8d1c861..c5faa0d62881 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -68,11 +68,21 @@ def FeaturePopulationCount : SystemZFeature< "Assume that the population-count facility is installed" >; +def FeatureMessageSecurityAssist3 : SystemZFeature< + "message-security-assist-extension3", "MessageSecurityAssist3", + "Assume that the message-security-assist extension facility 3 is installed" +>; + def FeatureMessageSecurityAssist4 : SystemZFeature< "message-security-assist-extension4", "MessageSecurityAssist4", "Assume that the message-security-assist extension facility 4 is installed" >; +def FeatureResetReferenceBitsMultiple : SystemZFeature< + "reset-reference-bits-multiple", "ResetReferenceBitsMultiple", + "Assume that the reset-reference-bits-multiple facility is installed" +>; + def Arch9NewFeatures : SystemZFeatureList<[ FeatureDistinctOps, FeatureFastSerialization, @@ -81,7 +91,9 @@ def Arch9NewFeatures : SystemZFeatureList<[ FeatureInterlockedAccess1, FeatureLoadStoreOnCond, FeaturePopulationCount, - FeatureMessageSecurityAssist4 + FeatureMessageSecurityAssist3, + FeatureMessageSecurityAssist4, + FeatureResetReferenceBitsMultiple ]>; //===----------------------------------------------------------------------===// @@ -120,13 +132,19 @@ def FeatureDFPZonedConversion : SystemZFeature< "Assume that the DFP zoned-conversion facility is installed" >; +def FeatureEnhancedDAT2 : SystemZFeature< + "enhanced-dat-2", "EnhancedDAT2", + "Assume that the enhanced-DAT facility 2 is installed" +>; + def Arch10NewFeatures : SystemZFeatureList<[ FeatureExecutionHint, FeatureLoadAndTrap, FeatureMiscellaneousExtensions, FeatureProcessorAssist, FeatureTransactionalExecution, - FeatureDFPZonedConversion + FeatureDFPZonedConversion, + FeatureEnhancedDAT2 ]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 5f6115ed86a4..7620e06ccbc9 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2468,6 +2468,14 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpType = "reg"; } +class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src), + mnemonic#"\t$R1", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let R2 = 0; +} + class UnaryMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src), @@ -2702,6 +2710,26 @@ class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode, let AddedComplexity = 7; } +class SideEffectBinaryRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []>; + +class 
SideEffectBinaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let R3 = 0; + let M4 = 0; +} + +class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let M3 = 0; +} + class SideEffectBinaryIE<string mnemonic, bits<16> opcode, Immediate imm1, Immediate imm2> : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2), @@ -2729,6 +2757,10 @@ class SideEffectBinarySSf<string mnemonic, bits<8> opcode> : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2), mnemonic##"\t$BD1, $BDL2", []>; +class SideEffectBinarySSE<string mnemonic, bits<16> opcode> + : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2), + mnemonic#"\t$BD1, $BD2", []>; + class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), @@ -3612,6 +3644,22 @@ class SideEffectTernarySSc<string mnemonic, bits<8> opcode> shift12only:$BD2, imm32zx4:$I3), mnemonic##"\t$BDL1, $BD2, $I3", []>; +class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3), + mnemonic#"\t$R1, $R2, $R3", []> { + let M4 = 0; +} + +class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3), + mnemonic#"\t$R1, $R3, $R2", []> { + let M4 = 0; +} + class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, @@ -3630,6 +3678,13 @@ class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []>; +multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2> { + def "" : SideEffectTernaryRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>; + def Opt : SideEffectBinaryRRFc<mnemonic, opcode, cls1, cls2>; +} + class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> @@ -3720,6 +3775,18 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, } } +class SideEffectTernaryRS<string mnemonic, bits<8> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSa<opcode, (outs), + (ins cls1:$R1, cls2:$R3, bdaddr12only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []>; + +class SideEffectTernaryRSY<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSYa<opcode, (outs), + (ins cls1:$R1, cls2:$R3, bdaddr20only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []>; + class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRSa<opcode, (outs cls1:$R1, cls2:$R3), @@ -3997,6 +4064,35 @@ multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> { VR128:$V4, imm32zx4:$M5, 0)>; } +class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4), + mnemonic#"\t$R1, $R2, $R3, $M4", []>; + 
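The *Opt multiclasses defined in this hunk (SideEffectTernaryRRFcOpt here, the quaternary variants just below) give each assembler-only system instruction one record per omitted trailing operand, with the corresponding M field forced to zero via "let M3 = 0;" and friends; SSKE, defined later in this patch, is one user. A small standalone sketch of the same idea, assuming a simplified operand list rather than the real TableGen records:

#include <cassert>
#include <vector>

// Simplified stand-in for an RRF-style instruction with an optional mask:
// when only two operands are written, the mask defaults to 0, mirroring the
// zeroed M field in the Opt variants above.
struct SystemInst {
  unsigned R1;
  unsigned R2;
  unsigned M3;
};

static SystemInst parseOperands(const std::vector<unsigned> &Ops) {
  assert(Ops.size() == 2 || Ops.size() == 3);
  SystemInst I;
  I.R1 = Ops[0];
  I.R2 = Ops[1];
  I.M3 = Ops.size() == 3 ? Ops[2] : 0; // omitted operand encodes as zero
  return I;
}

int main() {
  assert(parseOperands({1, 2}).M3 == 0);    // e.g. "sske %r1, %r2"
  assert(parseOperands({1, 2, 4}).M3 == 4); // e.g. "sske %r1, %r2, 4"
  return 0;
}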
+multiclass SideEffectQuaternaryRRFaOptOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> { + def "" : SideEffectQuaternaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; + def Opt : SideEffectTernaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; + def OptOpt : SideEffectBinaryRRFa<mnemonic, opcode, cls1, cls2>; +} + +class SideEffectQuaternaryRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4), + mnemonic#"\t$R1, $R3, $R2, $M4", []>; + +multiclass SideEffectQuaternaryRRFbOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> { + def "" : SideEffectQuaternaryRRFb<mnemonic, opcode, cls1, cls2, cls3>; + def Opt : SideEffectTernaryRRFb<mnemonic, opcode, cls1, cls2, cls3>; +} + class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode, RegisterOperand cls> : InstSSe<opcode, (outs), @@ -4012,6 +4108,16 @@ class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, let mayStore = 1; } +class CmpSwapRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let mayLoad = 1; + let mayStore = 1; +} + class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 9f5e6288348e..98f66c29ae64 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -883,6 +883,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { } def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>; + // Addition to a high register. + def AHHHR : BinaryRRFa<"ahhhr", 0xB9C8, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def AHHLR : BinaryRRFa<"ahhlr", 0xB9D8, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Addition of signed 16-bit immediates. defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>; defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>; @@ -917,6 +923,12 @@ let Defs = [CC] in { } def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>; + // Addition to a high register. + def ALHHHR : BinaryRRFa<"alhhhr", 0xB9CA, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def ALHHLR : BinaryRRFa<"alhhlr", 0xB9DA, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Addition of signed 16-bit immediates. def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>, Requires<[FeatureDistinctOps]>; @@ -927,6 +939,10 @@ let Defs = [CC] in { def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>; def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>; + // Addition of signed 32-bit immediates. + def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>, + Requires<[FeatureHighWord]>; + // Addition of memory. 
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>; @@ -949,6 +965,10 @@ let Defs = [CC], Uses = [CC] in { def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>; } +// Addition that does not modify the condition code. +def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>, + Requires<[FeatureHighWord]>; + //===----------------------------------------------------------------------===// // Subtraction //===----------------------------------------------------------------------===// @@ -961,6 +981,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>; defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>; + // Subtraction from a high register. + def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def SHHLR : BinaryRRFa<"shhlr", 0xB9D9, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Subtraction of memory. defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; @@ -976,6 +1002,12 @@ let Defs = [CC] in { def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>; defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>; + // Subtraction from a high register. + def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Subtraction of unsigned 32-bit immediates. These don't match // subc because we prefer addc for constants. def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>; @@ -1298,6 +1330,12 @@ let Defs = [CC], CCValues = 0xE in { def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>; def CGR : CompareRRE<"cgr", 0xB920, z_scmp, GR64, GR64>; + // Comparison with a high register. + def CHHR : CompareRRE<"chhr", 0xB9CD, null_frag, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def CHLR : CompareRRE<"chlr", 0xB9DD, null_frag, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Comparison with a signed 16-bit immediate. CHIMux expands to CHI or CIH, // depending on the choice of register. def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>, @@ -1344,6 +1382,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>; def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>; + // Comparison with a high register. + def CLHHR : CompareRRE<"clhhr", 0xB9CF, null_frag, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def CLHLR : CompareRRE<"clhlr", 0xB9DF, null_frag, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI // or CLIH, depending on the choice of register. def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>, @@ -1888,54 +1932,12 @@ let mayLoad = 1, Defs = [CC] in let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; -// Supervisor call. -let hasSideEffects = 1, isCall = 1, Defs = [CC] in - def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; - -// Monitor call. -let hasSideEffects = 1, isCall = 1 in - def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; - -// Store clock. 
-let hasSideEffects = 1, Defs = [CC] in { - def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; - def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>; - def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>; -} - -// Store facility list. -let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in - def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; - -// Extract CPU attribute. -let hasSideEffects = 1 in - def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; - -// Extract CPU time. -let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in - def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; - -// Extract PSW. -let hasSideEffects = 1, Uses = [CC] in - def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; - // Execute. let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>; } -// Program return. -let hasSideEffects = 1, Defs = [CC] in - def PR : SideEffectInherentE<"pr", 0x0101>; - -// Move with key. -let mayLoad = 1, mayStore = 1, Defs = [CC] in - def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>; - -// Store real address. -def STRAG : StoreSSE<"strag", 0xE502>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td new file mode 100644 index 000000000000..a9803c2d83e9 --- /dev/null +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -0,0 +1,517 @@ +//==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The instructions in this file implement SystemZ system-level instructions. +// Most of these instructions are privileged or semi-privileged. They are +// not used for code generation, but are provided for use with the assembler +// and disassembler only. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Program-Status Word Instructions. +//===----------------------------------------------------------------------===// + +// Extract PSW. +let hasSideEffects = 1, Uses = [CC] in + def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; + +// Load PSW (extended). +let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in { + def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>; + def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>; +} + +// Insert PSW key. +let Uses = [R2L], Defs = [R2L] in + def IPK : SideEffectInherentS<"ipk", 0xB20B, null_frag>; + +// Set PSW key from address. +let hasSideEffects = 1 in + def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>; + +// Set system mask. +let hasSideEffects = 1, mayLoad = 1 in + def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>; + +// Store then AND/OR system mask. +let hasSideEffects = 1 in { + def STNSM : StoreSI<"stnsm", 0xAC, null_frag, imm32zx8>; + def STOSM : StoreSI<"stosm", 0xAD, null_frag, imm32zx8>; +} + +// Insert address space control. +let hasSideEffects = 1 in + def IAC : InherentRRE<"iac", 0xB224, GR32, null_frag>; + +// Set address space control (fast). 
+let hasSideEffects = 1 in { + def SAC : SideEffectAddressS<"sac", 0xB219, null_frag>; + def SACF : SideEffectAddressS<"sacf", 0xB279, null_frag>; +} + +//===----------------------------------------------------------------------===// +// Control Register Instructions. +//===----------------------------------------------------------------------===// + +// Load control. +def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>; +def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>; + +// Store control. +def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>; +def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>; + +// Extract primary ASN (and instance). +let hasSideEffects = 1 in { + def EPAR : InherentRRE<"epar", 0xB226, GR32, null_frag>; + def EPAIR : InherentRRE<"epair", 0xB99A, GR64, null_frag>; +} + +// Extract secondary ASN (and instance). +let hasSideEffects = 1 in { + def ESAR : InherentRRE<"esar", 0xB227, GR32, null_frag>; + def ESAIR : InherentRRE<"esair", 0xB99B, GR64, null_frag>; +} + +// Set secondary ASN (and instance). +let hasSideEffects = 1 in { + def SSAR : SideEffectUnaryRRE<"ssar", 0xB225, GR32, null_frag>; + def SSAIR : SideEffectUnaryRRE<"ssair", 0xB99F, GR64, null_frag>; +} + +// Extract and set extended authority. +let hasSideEffects = 1 in + def ESEA : UnaryTiedRRE<"esea", 0xB99D, GR32>; + +//===----------------------------------------------------------------------===// +// Prefix-Register Instructions. +//===----------------------------------------------------------------------===// + +// Set prefix. +let hasSideEffects = 1 in + def SPX : SideEffectUnaryS<"spx", 0xB210, null_frag, 4>; + +// Store prefix. +let hasSideEffects = 1 in + def STPX : StoreInherentS<"stpx", 0xB211, null_frag, 4>; + +//===----------------------------------------------------------------------===// +// Storage-Key and Real Memory Instructions. +//===----------------------------------------------------------------------===// + +// Insert storage key extended. +let hasSideEffects = 1 in + def ISKE : BinaryRRE<"iske", 0xB229, null_frag, GR32, GR64>; + +// Insert virtual storage key. +let hasSideEffects = 1 in + def IVSK : BinaryRRE<"ivsk", 0xB223, null_frag, GR32, GR64>; + +// Set storage key extended. +let hasSideEffects = 1, Defs = [CC] in + defm SSKE : SideEffectTernaryRRFcOpt<"sske", 0xB22B, GR32, GR64>; + +// Reset reference bit extended. +let hasSideEffects = 1, Defs = [CC] in + def RRBE : SideEffectBinaryRRE<"rrbe", 0xB22A, GR32, GR64>; + +// Reset reference bits multiple. +let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in + def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; + +// Perform frame management function. +let hasSideEffects = 1 in + def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; + +// Test block. +let hasSideEffects = 1, mayStore = 1, Uses = [R0D], Defs = [R0D, CC] in + def TB : SideEffectBinaryRRE<"tb", 0xB22C, GR64, GR64>; + +// Page in / out. +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def PGIN : SideEffectBinaryRRE<"pgin", 0xB22E, GR64, GR64>; + def PGOUT : SideEffectBinaryRRE<"pgout", 0xB22F, GR64, GR64>; +} + +//===----------------------------------------------------------------------===// +// Dynamic-Address-Translation Instructions. +//===----------------------------------------------------------------------===// + +// Invalidate page table entry. +let hasSideEffects = 1 in + defm IPTE : SideEffectQuaternaryRRFaOptOpt<"ipte", 0xB221, GR64, GR32, GR32>; + +// Invalidate DAT table entry. 
+let hasSideEffects = 1 in + defm IDTE : SideEffectQuaternaryRRFbOpt<"idte", 0xB98E, GR64, GR64, GR64>; + +// Compare and replace DAT table entry. +let Predicates = [FeatureEnhancedDAT2], hasSideEffects = 1, Defs = [CC] in + defm CRDTE : SideEffectQuaternaryRRFbOpt<"crdte", 0xB98F, GR128, GR128, GR64>; + +// Purge TLB. +let hasSideEffects = 1 in + def PTLB : SideEffectInherentS<"ptlb", 0xB20D, null_frag>; + +// Compare and swap and purge. +let hasSideEffects = 1, Defs = [CC] in { + def CSP : CmpSwapRRE<"csp", 0xB250, GR128, GR64>; + def CSPG : CmpSwapRRE<"cspg", 0xB98A, GR128, GR64>; +} + +// Load page-table-entry address. +let hasSideEffects = 1, Defs = [CC] in + def LPTEA : TernaryRRFb<"lptea", 0xB9AA, GR64, GR64, GR64>; + +// Load real address. +let hasSideEffects = 1, Defs = [CC] in { + defm LRA : LoadAddressRXPair<"lra", 0xB1, 0xE313, null_frag>; + def LRAG : LoadAddressRXY<"lrag", 0xE303, null_frag, laaddr20pair>; +} + +// Store real address. +def STRAG : StoreSSE<"strag", 0xE502>; + +// Load using real address. +let mayLoad = 1 in { + def LURA : UnaryRRE<"lura", 0xB24B, null_frag, GR32, GR64>; + def LURAG : UnaryRRE<"lurag", 0xB905, null_frag, GR64, GR64>; +} + +// Store using real address. +let mayStore = 1 in { + def STURA : SideEffectBinaryRRE<"stura", 0xB246, GR32, GR64>; + def STURG : SideEffectBinaryRRE<"sturg", 0xB925, GR64, GR64>; +} + +// Test protection. +let hasSideEffects = 1, Defs = [CC] in + def TPROT : SideEffectBinarySSE<"tprot", 0xE501>; + +//===----------------------------------------------------------------------===// +// Memory-move Instructions. +//===----------------------------------------------------------------------===// + +// Move with key. +let mayLoad = 1, mayStore = 1, Defs = [CC] in + def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>; + +// Move to primary / secondary. +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def MVCP : MemoryBinarySSd<"mvcp", 0xDA, GR64>; + def MVCS : MemoryBinarySSd<"mvcs", 0xDB, GR64>; +} + +// Move with source / destination key. +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1L] in { + def MVCSK : SideEffectBinarySSE<"mvcsk", 0xE50E>; + def MVCDK : SideEffectBinarySSE<"mvcdk", 0xE50F>; +} + +// Move with optional specifications. +let mayLoad = 1, mayStore = 1, Uses = [R0L] in + def MVCOS : SideEffectTernarySSF<"mvcos", 0xC80, GR64>; + +// Move page. +let mayLoad = 1, mayStore = 1, Uses = [R0L], Defs = [CC] in + def MVPG : SideEffectBinaryRRE<"mvpg", 0xB254, GR64, GR64>; + +//===----------------------------------------------------------------------===// +// Address-Space Instructions. +//===----------------------------------------------------------------------===// + +// Load address space parameters. +let hasSideEffects = 1, Defs = [CC] in + def LASP : SideEffectBinarySSE<"lasp", 0xE500>; + +// Purge ALB. +let hasSideEffects = 1 in + def PALB : SideEffectInherentRRE<"palb", 0xB248>; + +// Program call. +let hasSideEffects = 1 in + def PC : SideEffectAddressS<"pc", 0xB218, null_frag>; + +// Program return. +let hasSideEffects = 1, Defs = [CC] in + def PR : SideEffectInherentE<"pr", 0x0101>; + +// Program transfer (with instance). +let hasSideEffects = 1 in { + def PT : SideEffectBinaryRRE<"pt", 0xB228, GR32, GR64>; + def PTI : SideEffectBinaryRRE<"pti", 0xB99E, GR64, GR64>; +} + +// Resume program. +let hasSideEffects = 1, Defs = [CC] in + def RP : SideEffectAddressS<"rp", 0xB277, null_frag>; + +// Branch in subspace group. 
+let hasSideEffects = 1 in + def BSG : UnaryRRE<"bsg", 0xB258, null_frag, GR64, GR64>; + +// Branch and set authority. +let hasSideEffects = 1 in + def BSA : UnaryRRE<"bsa", 0xB25A, null_frag, GR64, GR64>; + +// Test access. +let Defs = [CC] in + def TAR : SideEffectBinaryRRE<"tar", 0xB24C, AR32, GR32>; + +//===----------------------------------------------------------------------===// +// Linkage-Stack Instructions. +//===----------------------------------------------------------------------===// + +// Branch and stack. +let hasSideEffects = 1 in + def BAKR : SideEffectBinaryRRE<"bakr", 0xB240, GR64, GR64>; + +// Extract stacked registers. +let hasSideEffects = 1 in { + def EREG : SideEffectBinaryRRE<"ereg", 0xB249, GR32, GR32>; + def EREGG : SideEffectBinaryRRE<"eregg", 0xB90E, GR64, GR64>; +} + +// Extract stacked state. +let hasSideEffects = 1, Defs = [CC] in + def ESTA : UnaryRRE<"esta", 0xB24A, null_frag, GR128, GR32>; + +// Modify stacked state. +let hasSideEffects = 1 in + def MSTA : SideEffectUnaryRRE<"msta", 0xB247, GR128, null_frag>; + +//===----------------------------------------------------------------------===// +// Time-Related Instructions. +//===----------------------------------------------------------------------===// + +// Perform timing facility function. +let hasSideEffects = 1, mayLoad = 1, Uses = [R0L, R1D], Defs = [CC] in + def PTFF : SideEffectInherentE<"ptff", 0x0104>; + +// Set clock. +let hasSideEffects = 1, Defs = [CC] in + def SCK : SideEffectUnaryS<"sck", 0xB204, null_frag, 8>; + +// Set clock programmable field. +let hasSideEffects = 1, Uses = [R0L] in + def SCKPF : SideEffectInherentE<"sckpf", 0x0107>; + +// Set clock comparator. +let hasSideEffects = 1 in + def SCKC : SideEffectUnaryS<"sckc", 0xB206, null_frag, 8>; + +// Set CPU timer. +let hasSideEffects = 1 in + def SPT : SideEffectUnaryS<"spt", 0xB208, null_frag, 8>; + +// Store clock (fast / extended). +let hasSideEffects = 1, Defs = [CC] in { + def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; + def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>; + def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>; +} + +// Store clock comparator. +let hasSideEffects = 1 in + def STCKC : StoreInherentS<"stckc", 0xB207, null_frag, 8>; + +// Store CPU timer. +let hasSideEffects = 1 in + def STPT : StoreInherentS<"stpt", 0xB209, null_frag, 8>; + +//===----------------------------------------------------------------------===// +// CPU-Related Instructions. +//===----------------------------------------------------------------------===// + +// Store CPU address. +let hasSideEffects = 1 in + def STAP : StoreInherentS<"stap", 0xB212, null_frag, 2>; + +// Store CPU ID. +let hasSideEffects = 1 in + def STIDP : StoreInherentS<"stidp", 0xB202, null_frag, 8>; + +// Store system information. +let hasSideEffects = 1, Uses = [R0L, R1L], Defs = [R0L, CC] in + def STSI : StoreInherentS<"stsi", 0xB27D, null_frag, 0>; + +// Store facility list. +let hasSideEffects = 1 in + def STFL : StoreInherentS<"stfl", 0xB2B1, null_frag, 4>; + +// Store facility list extended. +let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in + def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; + +// Extract CPU attribute. +let hasSideEffects = 1 in + def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; + +// Extract CPU time. +let hasSideEffects = 1, mayLoad = 1, Defs = [R0D, R1D] in + def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; + +// Perform topology function. 
+let hasSideEffects = 1 in + def PTF : UnaryTiedRRE<"ptf", 0xB9A2, GR64>; + +// Perform cryptographic key management operation. +let Predicates = [FeatureMessageSecurityAssist3], + hasSideEffects = 1, Uses = [R0L, R1D] in + def PCKMO : SideEffectInherentRRE<"pckmo", 0xB928>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Supervisor call. +let hasSideEffects = 1, isCall = 1, Defs = [CC] in + def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; + +// Monitor call. +let hasSideEffects = 1, isCall = 1 in + def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; + +// Diagnose. +let hasSideEffects = 1, isCall = 1 in + def DIAG : SideEffectTernaryRS<"diag", 0x83, GR32, GR32>; + +// Trace. +let hasSideEffects = 1, mayLoad = 1 in { + def TRACE : SideEffectTernaryRS<"trace", 0x99, GR32, GR32>; + def TRACG : SideEffectTernaryRSY<"tracg", 0xEB0F, GR64, GR64>; +} + +// Trap. +let hasSideEffects = 1 in { + def TRAP2 : SideEffectInherentE<"trap2", 0x01FF>; + def TRAP4 : SideEffectAddressS<"trap4", 0xB2FF, null_frag>; +} + +// Signal processor. +let hasSideEffects = 1, Defs = [CC] in + def SIGP : SideEffectTernaryRS<"sigp", 0xAE, GR64, GR64>; + +// Signal adapter. +let hasSideEffects = 1, Uses = [R0D, R1D, R2D, R3D], Defs = [CC] in + def SIGA : SideEffectAddressS<"siga", 0xB274, null_frag>; + +// Start interpretive execution. +let hasSideEffects = 1, Defs = [CC] in + def SIE : SideEffectUnaryS<"sie", 0xB214, null_frag, 0>; + +//===----------------------------------------------------------------------===// +// CPU-Measurement Facility Instructions (SA23-2260). +//===----------------------------------------------------------------------===// + +// Load program parameter +let hasSideEffects = 1 in + def LPP : SideEffectUnaryS<"lpp", 0xB280, null_frag, 8>; + +// Extract coprocessor-group address. +let hasSideEffects = 1, Defs = [CC] in + def ECPGA : UnaryRRE<"ecpga", 0xB2ED, null_frag, GR32, GR64>; + +// Extract CPU counter. +let hasSideEffects = 1, Defs = [CC] in + def ECCTR : UnaryRRE<"ecctr", 0xB2E4, null_frag, GR64, GR64>; + +// Extract peripheral counter. +let hasSideEffects = 1, Defs = [CC] in + def EPCTR : UnaryRRE<"epctr", 0xB2E5, null_frag, GR64, GR64>; + +// Load CPU-counter-set controls. +let hasSideEffects = 1, Defs = [CC] in + def LCCTL : SideEffectUnaryS<"lcctl", 0xB284, null_frag, 8>; + +// Load peripheral-counter-set controls. +let hasSideEffects = 1, Defs = [CC] in + def LPCTL : SideEffectUnaryS<"lpctl", 0xB285, null_frag, 8>; + +// Load sampling controls. +let hasSideEffects = 1, Defs = [CC] in + def LSCTL : SideEffectUnaryS<"lsctl", 0xB287, null_frag, 0>; + +// Query sampling information. +let hasSideEffects = 1 in + def QSI : StoreInherentS<"qsi", 0xB286, null_frag, 0>; + +// Query counter information. +let hasSideEffects = 1 in + def QCTRI : StoreInherentS<"qctri", 0xB28E, null_frag, 0>; + +// Set CPU counter. +let hasSideEffects = 1, Defs = [CC] in + def SCCTR : SideEffectBinaryRRE<"scctr", 0xB2E0, GR64, GR64>; + +// Set peripheral counter. +let hasSideEffects = 1, Defs = [CC] in + def SPCTR : SideEffectBinaryRRE<"spctr", 0xB2E1, GR64, GR64>; + +//===----------------------------------------------------------------------===// +// I/O Instructions (Principles of Operation, Chapter 14). +//===----------------------------------------------------------------------===// + +// Clear subchannel. 
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def CSCH : SideEffectInherentS<"csch", 0xB230, null_frag>; + +// Halt subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def HSCH : SideEffectInherentS<"hsch", 0xB231, null_frag>; + +// Modify subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def MSCH : SideEffectUnaryS<"msch", 0xB232, null_frag, 0>; + +// Resume subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def RSCH : SideEffectInherentS<"rsch", 0xB238, null_frag>; + +// Start subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def SSCH : SideEffectUnaryS<"ssch", 0xB233, null_frag, 0>; + +// Store subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def STSCH : StoreInherentS<"stsch", 0xB234, null_frag, 0>; + +// Test subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def TSCH : StoreInherentS<"tsch", 0xB235, null_frag, 0>; + +// Cancel subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def XSCH : SideEffectInherentS<"xsch", 0xB276, null_frag>; + +// Reset channel path. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def RCHP : SideEffectInherentS<"rchp", 0xB23B, null_frag>; + +// Set channel monitor. +let hasSideEffects = 1, mayLoad = 1, Uses = [R1L, R2D] in + def SCHM : SideEffectInherentS<"schm", 0xB23C, null_frag>; + +// Store channel path status. +let hasSideEffects = 1 in + def STCPS : StoreInherentS<"stcps", 0xB23A, null_frag, 0>; + +// Store channel report word. +let hasSideEffects = 1, Defs = [CC] in + def STCRW : StoreInherentS<"stcrw", 0xB239, null_frag, 0>; + +// Test pending interruption. +let hasSideEffects = 1, Defs = [CC] in + def TPI : StoreInherentS<"tpi", 0xB236, null_frag, 0>; + +// Set address limit. +let hasSideEffects = 1, Uses = [R1L] in + def SAL : SideEffectInherentS<"sal", 0xB237, null_frag>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 47d2f75cc11a..36809ea81dc1 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -304,3 +304,13 @@ foreach I = 0-15 in { defm AR32 : SystemZRegClass<"AR32", [i32], 32, (add (sequence "A%u", 0, 15)), 0>; +// Control registers. 
+class CREG64<bits<16> num, string n> : SystemZReg<n> { + let HWEncoding = num; +} +foreach I = 0-15 in { + def C#I : CREG64<I, "c"#I>, DwarfRegNum<[!add(I, 32)]>; +} +defm CR64 : SystemZRegClass<"CR64", [i64], 64, + (add (sequence "C%u", 0, 15)), 0>; + diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 5f5f2f690e58..adc9f2976f87 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -353,6 +353,9 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; def : InstRW<[FXa], (instregex "ALGR(K)?$")>; def : InstRW<[FXa], (instregex "ALR(K)?$")>; def : InstRW<[FXa], (instregex "AR(K)?$")>; +def : InstRW<[FXa], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXa], (instregex "ALSIH(N)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; // Logical addition with carry @@ -376,6 +379,8 @@ def : InstRW<[FXa], (instregex "SLGF(I|R)$")>; def : InstRW<[FXa], (instregex "SLGR(K)?$")>; def : InstRW<[FXa], (instregex "SLR(K)?$")>; def : InstRW<[FXa], (instregex "SR(K)?$")>; +def : InstRW<[FXa], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>; @@ -506,6 +511,8 @@ def : InstRW<[FXb], (instregex "CLIH$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXb], (instregex "CLR$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXb], (instregex "C(L)?HHR$")>; +def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>; @@ -701,38 +708,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXb], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXb, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXb, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], - (instregex "STCK(F)?$")>; -def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], - (instregex "STCKE$")>; -def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXb, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1364,5 +1342,162 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], 
(instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXb], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], 
(instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat3], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, 
LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + } diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index 126eac2e2072..128049a09086 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -310,6 +310,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; +def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry @@ -333,6 +336,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>; def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -468,6 +473,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXU], (instregex "C(L)?HHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -634,37 +641,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXU], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXU, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>; -def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>; -def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXU, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1058,5 +1037,160 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : 
InstRW<[FXU, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXU, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXU, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXU, Lat30], (instregex "TB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXU, Lat30], (instregex "PR$")>; +def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXU, Lat20], (instregex 
"TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>; +def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>; +def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>; +def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXU, Lat30], (instregex "PTF$")>; +def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "SVC$")>; +def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU], (instregex "LPP$")>; +def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXU, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXU, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXU, Lat30], (instregex "SAL$")>; + } diff --git 
a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index d38ca64d2e9b..76b378454631 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -320,6 +320,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; +def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry @@ -343,6 +346,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>; def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -478,6 +483,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXU], (instregex "C(L)?HHR$")>; +def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>; @@ -672,37 +679,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXU], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXU, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>; -def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone], - (instregex "STCKE$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXU, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1102,5 +1081,161 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXU, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, 
Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXU, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXU, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXU, Lat30], (instregex "TB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXU, Lat30], (instregex "PR$")>; +def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXU, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>; +def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXU, Lat30], (instregex "PTF$")>; +def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "SVC$")>; +def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU], (instregex "LPP$")>; +def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXU, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXU, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXU, Lat30], (instregex "SAL$")>; + } diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index 0ab0c2f25915..eb4a0962f7eb 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ 
b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasPopulationCount(false), HasMessageSecurityAssist4(false), + HasPopulationCount(false), HasMessageSecurityAssist3(false), + HasMessageSecurityAssist4(false), HasResetReferenceBitsMultiple(false), HasFastSerialization(false), HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), HasExecutionHint(false), HasLoadAndTrap(false), HasTransactionalExecution(false), HasProcessorAssist(false), - HasDFPZonedConversion(false), + HasDFPZonedConversion(false), HasEnhancedDAT2(false), HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index be480f03c572..b05a1bb6cafd 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -39,7 +39,9 @@ protected: bool HasHighWord; bool HasFPExtension; bool HasPopulationCount; + bool HasMessageSecurityAssist3; bool HasMessageSecurityAssist4; + bool HasResetReferenceBitsMultiple; bool HasFastSerialization; bool HasInterlockedAccess1; bool HasMiscellaneousExtensions; @@ -48,6 +50,7 @@ protected: bool HasTransactionalExecution; bool HasProcessorAssist; bool HasDFPZonedConversion; + bool HasEnhancedDAT2; bool HasVector; bool HasLoadStoreOnCond2; bool HasLoadAndZeroRightmostByte; @@ -109,9 +112,18 @@ public: bool hasPopulationCount() const { return HasPopulationCount; } // Return true if the target has the message-security-assist + // extension facility 3. + bool hasMessageSecurityAssist3() const { return HasMessageSecurityAssist3; } + + // Return true if the target has the message-security-assist // extension facility 4. bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; } + // Return true if the target has the reset-reference-bits-multiple facility. + bool hasResetReferenceBitsMultiple() const { + return HasResetReferenceBitsMultiple; + } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } @@ -138,6 +150,9 @@ public: // Return true if the target has the DFP zoned-conversion facility. bool hasDFPZonedConversion() const { return HasDFPZonedConversion; } + // Return true if the target has the enhanced-DAT facility 2. + bool hasEnhancedDAT2() const { return HasEnhancedDAT2; } + // Return true if the target has the load-and-zero-rightmost-byte facility. bool hasLoadAndZeroRightmostByte() const { return HasLoadAndZeroRightmostByte; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 422c16b8eb62..ce5c57e0f519 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -238,7 +238,7 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -void SystemZTTIImpl::getUnrollingPreferences(Loop *L, +void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Find out if L contains a call, what the machine instruction count // estimate is, and how many stores there are. 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bdba7601eb78..6923fc6fc910 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -45,7 +45,8 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); /// @} diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 39cb1ca336f2..129794171464 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -57,17 +57,19 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops), } } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 -// Placemarkers to indicate the start or end of a block or loop scope. These -// use/clobber VALUE_STACK to prevent them from being moved into the middle of -// an expression tree. +// Placemarkers to indicate the start or end of a block, loop, or try scope. +// These use/clobber VALUE_STACK to prevent them from being moved into the +// middle of an expression tree. let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in { def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>; def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>; +def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>; -// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode -// in wasm. +// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same +// opcode in wasm. def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>; def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>; +def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>; let isTerminator = 1, isBarrier = 1 in def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] @@ -112,6 +114,20 @@ let isReturn = 1 in { def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>; +def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj), + [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj", + 0x08>; +def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj), + [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj", + 0x08>; +def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth", + 0x09>; + } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 } // Defs = [ARGUMENTS] + +// rethrow takes a relative depth as an argument, for which currently only 0 is +// possible for C++. Once other languages need depths other than 0, depths will +// be computed in CFGStackify. 
+def : Pat<(int_wasm_rethrow), (RETHROW 0)>; diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 947c0329bb6e..f0b6a3e35dba 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -897,7 +897,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { } } - // Look for orphan landingpads, can occur in blocks with no predecesors + // Look for orphan landingpads, can occur in blocks with no predecessors for (BasicBlock &BB : F) { Instruction *I = BB.getFirstNonPHI(); if (auto *LPI = dyn_cast<LandingPadInst>(I)) diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d30cc724c203..825f23dc52d9 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -49,8 +49,11 @@ static const char OpPrecedence[] = { 4, // IC_MINUS 5, // IC_MULTIPLY 5, // IC_DIVIDE - 6, // IC_RPAREN - 7, // IC_LPAREN + 5, // IC_MOD + 6, // IC_NOT + 7, // IC_NEG + 8, // IC_RPAREN + 9, // IC_LPAREN 0, // IC_IMM 0 // IC_REGISTER }; @@ -92,6 +95,9 @@ private: IC_MINUS, IC_MULTIPLY, IC_DIVIDE, + IC_MOD, + IC_NOT, + IC_NEG, IC_RPAREN, IC_LPAREN, IC_IMM, @@ -111,6 +117,10 @@ private: SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; SmallVector<ICToken, 4> PostfixStack; + bool isUnaryOperator(const InfixCalculatorTok Op) { + return Op == IC_NEG || Op == IC_NOT; + } + public: int64_t popOperand() { assert (!PostfixStack.empty() && "Poped an empty stack!"); @@ -192,6 +202,22 @@ private: ICToken Op = PostfixStack[i]; if (Op.first == IC_IMM || Op.first == IC_REGISTER) { OperandStack.push_back(Op); + } else if (isUnaryOperator(Op.first)) { + assert (OperandStack.size() > 0 && "Too few operands."); + ICToken Operand = OperandStack.pop_back_val(); + assert (Operand.first == IC_IMM && + "Unary operation with a register!"); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_NEG: + OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second)); + break; + case IC_NOT: + OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second)); + break; + } } else { assert (OperandStack.size() > 1 && "Too few operands."); int64_t Val; @@ -222,6 +248,12 @@ private: Val = Op1.second / Op2.second; OperandStack.push_back(std::make_pair(IC_IMM, Val)); break; + case IC_MOD: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Modulo operation with an immediate and a register!"); + Val = Op1.second % Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; case IC_OR: assert (Op1.first == IC_IMM && Op2.first == IC_IMM && "Or operation with an immediate and a register!"); @@ -271,6 +303,7 @@ private: IES_NOT, IES_MULTIPLY, IES_DIVIDE, + IES_MOD, IES_LBRAC, IES_RBRAC, IES_LPAREN, @@ -421,10 +454,16 @@ private: default: State = IES_ERROR; break; + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: case IES_PLUS: case IES_NOT: case IES_MULTIPLY: case IES_DIVIDE: + case IES_MOD: case IES_LPAREN: case IES_RPAREN: case IES_LBRAC: @@ -432,11 +471,12 @@ private: case IES_INTEGER: case IES_REGISTER: State = IES_MINUS; - // Only push the minus operator if it is not a unary operator. 
- if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || - CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || - CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + // push minus operator if it is not a negate operator + if (CurrState == IES_REGISTER || CurrState == IES_RPAREN || + CurrState == IES_INTEGER || CurrState == IES_RBRAC) IC.pushOperator(IC_MINUS); + else + IC.pushOperator(IC_NEG); if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with // a scale of 1. @@ -458,9 +498,21 @@ private: default: State = IES_ERROR; break; + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: case IES_PLUS: + case IES_MINUS: case IES_NOT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_MOD: + case IES_LPAREN: + case IES_LBRAC: State = IES_NOT; + IC.pushOperator(IC_NOT); break; } PrevState = CurrState; @@ -525,6 +577,7 @@ private: case IES_LSHIFT: case IES_RSHIFT: case IES_DIVIDE: + case IES_MOD: case IES_MULTIPLY: case IES_LPAREN: State = IES_INTEGER; @@ -539,26 +592,6 @@ private: } // Get the scale and replace the 'Register * Scale' with '0'. IC.popOperator(); - } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - CurrState == IES_MINUS) { - // Unary minus. No need to pop the minus operand because it was never - // pushed. - IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. - } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - CurrState == IES_NOT) { - // Unary not. No need to pop the not operand because it was never - // pushed. - IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. } else { IC.pushOperand(IC_IMM, TmpInt); } @@ -594,6 +627,19 @@ private: break; } } + void onMod() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_MOD; + IC.pushOperator(IC_MOD); + break; + } + } void onLBrac() { PrevState = State; switch (State) { @@ -647,18 +693,8 @@ private: case IES_RSHIFT: case IES_MULTIPLY: case IES_DIVIDE: + case IES_MOD: case IES_LPAREN: - // FIXME: We don't handle this type of unary minus or not, yet. 
- if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - (CurrState == IES_MINUS || CurrState == IES_NOT)) { - State = IES_ERROR; - break; - } State = IES_LPAREN; IC.pushOperator(IC_LPAREN); break; @@ -1302,6 +1338,8 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine SM.onXor(); else if (Name.equals_lower("and")) SM.onAnd(); + else if (Name.equals_lower("mod")) + SM.onMod(); else return false; return true; diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index caf98bffb80d..8f2017e990c5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -396,7 +396,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (!SB->getFragment()) { Asm.getContext().reportError( Fixup.getLoc(), - "symbol '" + B->getSymbol().getName() + + "symbol '" + SB->getName() + "' can not be undefined in a subtraction expression"); return false; } @@ -408,7 +408,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // pedantic compatibility with 'as'. Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; - Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + Value2 = Writer->getSymbolAddress(*SB, Layout); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } @@ -468,8 +468,8 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && - !is64Bit() && + const MCSymbolRefExpr *SymA = Target.getSymA(); + assert(SymA->getKind() == MCSymbolRefExpr::VK_TLVP && !is64Bit() && "Should only be called with a 32-bit TLVP relocation!"); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -480,15 +480,14 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, // subtraction from the picbase. For 32-bit pic the addend is the difference // between the picbase and the next address. For 32-bit static the addend is // zero. - if (Target.getSymB()) { + if (auto *SymB = Target.getSymB()) { // If this is a subtraction then we're pcrel. 
uint32_t FixupAddress = Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); IsPCRel = 1; - FixedValue = - FixupAddress - - Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) + - Target.getConstant(); + FixedValue = FixupAddress - + Writer->getSymbolAddress(SymB->getSymbol(), Layout) + + Target.getConstant(); FixedValue += 1ULL << Log2Size; } else { FixedValue = 0; @@ -499,8 +498,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, MRE.r_word0 = Value; MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); - Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(), - MRE); + Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE); } void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 5892f1de33ee..807f7a6ddb19 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, const MCAsmBackend &MAB) const { unsigned FixupKind = Fixup.getKind(); if (IsCrossSection) { - if (FixupKind != FK_Data_4) { + if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) { Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression"); return COFF::IMAGE_REL_AMD64_ADDR32; } diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index fe105298f5c1..7437ebacfac3 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -300,6 +300,8 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", "Intel Silvermont processors">; +def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", + "Intel Goldmont processors">; class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; @@ -430,6 +432,34 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. +class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelGLM, + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeaturePRFCHW, + FeatureCallRegIndirect, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF, + FeatureMPX, + FeatureSHA, + FeatureRDSEED, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT +]>; +def : GoldmontProc<"goldmont">; + // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ FeatureX87, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f777e5628988..b89914f8893e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5065,6 +5065,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +// Return true if the instruction zeroes the unused upper part of the +// destination and accepts mask. 
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { + switch (Opcode) { + default: + return false; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return true; + } +} + /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5097,6 +5111,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // 3. Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) + // If this node widens - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark this node as legal to enable replacing them with + // the v8i1 version of the previous instruction during instruction selection. + // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg, + // while zeroing all the upper remaining 60 bits of the register. if the + // result of such instruction is inserted into an allZeroVector, then we can + // safely remove insert_vector (in instruction selection) as the cmp instr + // already zeroed the rest of the register. + if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 && + (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) || + (SubVec.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode()))))) + return Op; + // extend to natively supported kshift MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; MVT WideOpVT = OpVT; @@ -7919,6 +7949,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } +// Return true if all the operands of the given CONCAT_VECTORS node are zeros +// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) +static bool isExpandWithZeros(const SDValue &Op) { + assert(Op.getOpcode() == ISD::CONCAT_VECTORS && + "Expand with zeros only possible in CONCAT_VECTORS nodes!"); + + for (unsigned i = 1; i < Op.getNumOperands(); i++) + if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) + return false; + + return true; +} + +// Returns true if the given node is a type promotion (by concatenating i1 +// zeros) of the result of a node that already zeros all upper bits of +// k-register. +static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { + unsigned Opc = Op.getOpcode(); + + assert(Opc == ISD::CONCAT_VECTORS && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected node to check for type promotion!"); + + // As long as we are concatenating zeros to the upper part of a previous node + // result, climb up the tree until a node with different opcode is + // encountered + while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { + if (Opc == ISD::INSERT_SUBVECTOR) { + if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && + Op.getConstantOperandVal(2) == 0) + Op = Op.getOperand(1); + else + return SDValue(); + } else { // Opc == ISD::CONCAT_VECTORS + if (isExpandWithZeros(Op)) + Op = Op.getOperand(0); + else + return SDValue(); + } + Opc = Op.getOpcode(); + } + + // Check if the first inserted node zeroes the upper bits, or an 'and' result + // of a node that zeros the upper bits (its masked version). 
+ if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || + (Op.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { + return Op; + } + + return SDValue(); +} + static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { @@ -7929,6 +8013,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + // If this node promotes - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark it as legal and catch the pattern in instruction + // selection to avoid emitting extra insturctions (for zeroing upper bits). + if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { + SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64); + SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, + ZeroC); + } + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { // Specialize the cases when all, or all but one, of the operands are undef. @@ -27012,6 +27107,9 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned InputSizeInBits = MaskVT.getSizeInBits(); + unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; + MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = false; APInt Zeroable(NumMaskElts, false); @@ -27027,7 +27125,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskVT.getScalarSizeInBits(), Mask, + MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; @@ -27043,10 +27141,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return SM_SentinelUndef <= M && M < (int)NumMaskElts; }) && "Expected unary shuffle"); - unsigned InputSizeInBits = MaskVT.getSizeInBits(); - unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size(); - MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - // Handle PSHUFLW/PSHUFHW repeated patterns. if (MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; @@ -35072,7 +35166,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. 
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB && + assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"); // Pseudo-legality check: getOnesVector() expects one of these types, so bail diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 01a70323224c..cc5c09cbf0e5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -185,6 +185,20 @@ def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, v2f64x_info>; +class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm, + ValueType _vt> { + RegisterClass KRC = _krc; + RegisterClass KRCWM = _krcwm; + ValueType KVT = _vt; +} + +def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>; +def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>; +def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>; +def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>; +def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>; +def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>; + // This multiclass generates the masking variants from the non-masking // variant. It only provides the assembly pieces for the masking variants. // It assumes custom ISel patterns for masking which can be provided as @@ -1735,17 +1749,217 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -} +multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> + : avx512_icmp_packed_lowering<_, 
NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +// VPCMPEQB - i8 +defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQBZ256", [HasBWI, HasVLX]>; + +// VPCMPEQW - i16 +defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ", [HasBWI]>; + +// VPCMPEQD - i32 +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ", [HasAVX512]>; + +// VPCMPEQQ - i64 +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; 
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; + +// VPCMPGTB - i8 +defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTBZ256", [HasBWI, HasVLX]>; + +// VPCMPGTW - i16 +defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ", [HasBWI]>; + +// VPCMPGTD - i32 +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ", [HasAVX512]>; + +// VPCMPGTQ - i64 +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm, + 
"VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { @@ -1908,6 +2122,237 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask, + _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> + : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +// VPCMPB - i8 +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm, + "VPCMPBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm, + "VPCMPBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm, + "VPCMPBZ256", [HasBWI, HasVLX]>; + +// VPCMPW - i16 +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, 
HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm, + "VPCMPWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm, + "VPCMPWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm, + "VPCMPWZ", [HasBWI]>; + +// VPCMPD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm, + "VPCMPDZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm, + "VPCMPDZ", [HasAVX512]>; + +// VPCMPQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; + +// VPCMPUB - i8 +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu, + "VPCMPUBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu, + "VPCMPUBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu, + "VPCMPUBZ256", [HasBWI, HasVLX]>; + +// VPCMPUW - i16 +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; + +defm : 
avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu, + "VPCMPUWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu, + "VPCMPUWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu, + "VPCMPUWZ", [HasBWI]>; + +// VPCMPUD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu, + "VPCMPUDZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu, + "VPCMPUDZ", [HasAVX512]>; + +// VPCMPUQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; + multiclass avx512_vcmp_common<X86VectorVTInfo _> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -1998,21 +2443,108 @@ defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VCMPPSZrri - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; 
-def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPUDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; +multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + string InstrStr, list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + string InstrStr, list<Predicate> Preds> + : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { + +let Predicates = Preds in + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; +} + + +// VCMPPS - f32 +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ", + [HasAVX512]>; + +// VCMPPD - f64 +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : 
avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ", + [HasAVX512]>; // ---------------------------------------------------------------- // FPClass @@ -2498,6 +3030,69 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> { +def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; +} + +multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr, + AVX512VLVectorVTInfo _> { +def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>; +} + // Mask setting all 0s or 1s multiclass 
avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { let Predicates = [HasAVX512] in diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index f98c2a7e802d..e34a90e975b8 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -75,6 +75,8 @@ private: bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, @@ -270,6 +272,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectMergeValues(I, MRI, MF)) + return true; if (selectExtract(I, MRI, MF)) return true; if (selectInsert(I, MRI, MF)) @@ -914,6 +918,55 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +bool X86InstructionSelector::selectMergeValues(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES) + return false; + + // Split to inserts. + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg0 = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg0); + unsigned SrcSize = SrcTy.getSizeInBits(); + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + // For the first src use insertSubReg. + unsigned DefReg = MRI.createGenericVirtualRegister(DstTy); + MRI.setRegBank(DefReg, RegBank); + if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF)) + return false; + + for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { + + unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); + MRI.setRegBank(Tmp, RegBank); + + MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::G_INSERT), Tmp) + .addReg(DefReg) + .addReg(I.getOperand(Idx).getReg()) + .addImm((Idx - 1) * SrcSize); + + DefReg = Tmp; + + if (!select(InsertInst)) + return false; + } + + MachineInstr &CopyInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::COPY), DstReg) + .addReg(DefReg); + + if (!select(CopyInst)) + return false; + + I.eraseFromParent(); + return true; +} InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index a584eabcc1b2..a5fa3340c3f1 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -56,7 +56,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL}) + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32}) setAction({BinOp, Ty}, Legal); @@ -117,7 +117,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL}) + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32, s64}) setAction({BinOp, Ty}, Legal); @@ -228,10 +228,14 @@ void 
X86LegalizerInfo::setLegalizerInfoAVX() { for (auto Ty : {v8s32, v4s64}) setAction({MemOp, Ty}, Legal); - for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) { setAction({G_INSERT, Ty}, Legal); - for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + } + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) { setAction({G_INSERT, 1, Ty}, Legal); + setAction({G_EXTRACT, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -280,10 +284,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { for (auto Ty : {v16s32, v8s64}) setAction({MemOp, Ty}, Legal); - for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) + for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) { setAction({G_INSERT, Ty}, Legal); - for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + } + for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) { setAction({G_INSERT, 1, Ty}, Legal); + setAction({G_EXTRACT, Ty}, Legal); + } /************ VLX *******************/ if (!Subtarget.hasVLX()) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index e36a47506ba0..24845beac22d 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -11,10 +11,23 @@ // //===----------------------------------------------------------------------===// +#include "X86.h" + +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "X86CallLowering.h" +#include "X86LegalizerInfo.h" +#include "X86RegisterBankInfo.h" +#endif #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#endif #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -336,6 +349,35 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct X86GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<InstructionSelector> InstSelector; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride) @@ -360,6 +402,19 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, setPICStyle(PICStyles::StubPIC); else if (isTargetELF()) setPICStyle(PICStyles::GOT); +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); + + GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM)); + + auto *RBI = new X86RegisterBankInfo(*getRegisterInfo()); + 
GISel->RegBankInfo.reset(RBI); + GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI)); +#endif + setGISelAccessor(*GISel); } const CallLowering *X86Subtarget::getCallLowering() const { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 550e95c39ab5..fa0afe29586b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -58,7 +58,7 @@ protected: }; enum X86ProcFamilyEnum { - Others, IntelAtom, IntelSLM + Others, IntelAtom, IntelSLM, IntelGLM }; /// X86 processor family: Intel Atom, and others diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index a9f42cacf788..8d891c983fab 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -15,9 +15,6 @@ #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL -#include "X86RegisterBankInfo.h" -#endif #include "X86MacroFusion.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" @@ -31,7 +28,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDepsFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -212,35 +208,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, X86TargetMachine::~X86TargetMachine() = default; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct X86GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - std::unique_ptr<InstructionSelector> InstSelector; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -280,20 +247,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { resetTargetOptions(F); I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, Options.StackAlignmentOverride); -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); - - GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this)); - - auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); - GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector( - *this, *I, *RBI)); -#endif - I->setGISelAccessor(*GISel); } return I.get(); } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 1bf267d34ec2..aaa6d58bd134 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,7 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis 
getTargetIRAnalysis() override; diff --git a/lib/Transforms/Coroutines/CoroInstr.h b/lib/Transforms/Coroutines/CoroInstr.h index 5c666bdfea1f..9a8cc5a2591c 100644 --- a/lib/Transforms/Coroutines/CoroInstr.h +++ b/lib/Transforms/Coroutines/CoroInstr.h @@ -58,10 +58,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_subfn_addr; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -70,10 +70,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_alloc; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -175,10 +175,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_id; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -187,10 +187,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_frame; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -203,10 +203,10 @@ public: Value *getFrame() const { return getArgOperand(FrameArg); } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_free; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -221,10 +221,10 @@ public: Value *getMem() const { return getArgOperand(MemArg); } // Methods for support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_begin; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -233,10 +233,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_save; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -254,10 +254,10 @@ public: } // Methods to support type 
inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_promise; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -279,10 +279,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_suspend; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -291,10 +291,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_size; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -310,10 +310,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_end; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 087a8aa2c624..5b1b58b89c32 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -56,10 +56,6 @@ RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, - cl::desc("Run the BB vectorization passes")); - -static cl::opt<bool> UseGVNAfterVectorization("use-gvn-after-vectorization", cl::init(false), cl::Hidden, cl::desc("Run GVN instead of Early CSE after vectorization passes")); @@ -138,8 +134,8 @@ static cl::opt<int> PreInlineThreshold( "(default = 75)")); static cl::opt<bool> EnableEarlyCSEMemSSA( - "enable-earlycse-memssa", cl::init(false), cl::Hidden, - cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)")); + "enable-earlycse-memssa", cl::init(true), cl::Hidden, + cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)")); static cl::opt<bool> EnableGVNHoist( "enable-gvn-hoist", cl::init(false), cl::Hidden, @@ -166,7 +162,6 @@ PassManagerBuilder::PassManagerBuilder() { Inliner = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; - BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; @@ -263,11 +258,12 @@ void PassManagerBuilder::populateFunctionPassManager( // Do PGO instrumentation generation or use pass as the option specified. void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { - if (!EnablePGOInstrGen && PGOInstrUse.empty()) + if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty()) return; // Perform the preinline and cleanup passes for O1 and above. // And avoid doing them if optimizing for size. 
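  // A condensed restatement of the gate below, as a sketch for clarity (the
  // sample-PGO rationale is an assumption, not stated here): pre-inlining runs
  // only for instrumentation PGO, at -O1 and above, and not when optimizing
  // for size; with sample PGO it is skipped, presumably so the code keeps the
  // shape the sample profile was collected against.
  //
  //   bool RunPreInline = OptLevel > 0 && SizeLevel == 0 &&
  //                       !DisablePreInliner && PGOSampleUse.empty();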
- if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) { + if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner && + PGOSampleUse.empty()) { // Create preinline pass. We construct an InlineParams object and specify // the threshold here to avoid the command line options of the regular // inliner to influence pre-inlining. The only fields of InlineParams we @@ -383,26 +379,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (RerollLoops) MPM.add(createLoopRerollPass()); - if (!RunSLPAfterLoopVectorization) { - if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); - } - } + if (!RunSLPAfterLoopVectorization && SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -634,28 +612,10 @@ void PassManagerBuilder::populateModulePassManager( addInstructionCombiningPass(MPM); } - if (RunSLPAfterLoopVectorization) { - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); + if (RunSLPAfterLoopVectorization && SLPVectorize) { + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (OptLevel > 1 && ExtraVectorizerPasses) { + MPM.add(createEarlyCSEPass()); } } diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 656421ee58df..ac4765f96075 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1484,7 +1484,8 @@ bool SampleProfileLoader::runOnFunction(Function &F) { PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ModuleAnalysisManager &AM) { - SampleProfileLoader SampleLoader(SampleProfileFile); + SampleProfileLoader SampleLoader( + ProfileFileName.empty() ? 
SampleProfileFile : ProfileFileName); SampleLoader.doInitialization(M); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 802f470ffe1f..8d494fe9cde2 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -371,6 +371,7 @@ void splitAndWriteThinLTOBitcode( /*GenerateHash=*/true, &ModHash); W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, &MergedMIndex); + W.writeSymtab(); W.writeStrtab(); OS << Buffer; @@ -385,6 +386,7 @@ void splitAndWriteThinLTOBitcode( /*GenerateHash=*/false, &ModHash); W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, &MergedMIndex); + W2.writeSymtab(); W2.writeStrtab(); *ThinLinkOS << Buffer; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index d3d8cefe9735..db98be2c98f5 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2301,10 +2301,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) { // (~B | A) ^ (~A | B) -> A ^ B // (~A | B) ^ (A | ~B) -> A ^ B // (B | ~A) ^ (A | ~B) -> A ^ B - if ((match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) || - (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B)))))) { + if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) { I.setOperand(0, A); I.setOperand(1, B); return &I; @@ -2314,10 +2314,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) { // (~B & A) ^ (~A & B) -> A ^ B // (~A & B) ^ (A & ~B) -> A ^ B // (B & ~A) ^ (A & ~B) -> A ^ B - if ((match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B)))) || - (match(Op0, m_c_And(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))) { + if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) { I.setOperand(0, A); I.setOperand(1, B); return &I; @@ -2456,10 +2456,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B + // not (cmp A, B) = !cmp A, B ICmpInst::Predicate Pred; - if (match(Op0, m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))) && - match(Op1, m_AllOnes())) { + if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) { cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred)); return replaceInstUsesWith(I, Op0); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index dbed7ad4eae8..3770021de100 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1985,7 +1985,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *X = nullptr; // bitreverse(bitreverse(x)) -> x - if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + if (match(IIOperand, m_BitReverse(m_Value(X)))) return replaceInstUsesWith(CI, X); break; } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp 
b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6ad32490a328..58b8b2f52629 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -112,10 +112,10 @@ static bool subWithOverflow(Constant *&Result, Constant *In1, /// Given an icmp instruction, return true if any use of this comparison is a /// branch on sign bit comparison. -static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) { +static bool hasBranchUse(ICmpInst &I) { for (auto *U : I.users()) if (isa<BranchInst>(U)) - return isSignBit; + return true; return false; } @@ -1448,12 +1448,13 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { // of a test and branch. So we avoid canonicalizing in such situations // because test and branch instruction has better branch displacement // than compare and branch instruction. - if (!isBranchOnSignBitCheck(Cmp, IsSignBit) && !Cmp.isEquality()) { - if (auto *AI = Intersection.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI)); - if (auto *AD = Difference.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD)); - } + if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp))) + return nullptr; + + if (auto *AI = Intersection.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI)); + if (auto *AD = Difference.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD)); } return nullptr; @@ -3301,12 +3302,12 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { return nullptr; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + const CmpInst::Predicate Pred = I.getPredicate(); Value *A, *B, *C, *D; if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) { if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0 Value *OtherVal = A == Op1 ? B : A; - return new ICmpInst(I.getPredicate(), OtherVal, - Constant::getNullValue(A->getType())); + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); } if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) { @@ -3316,26 +3317,25 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { Op1->hasOneUse()) { Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue()); Value *Xor = Builder->CreateXor(C, NC); - return new ICmpInst(I.getPredicate(), A, Xor); + return new ICmpInst(Pred, A, Xor); } // A^B == A^D -> B == D if (A == C) - return new ICmpInst(I.getPredicate(), B, D); + return new ICmpInst(Pred, B, D); if (A == D) - return new ICmpInst(I.getPredicate(), B, C); + return new ICmpInst(Pred, B, C); if (B == C) - return new ICmpInst(I.getPredicate(), A, D); + return new ICmpInst(Pred, A, D); if (B == D) - return new ICmpInst(I.getPredicate(), A, C); + return new ICmpInst(Pred, A, C); } } if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) { // A == (A^B) -> B == 0 Value *OtherVal = A == Op0 ? 
B : A; - return new ICmpInst(I.getPredicate(), OtherVal, - Constant::getNullValue(A->getType())); + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); } // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 @@ -3380,8 +3380,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { APInt Pow2 = Cst1->getValue() + 1; if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) - return new ICmpInst(I.getPredicate(), A, - Builder->CreateTrunc(B, A->getType())); + return new ICmpInst(Pred, A, Builder->CreateTrunc(B, A->getType())); } // (A >> C) == (B >> C) --> (A^B) u< (1 << C) @@ -3393,12 +3392,11 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { unsigned TypeBits = Cst1->getBitWidth(); unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); if (ShAmt < TypeBits && ShAmt != 0) { - ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE - ? ICmpInst::ICMP_UGE - : ICmpInst::ICMP_ULT; + ICmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted"); APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt); - return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal)); + return new ICmpInst(NewPred, Xor, Builder->getInt(CmpVal)); } } @@ -3412,8 +3410,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt); Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal), I.getName() + ".mask"); - return new ICmpInst(I.getPredicate(), And, - Constant::getNullValue(Cst1->getType())); + return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType())); } } @@ -3437,7 +3434,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { CmpV <<= ShAmt; Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV)); - return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV)); + return new ICmpInst(Pred, Mask, Builder->getInt(CmpV)); } } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1b0fe84dd4dd..87f11467b95e 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -131,11 +131,10 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { return true; // A vector of constant integers can be inverted easily. 
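  // For example, with an illustrative constant not taken from this change,
  //   <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  // inverts to
  //   <4 x i32> <i32 -2, i32 -3, i32 -4, i32 -5>
  // because ~x == -x - 1 in two's complement; the loop below merely walks the
  // elements to confirm the inversion is indeed free.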
- Constant *CV; - if (V->getType()->isVectorTy() && match(V, PatternMatch::m_Constant(CV))) { + if (V->getType()->isVectorTy() && isa<Constant>(V)) { unsigned NumElts = V->getType()->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = CV->getAggregateElement(i); + Constant *Elt = cast<Constant>(V)->getAggregateElement(i); if (!Elt) return false; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index ca370c73fca4..26bee204e5a4 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -661,6 +661,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (NumElements == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, Name)); } @@ -690,6 +693,10 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { Name + ".elt"); auto EltAlign = MinAlign(Align, SL->getElementOffset(i)); auto *L = IC.Builder->CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack"); + // Propagate AA metadata. It'll still be valid on the narrowed load. + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); V = IC.Builder->CreateInsertValue(V, L, i); } @@ -702,6 +709,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { auto NumElements = AT->getNumElements(); if (NumElements == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, Name)); } @@ -734,6 +744,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { Name + ".elt"); auto *L = IC.Builder->CreateAlignedLoad(Ptr, MinAlign(Align, Offset), Name + ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); V = IC.Builder->CreateInsertValue(V, L, i); Offset += EltSize; } @@ -1192,7 +1205,11 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { AddrName); auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); auto EltAlign = MinAlign(Align, SL->getElementOffset(i)); - IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + llvm::Instruction *NS = + IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + AAMDNodes AAMD; + SI.getAAMetadata(AAMD); + NS->setAAMetadata(AAMD); } return true; @@ -1239,7 +1256,10 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { AddrName); auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); auto EltAlign = MinAlign(Align, Offset); - IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + Instruction *NS = IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + AAMDNodes AAMD; + SI.getAAMetadata(AAMD); + NS->setAAMetadata(AAMD); Offset += EltSize; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 365c4ba75154..579639a6194e 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -227,8 +227,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); if (I.hasNoSignedWrap()) { - uint64_t 
V; - if (match(NewCst, m_ConstantInt(V)) && V != Width - 1) + const APInt *V; + if (match(NewCst, m_APInt(V)) && *V != Width - 1) Shl->setHasNoSignedWrap(); } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 33951e66497a..80c6595904e1 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1167,6 +1167,23 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = canonicalizeSelectToShuffle(SI)) return I; + // Canonicalize a one-use integer compare with a non-canonical predicate by + // inverting the predicate and swapping the select operands. This matches a + // compare canonicalization for conditional branches. + // TODO: Should we do the same for FP compares? + CmpInst::Predicate Pred; + if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) && + !isCanonicalPredicate(Pred)) { + // Swap true/false values and condition. + CmpInst *Cond = cast<CmpInst>(CondVal); + Cond->setPredicate(CmpInst::getInversePredicate(Pred)); + SI.setOperand(1, FalseVal); + SI.setOperand(2, TrueVal); + SI.swapProfMetadata(); + Worklist.Add(Cond); + return &SI; + } + if (SelType->getScalarType()->isIntegerTy(1) && TrueVal->getType() == CondVal->getType()) { if (match(TrueVal, m_One())) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 02fac4fb37a4..723414635d6f 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2425,9 +2425,15 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { Builder->SetInsertPoint(L); Value *GEP = Builder->CreateInBoundsGEP(L->getType(), L->getPointerOperand(), Indices); + Instruction *NL = Builder->CreateLoad(GEP); + // Whatever aliasing information we had for the orignal load must also + // hold for the smaller load, so propagate the annotations. + AAMDNodes Nodes; + L->getAAMetadata(Nodes); + NL->setAAMetadata(Nodes); // Returning the load directly will cause the main loop to insert it in // the wrong spot, so use replaceInstUsesWith(). - return replaceInstUsesWith(EV, Builder->CreateLoad(GEP)); + return replaceInstUsesWith(EV, NL); } // We could simplify extracts from other values. Note that nested extracts may // already be simplified implicitly by the above: extract (extract (insert) ) diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index c3810366bf22..a49c9b68c97d 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -38,6 +38,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -340,6 +341,49 @@ void ConstantHoistingPass::collectConstantCandidates( } } + +/// \brief Check the operand for instruction Inst at index Idx. +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) { + Value *Opnd = Inst->getOperand(Idx); + + // Visit constant integers. + if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + + // Visit cast instructions that have constant integers. 
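The new canonicalization in visitSelectInst shown above flips a non-canonical integer-compare predicate and swaps the select's true/false operands (updating profile metadata via swapProfMetadata). The rewrite is an identity: inverting the compare and exchanging the arms yields the same value. A tiny standalone check, with the sgt/sle pair chosen arbitrarily as an example predicate and its inverse (illustrative only):

#include <cassert>

int main() {
  // (x > y) ? a : b  must equal  (x <= y) ? b : a  for every input.
  for (int x = -3; x <= 3; ++x)
    for (int y = -3; y <= 3; ++y) {
      const int a = 10, b = 20;
      assert(((x > y) ? a : b) == ((x <= y) ? b : a));
    }
  return 0;
}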
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) { + // Only visit cast instructions, which have been skipped. All other + // instructions should have already been visited. + if (!CastInst->isCast()) + return; + + if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the cast instruction. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + } + + // Visit constant expressions that have constant integers. + if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { + // Only visit constant cast expressions. + if (!ConstExpr->isCast()) + return; + + if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the constant expression. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + } +} + + /// \brief Scan the instruction for expensive integer constants and record them /// in the constant candidate vector. void ConstantHoistingPass::collectConstantCandidates( @@ -365,44 +409,25 @@ void ConstantHoistingPass::collectConstantCandidates( if (AI && AI->isStaticAlloca()) return; - // Scan all operands. - for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { - Value *Opnd = Inst->getOperand(Idx); - - // Visit constant integers. - if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } - - // Visit cast instructions that have constant integers. - if (auto CastInst = dyn_cast<Instruction>(Opnd)) { - // Only visit cast instructions, which have been skipped. All other - // instructions should have already been visited. - if (!CastInst->isCast()) - continue; - - if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the cast instruction. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; + // Constants in GEPs that index into a struct type should not be hoisted. + if (isa<GetElementPtrInst>(Inst)) { + gep_type_iterator GTI = gep_type_begin(Inst); + + // Collect constant for first operand. + collectConstantCandidates(ConstCandMap, Inst, 0); + // Scan rest operands. + for (unsigned Idx = 1, E = Inst->getNumOperands(); Idx != E; ++Idx, ++GTI) { + // Only collect constants that index into a non struct type. + if (!GTI.isStruct()) { + collectConstantCandidates(ConstCandMap, Inst, Idx); } } + return; + } - // Visit constant expressions that have constant integers. - if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { - // Only visit constant cast expressions. - if (!ConstExpr->isCast()) - continue; - - if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the constant expression. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } - } + // Scan all operands. 
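The getelementptr special case above deliberately leaves struct field indices out of the hoisting candidates: a field index selects a fixed offset in the type's layout and must remain an immediate constant, whereas indices into array-like members are ordinary address arithmetic and can live in a register. A rough source-level analogy (the struct and function below are invented for illustration):

#include <cstddef>
#include <cstdio>

struct S { int a; double b; int c[16]; };

int load(const S *p, long i) {
  // The ".b" access uses a struct index: its byte offset, offsetof(S, b), is a
  // compile-time constant and cannot be supplied from a hoisted variable.
  // The "c[i]" access uses an array index: "i" is plain arithmetic and may be
  // any runtime value.
  return static_cast<int>(p->b) + p->c[i];
}

int main() {
  S s{1, 2.5, {0}};
  s.c[3] = 7;
  std::printf("offsetof(S, b) = %zu, load = %d\n", offsetof(S, b), load(&s, 3));
  return 0;
}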
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + collectConstantCandidates(ConstCandMap, Inst, Idx); } // end of for all operands } diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 2f96c3064b86..a40c22c3fce9 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -917,7 +917,6 @@ LoopConstrainer::calculateSubRanges() const { // I think we can be more aggressive here and make this nuw / nsw if the // addition that feeds into the icmp for the latch's terminating branch is nuw // / nsw. In any case, a wrapping 2's complement addition is safe. - ConstantInt *One = ConstantInt::get(Ty, 1); const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); @@ -948,8 +947,9 @@ LoopConstrainer::calculateSubRanges() const { // will be an empty range. Returning an empty range is always safe. // - Smallest = SE.getAddExpr(End, SE.getSCEV(One)); - Greatest = SE.getAddExpr(Start, SE.getSCEV(One)); + const SCEV *One = SE.getOne(Ty); + Smallest = SE.getAddExpr(End, One); + Greatest = SE.getAddExpr(Start, One); } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 62aa6ee48069..530a68424d5c 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -131,7 +131,7 @@ static const unsigned NoThreshold = UINT_MAX; /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( - Loop *L, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, Optional<bool> UserUpperBound) { @@ -158,7 +158,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.AllowPeeling = true; // Override with any target specific settings - TTI.getUnrollingPreferences(L, UP); + TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes if (L->getHeader()->getParent()->optForSize()) { @@ -699,7 +699,7 @@ static uint64_t getUnrolledLoopSize( // Calculates unroll count and writes it to UP.Count. static bool computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, + ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -770,7 +770,7 @@ static bool computeUnrollCount( // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. 
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, *SE, TTI, + L, FullUnrollTripCount, DT, SE, TTI, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -836,6 +836,8 @@ static bool computeUnrollCount( } else { UP.Count = TripCount; } + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) ORE->emit( @@ -926,7 +928,7 @@ static bool computeUnrollCount( } static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo &TTI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, Optional<unsigned> ProvidedCount, @@ -948,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); + L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) return false; @@ -977,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) ExitingBlock = L->getExitingBlock(); if (ExitingBlock) { - TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + TripCount = SE.getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock); } // If the loop contains a convergent operation, the prelude we'd add @@ -1000,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // count. bool MaxOrZero = false; if (!TripCount) { - MaxTripCount = SE->getSmallConstantMaxTripCount(L); - MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); + MaxTripCount = SE.getSmallConstantMaxTripCount(L); + MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -1030,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // Unroll the loop. 
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE, + TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; @@ -1073,7 +1075,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1157,7 +1159,7 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, if (!AllowPartialUnrolling) AllowPartialParam = RuntimeParam = UpperBoundParam = false; bool Changed = tryToUnrollLoop( - &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, + &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam); if (!Changed) diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 7a7624f77542..9cf01c6582b5 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -2423,8 +2423,7 @@ void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB, AllTempInstructions.insert(Op); PHIOfOpsPHIs[BB].push_back(Op); TempToBlock[Op] = BB; - if (ExistingValue) - RealToTemp[ExistingValue] = Op; + RealToTemp[ExistingValue] = Op; } static bool okayForPHIOfOps(const Instruction *I) { diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 6da551bd7efd..cdba0062953f 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -1894,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) { Op = Op->user_back(); RedoInsts.insert(Op); } + + MadeChange = true; } // Canonicalize expressions of the following form: diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index a52739bb76f7..a73e9aec0617 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1954,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS, // to identify the newly generated AlternateRootPhi (.base version of phi) // and RootOfChain (the original phi node itself) are the same, so that we // can rematerialize the gep and casts. This is a workaround for the - // deficieny in the findBasePointer algorithm. + // deficiency in the findBasePointer algorithm. if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi)) continue; // Now that the phi nodes are proved to be the same, assert that diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 80fbbeb6829b..4729f4ef5956 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -2402,9 +2402,20 @@ private: if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + // Any !nonnull metadata or !range metadata on the old load is also valid + // on the new load. This is even true in some cases even when the loads + // are different types, for example by mapping !nonnull metadata to + // !range metadata by modeling the null pointer constant converted to the + // integer type. + // FIXME: Add support for range metadata here. 
Currently the utilities + // for this don't propagate range metadata in trivial cases from one + // integer load to another, don't handle non-addrspace-0 null pointers + // correctly, and don't have any support for mapping ranges as the + // integer type becomes winder or narrower. + if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull)) + copyNonnullMetadata(LI, N, *NewLI); + // Try to preserve nonnull metadata - if (TargetTy->isPointerTy()) - NewLI->copyMetadata(LI, LLVMContext::MD_nonnull); V = NewLI; // If this is an integer load past the end of the slice (which means the @@ -3580,10 +3591,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { int Idx = 0, Size = Offsets.Splits.size(); for (;;) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); - auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + auto AS = LI->getPointerAddressSpace(); + auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, BasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), + APInt(DL.getPointerSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 5d57ed9718fb..30d8856cfbef 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -59,6 +59,33 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) { // Landing pads must be in the function where they were inserted for cleanup. if (BB.isEHPad()) return false; + // taking the address of a basic block moved to another function is illegal + if (BB.hasAddressTaken()) + return false; + + // don't hoist code that uses another basicblock address, as it's likely to + // lead to unexpected behavior, like cross-function jumps + SmallPtrSet<User const *, 16> Visited; + SmallVector<User const *, 16> ToVisit; + + for (Instruction const &Inst : BB) + ToVisit.push_back(&Inst); + + while (!ToVisit.empty()) { + User const *Curr = ToVisit.pop_back_val(); + if (!Visited.insert(Curr).second) + continue; + if (isa<BlockAddress const>(Curr)) + return false; // even a reference to self is likely to be not compatible + + if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB) + continue; + + for (auto const &U : Curr->operands()) { + if (auto *UU = dyn_cast<User>(U)) + ToVisit.push_back(UU); + } + } // Don't hoist code containing allocas, invokes, or vastarts. for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 5f85e17927fa..9ad2b707e6b2 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -36,6 +36,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <algorithm> @@ -45,6 +46,10 @@ using namespace llvm; STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); +static cl::opt<bool> UnrollRuntimeMultiExit( + "unroll-runtime-multi-exit", cl::init(false), cl::Hidden, + cl::desc("Allow runtime unrolling for loops with multiple exits, when " + "epilog is generated")); /// Connect the unrolling prolog code to the original loop. 
/// The unrolling prolog code contains code to execute the @@ -285,15 +290,13 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// If loop structure is cloned InsertTop should be new preheader, InsertBot /// new loop exit. -/// -static void CloneLoopBlocks(Loop *L, Value *NewIter, - const bool CreateRemainderLoop, - const bool UseEpilogRemainder, - BasicBlock *InsertTop, BasicBlock *InsertBot, - BasicBlock *Preheader, - std::vector<BasicBlock *> &NewBlocks, - LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI) { +/// Return the new cloned loop that is created when CreateRemainderLoop is true. +static Loop * +CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, + const bool UseEpilogRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, + std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, + ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -418,7 +421,10 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); NewLoop->setLoopID(NewLoopID); + return NewLoop; } + else + return nullptr; } /// Insert code in the prolog/epilog code when unrolling a loop with a @@ -465,29 +471,52 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit - if (!L->getExitingBlock()) + if (!UnrollRuntimeMultiExit && !L->getExitingBlock()) return false; - // Make sure the loop is in canonical form, and there is a single - // exit block only. + // Make sure the loop is in canonical form. if (!L->isLoopSimplifyForm()) return false; // Guaranteed by LoopSimplifyForm. BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop - if (!LatchExit) + if (!LatchExit && !UnrollRuntimeMultiExit) return false; + // These are exit blocks other than the target of the latch exiting block. + SmallVector<BasicBlock *, 4> OtherExits; + BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); + unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the - // targets of the Latch be the single exit block out of the loop. This needs + // targets of the Latch be an exit block out of the loop. This needs // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. - BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); - assert((LatchBR->getSuccessor(0) == LatchExit || - LatchBR->getSuccessor(1) == LatchExit) && - "one of the loop latch successors should be " - "the exit block!"); - (void)LatchBR; + assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) && + "one of the loop latch successors should be the exit block!"); + // Support runtime unrolling for multiple exit blocks and multiple exiting + // blocks. 
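For readers following the remainder-loop machinery here, the shape that epilog-style runtime unrolling produces can be sketched at the source level as a main loop unrolled by the chosen count plus a small loop that executes the leftover iterations. The function below only illustrates that shape for an unroll factor of 4 (the names and the saxpy kernel are made up; this is not the IR the utility emits):

#include <cstddef>

void saxpy_unrolled(float *x, const float *y, float a, size_t n) {
  size_t i = 0;
  const size_t main_iters = n - (n % 4);   // trip count of the unrolled main loop
  // Unrolled main loop: runs while at least 4 iterations remain.
  for (; i < main_iters; i += 4) {
    x[i + 0] += a * y[i + 0];
    x[i + 1] += a * y[i + 1];
    x[i + 2] += a * y[i + 2];
    x[i + 3] += a * y[i + 3];
  }
  // Epilog remainder loop: executes the remaining n % 4 iterations.
  for (; i < n; ++i)
    x[i] += a * y[i];
}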
+ if (!LatchExit) { + assert(UseEpilogRemainder && "Multi exit unrolling is currently supported " + "unrolling with epilog remainder only!"); + LatchExit = LatchBR->getSuccessor(ExitIndex); + // We rely on LCSSA form being preserved when the exit blocks are + // transformed. + if (!PreserveLCSSA) + return false; + // TODO: Support multiple exiting blocks jumping to the `LatchExit`. This + // will need updating the logic in connectEpilog. + if (!LatchExit->getSinglePredecessor()) + return false; + SmallVector<BasicBlock *, 4> Exits; + L->getUniqueExitBlocks(Exits); + for (auto *BB : Exits) + if (BB != LatchExit) + OtherExits.push_back(BB); + } + + assert(LatchExit && "Latch Exit should exist!"); + // Use Scalar Evolution to compute the trip count. This allows more loops to // be unrolled than relying on induction var simplification. if (!SE) @@ -495,7 +524,11 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Only unroll loops with a computable trip count, and the trip count needs // to be an int value (allowing a pointer type is a TODO item). - const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + // We calculate the backedge count by using getExitCount on the Latch block, + // which is proven to be the only exiting block in this loop. This is same as + // calculating getBackedgeTakenCount on the loop (which computes SCEV for all + // exiting blocks). + const SCEV *BECountSC = SE->getExitCount(L, Latch); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) return false; @@ -508,7 +541,6 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; - BasicBlock *Header = L->getHeader(); BasicBlock *PreHeader = L->getLoopPreheader(); BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); @@ -650,8 +682,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, - InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); + Loop *remainderLoop = CloneLoopBlocks( + L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, + NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); // Insert the cloned blocks into the function. F->getBasicBlockList().splice(InsertBot->getIterator(), @@ -659,6 +692,42 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, NewBlocks[0]->getIterator(), F->end()); + // Now the loop blocks are cloned and the other exiting blocks from the + // remainder are connected to the original Loop's exit blocks. The remaining + // work is to update the phi nodes in the original loop, and take in the + // values from the cloned region. Also update the dominator info for + // OtherExits, since we have new edges into OtherExits. + for (auto *BB : OtherExits) { + for (auto &II : *BB) { + + // Given we preserve LCSSA form, we know that the values used outside the + // loop will be used through these phi nodes at the exit blocks that are + // transformed below. + if (!isa<PHINode>(II)) + break; + PHINode *Phi = cast<PHINode>(&II); + unsigned oldNumOperands = Phi->getNumIncomingValues(); + // Add the incoming values from the remainder code to the end of the phi + // node. 
+ for (unsigned i =0; i < oldNumOperands; i++){ + Value *newVal = VMap[Phi->getIncomingValue(i)]; + if (!newVal) { + assert(isa<Constant>(Phi->getIncomingValue(i)) && + "VMap should exist for all values except constants!"); + newVal = Phi->getIncomingValue(i); + } + Phi->addIncoming(newVal, + cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)])); + } + } + // Update the dominator info because the immediate dominator is no longer the + // header of the original Loop. BB has edges both from L and remainder code. + // Since the preheader determines which loop is run (L or directly jump to + // the remainder code), we set the immediate dominator as the preheader. + if (DT) + DT->changeImmediateDominator(BB, PreHeader); + } + // Loop structure should be the following: // Epilog Prolog // @@ -721,6 +790,19 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); + // Canonicalize to LoopSimplifyForm both original and remainder loops. We + // cannot rely on the LoopUnrollPass to do this because it only does + // canonicalization for parent/subloops and not the sibling loops. + if (OtherExits.size() > 0) { + // Generate dedicated exit blocks for the original loop, to preserve + // LoopSimplifyForm. + formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA); + // Generate dedicated exit blocks for the remainder loop if one exists, to + // preserve LoopSimplifyForm. + if (remainderLoop) + formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA); + } + NumRuntimeUnrolled++; return true; } diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 0ed33945ef40..58b70be95d99 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -528,8 +528,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, return false; } -bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, - DominatorTree *DT) { +bool RecurrenceDescriptor::isFirstOrderRecurrence( + PHINode *Phi, Loop *TheLoop, + DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) { // Ensure the phi node is in the loop header and has two incoming values. if (Phi->getParent() != TheLoop->getHeader() || @@ -551,12 +552,24 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, // Get the previous value. The previous value comes from the latch edge while // the initial value comes form the preheader edge. auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch)); - if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous)) + if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) || + SinkAfter.count(Previous)) // Cannot rely on dominance due to motion. return false; // Ensure every user of the phi node is dominated by the previous value. // The dominance requirement ensures the loop vectorizer will not need to // vectorize the initial value prior to the first iteration of the loop. + // TODO: Consider extending this sinking to handle other kinds of instructions + // and expressions, beyond sinking a single cast past Previous. 
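The SinkAfter map introduced above lets the vectorizer accept a first-order recurrence whose header phi has a single cast as its only user, by recording that the cast should be moved to sit after the instruction that produces the next value of the recurrence. At the source level, a first-order recurrence simply carries the previous iteration's value; a made-up example with such a widening cast (illustrative only, not code from the patch):

#include <cstddef>
#include <cstdint>

void delta_encode(int16_t *out, const int16_t *in, size_t n) {
  int16_t prev = 0;                      // loop-carried value of the recurrence
  for (size_t i = 0; i < n; ++i) {
    int32_t prev_wide = prev;            // the single widening-cast user of "prev"
    out[i] = static_cast<int16_t>(in[i] - prev_wide);
    prev = in[i];                        // next value of the recurrence ("Previous")
  }
}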
+ if (Phi->hasOneUse()) { + auto *I = Phi->user_back(); + if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() && + DT->dominates(Previous, I->user_back())) { + SinkAfter[I] = Previous; + return true; + } + } + for (User *U : Phi->users()) if (auto *I = dyn_cast<Instruction>(U)) { if (!DT->dominates(Previous, I)) diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 0a51f9a0e4a2..1c2a60a6b8b2 100644 --- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -27,7 +27,6 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore, BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB); - OrigBB->getTerminator()->setSuccessor(0, LoopBB); IRBuilder<> Builder(OrigBB->getTerminator()); // SrcAddr and DstAddr are expected to be pointer types, @@ -39,6 +38,11 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore, SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS)); DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS)); + Builder.CreateCondBr( + Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB, + LoopBB); + OrigBB->getTerminator()->eraseFromParent(); + IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); @@ -167,6 +171,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, Value *CopyLen, Value *SetValue, unsigned Align, bool IsVolatile) { + Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); Function *F = OrigBB->getParent(); BasicBlock *NewBB = @@ -174,7 +179,6 @@ static void createMemSetLoop(Instruction *InsertBefore, BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB); - OrigBB->getTerminator()->setSuccessor(0, LoopBB); IRBuilder<> Builder(OrigBB->getTerminator()); // Cast pointer to the type of value getting stored @@ -182,9 +186,14 @@ static void createMemSetLoop(Instruction *InsertBefore, DstAddr = Builder.CreateBitCast(DstAddr, PointerType::get(SetValue->getType(), dstAS)); + Builder.CreateCondBr( + Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB, + LoopBB); + OrigBB->getTerminator()->eraseFromParent(); + IRBuilder<> LoopBuilder(LoopBB); - PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0); - LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); LoopBuilder.CreateStore( SetValue, @@ -192,7 +201,7 @@ static void createMemSetLoop(Instruction *InsertBefore, IsVolatile); Value *NewIndex = - LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1)); + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); LoopIndex->addIncoming(NewIndex, LoopBB); LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, diff --git a/lib/Transforms/Utils/OrderedInstructions.cpp b/lib/Transforms/Utils/OrderedInstructions.cpp index 2e67e0def5b9..dc780542ce68 100644 --- a/lib/Transforms/Utils/OrderedInstructions.cpp +++ b/lib/Transforms/Utils/OrderedInstructions.cpp @@ -27,7 +27,6 @@ bool OrderedInstructions::dominates(const Instruction *InstA, if (OBB == OBBMap.end()) OBB = OBBMap.insert({IBB, 
make_unique<OrderedBasicBlock>(IBB)}).first; return OBB->second->dominates(InstA, InstB); - } else { - return DT->dominates(InstA->getParent(), InstB->getParent()); } + return DT->dominates(InstA->getParent(), InstB->getParent()); } diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 1260e35e934d..d4cdaede6b86 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -34,6 +33,7 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/OrderedInstructions.h" #include <algorithm> #define DEBUG_TYPE "predicateinfo" using namespace llvm; @@ -106,14 +106,27 @@ struct ValueDFS { bool EdgeOnly = false; }; +// Perform a strict weak ordering on instructions and arguments. +static bool valueComesBefore(OrderedInstructions &OI, const Value *A, + const Value *B) { + auto *ArgA = dyn_cast_or_null<Argument>(A); + auto *ArgB = dyn_cast_or_null<Argument>(B); + if (ArgA && !ArgB) + return true; + if (ArgB && !ArgA) + return false; + if (ArgA && ArgB) + return ArgA->getArgNo() < ArgB->getArgNo(); + return OI.dominates(cast<Instruction>(A), cast<Instruction>(B)); +} + // This compares ValueDFS structures, creating OrderedBasicBlocks where // necessary to compare uses/defs in the same block. Doing so allows us to walk // the minimum number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { - DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap; - ValueDFS_Compare( - DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap) - : OBBMap(OBBMap) {} + OrderedInstructions &OI; + ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {} + bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) return false; @@ -196,23 +209,12 @@ struct ValueDFS_Compare { auto *ArgA = dyn_cast_or_null<Argument>(ADef); auto *ArgB = dyn_cast_or_null<Argument>(BDef); - if (ArgA && !ArgB) - return true; - if (ArgB && !ArgA) - return false; - if (ArgA && ArgB) - return ArgA->getArgNo() < ArgB->getArgNo(); + if (ArgA || ArgB) + return valueComesBefore(OI, ArgA, ArgB); auto *AInst = getDefOrUser(ADef, A.U); auto *BInst = getDefOrUser(BDef, B.U); - - auto *BB = AInst->getParent(); - auto LookupResult = OBBMap.find(BB); - if (LookupResult != OBBMap.end()) - return LookupResult->second->dominates(AInst, BInst); - - auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)}); - return Result.first->second->dominates(AInst, BInst); + return valueComesBefore(OI, AInst, BInst); } }; @@ -547,38 +549,11 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) { // Sort OpsToRename since we are going to iterate it. SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end()); - std::sort(OpsToRename.begin(), OpsToRename.end(), [&](const Value *A, - const Value *B) { - auto *ArgA = dyn_cast_or_null<Argument>(A); - auto *ArgB = dyn_cast_or_null<Argument>(B); - - // If A and B are args, order them based on their arg no. 
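valueComesBefore above defines a strict weak order over arguments and instructions: every argument sorts before every instruction, arguments sort by argument number, and instructions fall back to OrderedInstructions::dominates. A self-contained model of that ordering, using an invented Item record in place of real Values (illustrative only):

#include <algorithm>
#include <cassert>
#include <vector>

struct Item {
  bool IsArg;        // function argument (true) or instruction (false)
  unsigned ArgNo;    // meaningful only when IsArg is true
  unsigned InstPos;  // stand-in for a dominance-compatible instruction numbering
};

static bool comesBefore(const Item &A, const Item &B) {
  if (A.IsArg && !B.IsArg)
    return true;                          // arguments sort before instructions
  if (B.IsArg && !A.IsArg)
    return false;
  if (A.IsArg && B.IsArg)
    return A.ArgNo < B.ArgNo;             // arguments ordered by argument number
  return A.InstPos < B.InstPos;           // stand-in for OI.dominates(A, B)
}

int main() {
  std::vector<Item> Ops = {{false, 0, 7}, {true, 1, 0}, {false, 0, 2}, {true, 0, 0}};
  std::sort(Ops.begin(), Ops.end(), comesBefore);
  assert(Ops[0].IsArg && Ops[0].ArgNo == 0);
  assert(Ops[1].IsArg && Ops[1].ArgNo == 1);
  assert(!Ops[2].IsArg && Ops[2].InstPos == 2);
  assert(!Ops[3].IsArg && Ops[3].InstPos == 7);
  return 0;
}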
- if (ArgA && !ArgB) - return true; - if (ArgB && !ArgA) - return false; - if (ArgA && ArgB) - return ArgA->getArgNo() < ArgB->getArgNo(); - - // Else, A are B are instructions. - // If they belong to different BBs, order them by the dominance of BBs. - auto *AInst = cast<Instruction>(A); - auto *BInst = cast<Instruction>(B); - if (AInst->getParent() != BInst->getParent()) - return DT.dominates(AInst->getParent(), BInst->getParent()); - - // Else, A and B belong to the same BB. - // Order A and B by their dominance. - auto *BB = AInst->getParent(); - auto LookupResult = OBBMap.find(BB); - if (LookupResult != OBBMap.end()) - return LookupResult->second->dominates(AInst, BInst); - - auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)}); - return Result.first->second->dominates(AInst, BInst); - }); - - ValueDFS_Compare Compare(OBBMap); + auto Comparator = [&](const Value *A, const Value *B) { + return valueComesBefore(OI, A, B); + }; + std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator); + ValueDFS_Compare Compare(OI); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { unsigned Counter = 0; @@ -715,7 +690,7 @@ PredicateInfo::getValueInfo(Value *Operand) const { PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT, AssumptionCache &AC) - : F(F), DT(DT), AC(AC) { + : F(F), DT(DT), AC(AC), OI(&DT) { // Push an empty operand info so that we can detect 0 as not finding one ValueInfos.resize(1); buildPredicateInfo(); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 0970c436e665..e724b0a28c32 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -4781,7 +4781,7 @@ public: SwitchLookupTable( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL); + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName); /// Build instructions with Builder to retrieve the value at /// the position given by Index in the lookup table. @@ -4835,7 +4835,7 @@ private: SwitchLookupTable::SwitchLookupTable( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL) + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr), LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) { assert(Values.size() && "Can't build lookup table without values!"); @@ -4943,7 +4943,7 @@ SwitchLookupTable::SwitchLookupTable( Array = new GlobalVariable(M, ArrayTy, /*constant=*/true, GlobalVariable::PrivateLinkage, Initializer, - "switch.table"); + "switch.table." + FuncName); Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); Kind = ArrayKind; } @@ -5333,7 +5333,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If using a bitmask, use any value to fill the lookup table holes. Constant *DV = NeedMask ? 
ResultLists[PHI][0].second : DefaultResults[PHI]; - SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL); + StringRef FuncName = SI->getParent()->getParent()->getName(); + SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL, + FuncName); Value *Result = Table.BuildLookup(TableIndex, Builder); diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index faa14046b1e3..ec8b0d426265 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -354,7 +354,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)( const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)( - const SCEV *, Type *); + const SCEV *, Type *, unsigned); OperationFunctionTy Operation; ExtensionFunctionTy Extension; @@ -406,11 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); const SCEV *A = - (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u), - WideTy); + (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0), + WideTy, 0); const SCEV *B = - (SE->*Operation)((SE->*Extension)(LHS, WideTy), - (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u); + (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0), + (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0); if (A != B) return false; diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp deleted file mode 100644 index 78453aaa16ce..000000000000 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ /dev/null @@ -1,3282 +0,0 @@ -//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a basic-block vectorization pass. The algorithm was -// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, -// et al. It works by looking for chains of pairable operations and then -// pairing them. 
-// -//===----------------------------------------------------------------------===// - -#define BBV_NAME "bb-vectorize" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" -#include <algorithm> -using namespace llvm; - -#define DEBUG_TYPE BBV_NAME - -static cl::opt<bool> -IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), - cl::Hidden, cl::desc("Ignore target information")); - -static cl::opt<unsigned> -ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, - cl::desc("The required chain depth for vectorization")); - -static cl::opt<bool> -UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), - cl::Hidden, cl::desc("Use the chain depth requirement with" - " target information")); - -static cl::opt<unsigned> -SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, - cl::desc("The maximum search distance for instruction pairs")); - -static cl::opt<bool> -SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, - cl::desc("Replicating one element to a pair breaks the chain")); - -static cl::opt<unsigned> -VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, - cl::desc("The size of the native vector registers")); - -static cl::opt<unsigned> -MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, - cl::desc("The maximum number of pairing iterations")); - -static cl::opt<bool> -Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, - cl::desc("Don't try to form non-2^n-length vectors")); - -static cl::opt<unsigned> -MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, - cl::desc("The maximum number of pairable instructions per group")); - -static cl::opt<unsigned> -MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, - cl::desc("The maximum number of candidate instruction pairs per group")); - -static cl::opt<unsigned> -MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), - cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" - " a full cycle check")); - -static cl::opt<bool> -NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize boolean (i1) values")); - -static cl::opt<bool> -NoInts("bb-vectorize-no-ints", cl::init(false), 
cl::Hidden, - cl::desc("Don't try to vectorize integer values")); - -static cl::opt<bool> -NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point values")); - -// FIXME: This should default to false once pointer vector support works. -static cl::opt<bool> -NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, - cl::desc("Don't try to vectorize pointer values")); - -static cl::opt<bool> -NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize casting (conversion) operations")); - -static cl::opt<bool> -NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point math intrinsics")); - -static cl::opt<bool> - NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize BitManipulation intrinsics")); - -static cl::opt<bool> -NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); - -static cl::opt<bool> -NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize select instructions")); - -static cl::opt<bool> -NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize comparison instructions")); - -static cl::opt<bool> -NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize getelementptr instructions")); - -static cl::opt<bool> -NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize loads and stores")); - -static cl::opt<bool> -AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, - cl::desc("Only generate aligned loads and stores")); - -static cl::opt<bool> -NoMemOpBoost("bb-vectorize-no-mem-op-boost", - cl::init(false), cl::Hidden, - cl::desc("Don't boost the chain-depth contribution of loads and stores")); - -static cl::opt<bool> -FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, - cl::desc("Use a fast instruction dependency analysis")); - -#ifndef NDEBUG -static cl::opt<bool> -DebugInstructionExamination("bb-vectorize-debug-instruction-examination", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " instruction-examination process")); -static cl::opt<bool> -DebugCandidateSelection("bb-vectorize-debug-candidate-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " candidate-selection process")); -static cl::opt<bool> -DebugPairSelection("bb-vectorize-debug-pair-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " pair-selection process")); -static cl::opt<bool> -DebugCycleCheck("bb-vectorize-debug-cycle-check", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " cycle-checking process")); - -static cl::opt<bool> -PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, dump the basic block after" - " every pair is fused")); -#endif - -STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); - -namespace { - struct BBVectorize : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - - const VectorizeConfig Config; - - BBVectorize(const VectorizeConfig &C = VectorizeConfig()) - : 
BasicBlockPass(ID), Config(C) { - initializeBBVectorizePass(*PassRegistry::getPassRegistry()); - } - - BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) - : BasicBlockPass(ID), Config(C) { - AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - } - - typedef std::pair<Value *, Value *> ValuePair; - typedef std::pair<ValuePair, int> ValuePairWithCost; - typedef std::pair<ValuePair, size_t> ValuePairWithDepth; - typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair - typedef std::pair<VPPair, unsigned> VPPairWithType; - - AliasAnalysis *AA; - DominatorTree *DT; - ScalarEvolution *SE; - const TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - // FIXME: const correct? - - bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); - - bool getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, bool NonPow2Len); - - // FIXME: The current implementation does not account for pairs that - // are connected in multiple ways. For example: - // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) - enum PairConnectionType { - PairConnectionDirect, - PairConnectionSwap, - PairConnectionSplat - }; - - void computeConnectedPairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes); - - void buildDepMap(BasicBlock &BB, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers); - - void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs); - - void fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); - - - bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); - - bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder); - - bool trackUsesOfI(DenseSet<Value *> &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers = true, - DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); - - void computePairsConnectedTo( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> 
&PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P); - - bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > - *PairableInstUserMap = nullptr, - DenseSet<VPPair> *PairableInstUserPairSet = nullptr); - - bool pairWillFormCycle(ValuePair P, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, - DenseSet<ValuePair> &CurrentPairs); - - void pruneDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, - DenseSet<ValuePair> &PrunedDAG, ValuePair J, - bool UseCycleCheck); - - void buildInitialDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, ValuePair J); - - void findBestDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector<Value *>&JJ, - bool UseCycleCheck); - - Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o); - - void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector<Constant*> &Mask); - - Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, - Instruction *J); - - bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, - unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, - unsigned IdxOff = 0); - - Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ); - - void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, - Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, - bool IBeforeJ); - - void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2); - - void collectPairLoadMoveSet(BasicBlock &BB, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I); - - void collectLoadMoveSet(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > 
&LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs); - - bool canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I, Instruction *J); - - void moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J); - - bool vectorizeBB(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - if (!DT->isReachableFromEntry(&BB)) { - DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << - " in " << BB.getParent()->getName() << "\n"); - return false; - } - - DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); - - bool changed = false; - // Iterate a sufficient number of times to merge types of size 1 bit, - // then 2 bits, then 4, etc. up to half of the target vector width of the - // target vector register. - unsigned n = 1; - for (unsigned v = 2; - (TTI || v <= Config.VectorBits) && - (!Config.MaxIter || n <= Config.MaxIter); - v *= 2, ++n) { - DEBUG(dbgs() << "BBV: fusing loop #" << n << - " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (vectorizePairs(BB)) - changed = true; - else - break; - } - - if (changed && !Pow2LenOnly) { - ++n; - for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { - DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << - n << " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (!vectorizePairs(BB, true)) break; - } - } - - DEBUG(dbgs() << "BBV: done!\n"); - return changed; - } - - bool runOnBasicBlock(BasicBlock &BB) override { - // OptimizeNone check deferred to vectorizeBB(). - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *BB.getParent()); - - return vectorizeBB(BB); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - BasicBlockPass::getAnalysisUsage(AU); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.setPreservesCFG(); - } - - static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { - assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && - "Cannot form vector from incompatible scalar types"); - Type *STy = ElemTy->getScalarType(); - - unsigned numElem; - if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { - numElem = VTy->getNumElements(); - } else { - numElem = 1; - } - - if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { - numElem += VTy->getNumElements(); - } else { - numElem += 1; - } - - return VectorType::get(STy, numElem); - } - - static inline void getInstructionTypes(Instruction *I, - Type *&T1, Type *&T2) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. 
-
-        Value *IVal = SI->getValueOperand();
-        T1 = IVal->getType();
-      } else {
-        T1 = I->getType();
-      }
-
-      if (CastInst *CI = dyn_cast<CastInst>(I))
-        T2 = CI->getSrcTy();
-      else
-        T2 = T1;
-
-      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
-        T2 = SI->getCondition()->getType();
-      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        T2 = SI->getOperand(0)->getType();
-      } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
-        T2 = CI->getOperand(0)->getType();
-      }
-    }
-
-    // Returns the weight associated with the provided value. A chain of
-    // candidate pairs has a length given by the sum of the weights of its
-    // members (one weight per pair; the weight of each member of the pair
-    // is assumed to be the same). This length is then compared to the
-    // chain-length threshold to determine if a given chain is significant
-    // enough to be vectorized. The length is also used in comparing
-    // candidate chains where longer chains are considered to be better.
-    // Note: when this function returns 0, the resulting instructions are
-    // not actually fused.
-    inline size_t getDepthFactor(Value *V) {
-      // InsertElement and ExtractElement have a depth factor of zero. This is
-      // for two reasons: First, they cannot be usefully fused. Second, because
-      // the pass generates a lot of these, they can confuse the simple metric
-      // used to compare the dags in the next iteration. Thus, giving them a
-      // weight of zero allows the pass to essentially ignore them in
-      // subsequent iterations when looking for vectorization opportunities
-      // while still tracking dependency chains that flow through those
-      // instructions.
-      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
-        return 0;
-
-      // Give a load or store half of the required depth so that load/store
-      // pairs will vectorize.
-      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
-        return Config.ReqChainDepth/2;
-
-      return 1;
-    }
-
-    // Returns the cost of the provided instruction using TTI.
-    // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
-                          TargetTransformInfo::OperandValueKind Op1VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          TargetTransformInfo::OperandValueKind Op2VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          const Instruction *I = nullptr) {
-      switch (Opcode) {
-      default: break;
-      case Instruction::GetElementPtr:
-        // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the instruction addressing mode. At the moment we don't
-        // generate vector GEPs.
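getDepthFactor above turns each pair into a weight so that chains of connected pairs can be compared against Config.ReqChainDepth. A standalone sketch of that accounting, with a hypothetical Op enum and ToyConfig standing in for the real IR and Config objects:

#include <cstdio>
#include <vector>

enum class Op { InsertElement, ExtractElement, Load, Store, Add, Mul };

struct ToyConfig {
  unsigned ReqChainDepth = 6;
  bool NoMemOpBoost = false;
};

size_t depthFactor(Op O, const ToyConfig &Config) {
  if (O == Op::InsertElement || O == Op::ExtractElement)
    return 0; // never counted toward chain length
  if (!Config.NoMemOpBoost && (O == Op::Load || O == Op::Store))
    return Config.ReqChainDepth / 2; // a load/store pair alone already helps
  return 1;
}

int main() {
  ToyConfig Config;
  // One weight per pair along the chain; the chain is worth vectorizing when
  // the summed weights reach the threshold.
  std::vector<Op> Chain = {Op::Load, Op::Add, Op::Mul, Op::Store};
  size_t Depth = 0;
  for (Op O : Chain)
    Depth += depthFactor(O, Config);
  std::printf("chain depth %zu, threshold %u -> %s\n", Depth,
              Config.ReqChainDepth,
              Depth >= Config.ReqChainDepth ? "vectorize" : "skip");
}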
- return 0; - case Instruction::Br: - return TTI->getCFInstrCost(Opcode); - case Instruction::PHI: - return 0; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::ShuffleVector: - return TTI->getCastInstrCost(Opcode, T1, T2, I); - } - - return 1; - } - - // This determines the relative offset of two loads or stores, returning - // true if the offset could be determined to be some constant value. - // For example, if OffsetInElmts == 1, then J accesses the memory directly - // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. - bool getPairPtrInfo(Instruction *I, Instruction *J, - Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - unsigned &IAddressSpace, unsigned &JAddressSpace, - int64_t &OffsetInElmts, bool ComputeOffset = true) { - OffsetInElmts = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - LoadInst *LJ = cast<LoadInst>(J); - IPtr = LI->getPointerOperand(); - JPtr = LJ->getPointerOperand(); - IAlignment = LI->getAlignment(); - JAlignment = LJ->getAlignment(); - IAddressSpace = LI->getPointerAddressSpace(); - JAddressSpace = LJ->getPointerAddressSpace(); - } else { - StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); - IPtr = SI->getPointerOperand(); - JPtr = SJ->getPointerOperand(); - IAlignment = SI->getAlignment(); - JAlignment = SJ->getAlignment(); - IAddressSpace = SI->getPointerAddressSpace(); - JAddressSpace = SJ->getPointerAddressSpace(); - } - - if (!ComputeOffset) - return true; - - const SCEV *IPtrSCEV = SE->getSCEV(IPtr); - const SCEV *JPtrSCEV = SE->getSCEV(JPtr); - - // If this is a trivial offset, then we'll get something like - // 1*sizeof(type). With target data, which we need anyway, this will get - // constant folded into a number. 
- const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); - if (const SCEVConstant *ConstOffSCEV = - dyn_cast<SCEVConstant>(OffsetSCEV)) { - ConstantInt *IntOff = ConstOffSCEV->getValue(); - int64_t Offset = IntOff->getSExtValue(); - const DataLayout &DL = I->getModule()->getDataLayout(); - Type *VTy = IPtr->getType()->getPointerElementType(); - int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); - - Type *VTy2 = JPtr->getType()->getPointerElementType(); - if (VTy != VTy2 && Offset < 0) { - int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); - OffsetInElmts = Offset/VTy2TSS; - return (std::abs(Offset) % VTy2TSS) == 0; - } - - OffsetInElmts = Offset/VTyTSS; - return (std::abs(Offset) % VTyTSS) == 0; - } - - return false; - } - - // Returns true if the provided CallInst represents an intrinsic that can - // be vectorized. - bool isVectorizableIntrinsic(CallInst* I) { - Function *F = I->getCalledFunction(); - if (!F) return false; - - Intrinsic::ID IID = F->getIntrinsicID(); - if (!IID) return false; - - switch(IID) { - default: - return false; - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::round: - case Intrinsic::copysign: - case Intrinsic::ceil: - case Intrinsic::nearbyint: - case Intrinsic::rint: - case Intrinsic::trunc: - case Intrinsic::floor: - case Intrinsic::fabs: - case Intrinsic::minnum: - case Intrinsic::maxnum: - return Config.VectorizeMath; - case Intrinsic::bswap: - case Intrinsic::ctpop: - case Intrinsic::ctlz: - case Intrinsic::cttz: - return Config.VectorizeBitManipulations; - case Intrinsic::fma: - case Intrinsic::fmuladd: - return Config.VectorizeFMA; - } - } - - bool isPureIEChain(InsertElementInst *IE) { - InsertElementInst *IENext = IE; - do { - if (!isa<UndefValue>(IENext->getOperand(0)) && - !isa<InsertElementInst>(IENext->getOperand(0))) { - return false; - } - } while ((IENext = - dyn_cast<InsertElementInst>(IENext->getOperand(0)))); - - return true; - } - }; - - // This function implements one vectorization iteration on the provided - // basic block. It returns true if the block is changed. - bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { - bool ShouldContinue; - BasicBlock::iterator Start = BB.getFirstInsertionPt(); - - std::vector<Value *> AllPairableInsts; - DenseMap<Value *, Value *> AllChosenPairs; - DenseSet<ValuePair> AllFixedOrderPairs; - DenseMap<VPPair, unsigned> AllPairConnectionTypes; - DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, - AllConnectedPairDeps; - - do { - std::vector<Value *> PairableInsts; - DenseMap<Value *, std::vector<Value *> > CandidatePairs; - DenseSet<ValuePair> FixedOrderPairs; - DenseMap<ValuePair, int> CandidatePairCostSavings; - ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - FixedOrderPairs, - CandidatePairCostSavings, - PairableInsts, NonPow2Len); - if (PairableInsts.empty()) continue; - - // Build the candidate pair set for faster lookups. 
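getPairPtrInfo above uses ScalarEvolution to reduce the pointer difference between two loads or stores to a constant element offset, and areInstsCompatible later insists on |OffsetInElmts| == 1 before pairing them. A standalone sketch of that adjacency test using raw addresses instead of SCEV (pairPtrInfo here is a hypothetical simplification):

#include <cstdint>
#include <cstdio>

// Returns true and sets OffsetInElmts when the byte distance between the two
// accesses is a whole number of elements.
bool pairPtrInfo(uintptr_t IAddr, uintptr_t JAddr, uint64_t ElemStoreSize,
                 int64_t &OffsetInElmts) {
  int64_t ByteOff = static_cast<int64_t>(JAddr) - static_cast<int64_t>(IAddr);
  if (ByteOff % static_cast<int64_t>(ElemStoreSize) != 0)
    return false;
  OffsetInElmts = ByteOff / static_cast<int64_t>(ElemStoreSize);
  return true;
}

int main() {
  float A[8];
  auto Addr = [&](int Idx) { return reinterpret_cast<uintptr_t>(&A[Idx]); };
  int64_t Off = 0;
  // A[3] directly follows A[2]: OffsetInElmts == 1, so I comes before J and
  // the two accesses are candidates for a single vector load/store.
  if (pairPtrInfo(Addr(2), Addr(3), sizeof(float), Off))
    std::printf("offset in elements: %lld -> %s\n", (long long)Off,
                (Off == 1 || Off == -1) ? "adjacent, pairable" : "not adjacent");
}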
- DenseSet<ValuePair> CandidatePairsSet; - for (DenseMap<Value *, std::vector<Value *> >::iterator I = - CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) - for (std::vector<Value *>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - CandidatePairsSet.insert(ValuePair(I->first, *J)); - - // Now we have a map of all of the pairable instructions and we need to - // select the best possible pairing. A good pairing is one such that the - // users of the pair are also paired. This defines a (directed) forest - // over the pairs such that two pairs are connected iff the second pair - // uses the first. - - // Note that it only matters that both members of the second pair use some - // element of the first pair (to allow for splatting). - - DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, - ConnectedPairDeps; - DenseMap<VPPair, unsigned> PairConnectionTypes; - computeConnectedPairs(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, PairConnectionTypes); - if (ConnectedPairs.empty()) continue; - - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector<ValuePair>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - ConnectedPairDeps[*J].push_back(I->first); - - // Build the pairable-instruction dependency map - DenseSet<ValuePair> PairableInstUsers; - buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); - - // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest dag meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that dag - // and remove them from the list of available pairings and pairable - // variables. - - DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, ChosenPairs); - - if (ChosenPairs.empty()) continue; - AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), - PairableInsts.end()); - AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); - - // Only for the chosen pairs, propagate information on fixed-order pairs, - // pair connections, and their types to the data structures used by the - // pair fusion procedures. 
- for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), - IE = ChosenPairs.end(); I != IE; ++I) { - if (FixedOrderPairs.count(*I)) - AllFixedOrderPairs.insert(*I); - else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) - AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); - - for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); - J != IE; ++J) { - DenseMap<VPPair, unsigned>::iterator K = - PairConnectionTypes.find(VPPair(*I, *J)); - if (K != PairConnectionTypes.end()) { - AllPairConnectionTypes.insert(*K); - } else { - K = PairConnectionTypes.find(VPPair(*J, *I)); - if (K != PairConnectionTypes.end()) - AllPairConnectionTypes.insert(*K); - } - } - } - - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector<ValuePair>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { - AllConnectedPairs[I->first].push_back(*J); - AllConnectedPairDeps[*J].push_back(I->first); - } - } while (ShouldContinue); - - if (AllChosenPairs.empty()) return false; - NumFusedOps += AllChosenPairs.size(); - - // A set of pairs has now been selected. It is now necessary to replace the - // paired instructions with vector instructions. For this procedure each - // operand must be replaced with a vector operand. This vector is formed - // by using build_vector on the old operands. The replaced values are then - // replaced with a vector_extract on the result. Subsequent optimization - // passes should coalesce the build/extract combinations. - - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, - AllPairConnectionTypes, - AllConnectedPairs, AllConnectedPairDeps); - - // It is important to cleanup here so that future iterations of this - // function have less work to do. - (void)SimplifyInstructionsInBlock(&BB, TLI); - return true; - } - - // This function returns true if the provided instruction is capable of being - // fused into a vector instruction. This determination is based only on the - // type and other attributes of the instruction. - bool BBVectorize::isInstVectorizable(Instruction *I, - bool &IsSimpleLoadStore) { - IsSimpleLoadStore = false; - - if (CallInst *C = dyn_cast<CallInst>(I)) { - if (!isVectorizableIntrinsic(C)) - return false; - } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { - // Vectorize simple loads if possbile: - IsSimpleLoadStore = L->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { - // Vectorize simple stores if possbile: - IsSimpleLoadStore = S->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (CastInst *C = dyn_cast<CastInst>(I)) { - // We can vectorize casts, but not casts of pointer types, etc. - if (!Config.VectorizeCasts) - return false; - - Type *SrcTy = C->getSrcTy(); - if (!SrcTy->isSingleValueType()) - return false; - - Type *DestTy = C->getDestTy(); - if (!DestTy->isSingleValueType()) - return false; - } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { - if (!Config.VectorizeSelect) - return false; - // We can vectorize a select if either all operands are scalars, - // or all operands are vectors. Trying to "widen" a select between - // vectors that has a scalar condition results in a malformed select. 
- // FIXME: We could probably be smarter about this by rewriting the select - // with different types instead. - return (SI->getCondition()->getType()->isVectorTy() == - SI->getTrueValue()->getType()->isVectorTy()); - } else if (isa<CmpInst>(I)) { - if (!Config.VectorizeCmp) - return false; - } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { - if (!Config.VectorizeGEP) - return false; - - // Currently, vector GEPs exist only with one index. - if (G->getNumIndices() != 1) - return false; - } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || - isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { - return false; - } - - Type *T1, *T2; - getInstructionTypes(I, T1, T2); - - // Not every type can be vectorized... - if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || - !(VectorType::isValidElementType(T2) || T2->isVectorTy())) - return false; - - if (T1->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) - return false; - } - - if (T2->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) - return false; - } - - if (!Config.VectorizeFloats - && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) - return false; - - // Don't vectorize target-specific types. - if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) - return false; - if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) - return false; - - if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || - T2->getScalarType()->isPointerTy())) - return false; - - if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits)) - return false; - - return true; - } - - // This function returns true if the two provided instructions are compatible - // (meaning that they can be fused into a vector instruction). This assumes - // that I has already been determined to be vectorizable and that J is not - // in the use dag of I. - bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder) { - DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << - " <-> " << *J << "\n"); - - CostSavings = 0; - FixedOrder = 0; - - // Loads and stores can be merged if they have different alignments, - // but are otherwise the same. - if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | - (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) - return false; - - Type *IT1, *IT2, *JT1, *JT2; - getInstructionTypes(I, IT1, IT2); - getInstructionTypes(J, JT1, JT2); - unsigned MaxTypeBits = std::max( - IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), - IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (!TTI && MaxTypeBits > Config.VectorBits) - return false; - - // FIXME: handle addsub-type operations! - - if (IsSimpleLoadStore) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts = 0; - if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, OffsetInElmts) && - std::abs(OffsetInElmts) == 1) { - FixedOrder = (int) OffsetInElmts; - unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *aTypeI = isa<StoreInst>(I) ? 
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa<StoreInst>(J) ? - cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); - - if (Config.AlignedOnly) { - // An aligned load or store is possible only if the instruction - // with the lower offset has an alignment suitable for the - // vector type. - const DataLayout &DL = I->getModule()->getDataLayout(); - unsigned VecAlignment = DL.getPrefTypeAlignment(VType); - if (BottomAlignment < VecAlignment) - return false; - } - - if (TTI) { - unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, - IAlignment, IAddressSpace); - unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, - JAlignment, JAddressSpace); - unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, - BottomAlignment, - IAddressSpace); - - ICost += TTI->getAddressComputationCost(aTypeI); - JCost += TTI->getAddressComputationCost(aTypeJ); - VCost += TTI->getAddressComputationCost(VType); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts = TTI->getNumberOfParts(VType); - if (VParts > 1) - return false; - else if (!VParts && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - } else { - return false; - } - } else if (TTI) { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; - unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); - unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); - Type *VT1 = getVecTypeForPair(IT1, JT1), - *VT2 = getVecTypeForPair(IT2, JT2); - - // On some targets (example X86) the cost of a vector shift may vary - // depending on whether the second operand is a Uniform or - // NonUniform Constant. - switch (I->getOpcode()) { - default : break; - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - - // If both I and J are scalar shifts by constant, then the - // merged vector shift count would be either a constant splat value - // or a non-uniform vector of constants. - if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) - Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : - TargetTransformInfo::OK_NonUniformConstantValue; - } else { - // Check for a splat of a constant or for a non uniform vector - // of constants. - Value *IOp = I->getOperand(1); - Value *JOp = J->getOperand(1); - if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) && - (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); - if (SplatValue != nullptr && - SplatValue == cast<Constant>(JOp)->getSplatValue()) - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - } - } - } - - // Note that this procedure is incorrect for insert and extract element - // instructions (because combining these often results in a shuffle), - // but this cost is ignored (because insert and extract element - // instructions are assigned a zero depth factor and are not really - // fused in general). 
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts1 = TTI->getNumberOfParts(VT1), - VParts2 = TTI->getNumberOfParts(VT2); - if (VParts1 > 1 || VParts2 > 1) - return false; - else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - - // The powi,ctlz,cttz intrinsics are special because only the first - // argument is vectorized, the second arguments must be equal. - CallInst *CI = dyn_cast<CallInst>(I); - Function *FI; - if (CI && (FI = CI->getCalledFunction())) { - Intrinsic::ID IID = FI->getIntrinsicID(); - if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) { - Value *A1I = CI->getArgOperand(1), - *A1J = cast<CallInst>(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); - } - - if (IID && TTI) { - FastMathFlags FMFCI; - if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI)) - FMFCI = FPMOCI->getFastMathFlags(); - SmallVector<Value *, 4> IArgs(CI->arg_operands()); - unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI); - - CallInst *CJ = cast<CallInst>(J); - - FastMathFlags FMFCJ; - if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ)) - FMFCJ = FPMOCJ->getFastMathFlags(); - - SmallVector<Value *, 4> JArgs(CJ->arg_operands()); - unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ); - - assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && - "Intrinsic argument counts differ"); - SmallVector<Type*, 4> Tys; - SmallVector<Value *, 4> VecArgs; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && i == 1) { - Tys.push_back(CI->getArgOperand(i)->getType()); - VecArgs.push_back(CI->getArgOperand(i)); - } - else { - Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), - CJ->getArgOperand(i)->getType())); - // Add both operands, and then count their scalarization overhead - // with VF 1. - VecArgs.push_back(CI->getArgOperand(i)); - VecArgs.push_back(CJ->getArgOperand(i)); - } - } - - // Compute the scalarization cost here with the original operands (to - // check for uniqueness etc), and then call getIntrinsicInstrCost() - // with the constructed vector types. - Type *RetTy = getVecTypeForPair(IT1, JT1); - unsigned ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false); - ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1); - - FastMathFlags FMFV = FMFCI; - FMFV &= FMFCJ; - unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV, - ScalarizationCost); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. 
- unsigned RetParts = TTI->getNumberOfParts(RetTy); - if (RetParts > 1) - return false; - else if (!RetParts && VCost == ICost + JCost) - return false; - - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (!Tys[i]->isVectorTy()) - continue; - - unsigned NumParts = TTI->getNumberOfParts(Tys[i]); - if (NumParts > 1) - return false; - else if (!NumParts && VCost == ICost + JCost) - return false; - } - - CostSavings = ICost + JCost - VCost; - } - } - - return true; - } - - // Figure out whether or not J uses I and update the users and write-set - // structures associated with I. Specifically, Users represents the set of - // instructions that depend on I. WriteSet represents the set - // of memory locations that are dependent on I. If UpdateUsers is true, - // and J uses I, then Users is updated to contain J and WriteSet is updated - // to contain any memory locations to which J writes. The function returns - // true if J uses I. By default, alias analysis is used to determine - // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed map - // where the key is the memory-based user instruction and the value is - // the instruction to be compared with I. So, if LoadMoveSet is provided, - // then the alias analysis is not used. This is necessary because this - // function is called during the process of moving instructions during - // vectorization and the results of the alias analysis are not stable during - // that process. - bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers, - DenseSet<ValuePair> *LoadMoveSetPairs) { - bool UsesI = false; - - // This instruction may already be marked as a user due, for example, to - // being a member of a selected pair. - if (Users.count(J)) - UsesI = true; - - if (!UsesI) - for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); - JU != JE; ++JU) { - Value *V = *JU; - if (I == V || Users.count(V)) { - UsesI = true; - break; - } - } - if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSetPairs) { - UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); - } else { - for (AliasSetTracker::iterator W = WriteSet.begin(), - WE = WriteSet.end(); W != WE; ++W) { - if (W->aliasesUnknownInst(J, *AA)) { - UsesI = true; - break; - } - } - } - } - - if (UsesI && UpdateUsers) { - if (J->mayWriteToMemory()) WriteSet.add(J); - Users.insert(J); - } - - return UsesI; - } - - // This function iterates over all instruction pairs in the provided - // basic block and collects all candidate pairs for vectorization. - bool BBVectorize::getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, bool NonPow2Len) { - size_t TotalPairs = 0; - BasicBlock::iterator E = BB.end(); - if (Start == E) return false; - - bool ShouldContinue = false, IAfterStart = false; - for (BasicBlock::iterator I = Start++; I != E; ++I) { - if (I == Start) IAfterStart = true; - - bool IsSimpleLoadStore; - if (!isInstVectorizable(&*I, IsSimpleLoadStore)) - continue; - - // Look for an instruction with which to pair instruction *I... 
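trackUsesOfI above decides whether J depends on I, either through an operand chain or by reading memory that I (or one of I's users) may have written, which is what keeps candidate pairs legal to fuse. A standalone sketch with a toy instruction record in place of LLVM's Instruction and AliasSetTracker:

#include <cstdio>
#include <set>
#include <vector>

struct ToyInst {
  int Id;
  std::vector<int> Operands; // ids of the instructions this one reads
  int WritesLoc;             // memory location written, -1 if none
  int ReadsLoc;              // memory location read, -1 if none
};

bool trackUsesOfI(std::set<int> &Users, std::set<int> &WriteSet,
                  const ToyInst &I, const ToyInst &J) {
  bool UsesI = false;
  for (int Op : J.Operands)
    if (Op == I.Id || Users.count(Op)) { UsesI = true; break; }
  // A memory read counts as a use if it may read a location written by I or
  // by anything already known to depend on I.
  if (!UsesI && J.ReadsLoc != -1 && WriteSet.count(J.ReadsLoc))
    UsesI = true;
  if (UsesI) {
    Users.insert(J.Id);
    if (J.WritesLoc != -1) WriteSet.insert(J.WritesLoc);
  }
  return UsesI;
}

int main() {
  ToyInst I{0, {}, 7, -1};  // I stores to location 7
  ToyInst J1{1, {}, -1, 7}; // J1 loads location 7, so it depends on I
  ToyInst J2{2, {}, -1, 9}; // J2 touches unrelated memory
  std::set<int> Users;
  std::set<int> WriteSet{I.WritesLoc};
  std::printf("J1 uses I: %d\n", trackUsesOfI(Users, WriteSet, I, J1)); // 1
  std::printf("J2 uses I: %d\n", trackUsesOfI(Users, WriteSet, I, J2)); // 0
}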
- DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - bool JAfterStart = IAfterStart; - BasicBlock::iterator J = std::next(I); - for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { - if (J == Start) - JAfterStart = true; - - // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); - if (Config.FastDep) { - // Note: For this heuristic to be effective, independent operations - // must tend to be intermixed. This is likely to be true from some - // kinds of grouped loop unrolling (but not the generic LLVM pass), - // but otherwise may require some kind of reordering pass. - - // When using fast dependency analysis, - // stop searching after first use: - if (UsesI) break; - } else { - if (UsesI) continue; - } - - // J does not use I, and comes before the first use of I, so it can be - // merged with I if the instructions are compatible. - int CostSavings, FixedOrder; - if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, - CostSavings, FixedOrder)) - continue; - - // J is a candidate for merging with I. - if (PairableInsts.empty() || - PairableInsts[PairableInsts.size() - 1] != &*I) { - PairableInsts.push_back(&*I); - } - - CandidatePairs[&*I].push_back(&*J); - ++TotalPairs; - if (TTI) - CandidatePairCostSavings.insert( - ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); - - if (FixedOrder == 1) - FixedOrderPairs.insert(ValuePair(&*I, &*J)); - else if (FixedOrder == -1) - FixedOrderPairs.insert(ValuePair(&*J, &*I)); - - // The next call to this function must start after the last instruction - // selected during this invocation. - if (JAfterStart) { - Start = std::next(J); - IAfterStart = JAfterStart = false; - } - - DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << " (cost savings: " << - CostSavings << ")\n"); - - // If we have already found too many pairs, break here and this function - // will be called again starting after the last instruction selected - // during this invocation. - if (PairableInsts.size() >= Config.MaxInsts || - TotalPairs >= Config.MaxPairs) { - ShouldContinue = true; - break; - } - } - - if (ShouldContinue) - break; - } - - DEBUG(dbgs() << "BBV: found " << PairableInsts.size() - << " instructions with candidate pairs\n"); - - return ShouldContinue; - } - - // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that - // it looks for pairs such that both members have an input which is an - // output of PI or PJ. - void BBVectorize::computePairsConnectedTo( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P) { - StoreInst *SI, *SJ; - - // For each possible pairing for this variable, look at the uses of - // the first value... - for (Value::user_iterator I = P.first->user_begin(), - E = P.first->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa<LoadInst>(UI)) { - // A pair cannot be connected to a load because the load only takes one - // operand (the address) and it is a scalar even after vectorization. - continue; - } else if ((SI = dyn_cast<StoreInst>(UI)) && - P.first == SI->getPointerOperand()) { - // Similarly, a pair cannot be connected to a store through its - // pointer operand. 
- continue; - } - - // For each use of the first variable, look for uses of the second - // variable... - for (User *UJ : P.second->users()) { - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - // Look for <I, J>: - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); - } - - // Look for <J, I>: - if (CandidatePairsSet.count(ValuePair(UJ, UI))) { - VPPair VP(P, ValuePair(UJ, UI)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); - } - } - - if (Config.SplatBreaksChain) continue; - // Look for cases where just the first value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.first == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - - if (Config.SplatBreaksChain) return; - // Look for cases where just the second value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator I = P.second->user_begin(), - E = P.second->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa<LoadInst>(UI)) - continue; - else if ((SI = dyn_cast<StoreInst>(UI)) && - P.second == SI->getPointerOperand()) - continue; - - for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - } - - // This function figures out which pairs are connected. Two pairs are - // connected if some output of the first pair forms an input to both members - // of the second pair. - void BBVectorize::computeConnectedPairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PE = PairableInsts.end(); PI != PE; ++PI) { - DenseMap<Value *, std::vector<Value *> >::iterator PP = - CandidatePairs.find(*PI); - if (PP == CandidatePairs.end()) - continue; - - for (std::vector<Value *>::iterator P = PP->second.begin(), - E = PP->second.end(); P != E; ++P) - computePairsConnectedTo(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairConnectionTypes, ValuePair(*PI, *P)); - } - - DEBUG(size_t TotalPairs = 0; - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = - ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) - TotalPairs += I->second.size(); - dbgs() << "BBV: found " << TotalPairs - << " pair connections.\n"); - } - - // This function builds a set of use tuples such that <A, B> is in the set - // if B is in the use dag of A. If B is in the use dag of A, then B - // depends on the output of A. 
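computePairsConnectedTo above records whether a candidate pair feeds another pair directly, with its operands swapped, or as a splat of a single member; those labels later determine which shuffles fusion has to insert. A standalone sketch of that classification over explicit use edges (Value, UseEdge and classify are hypothetical names):

#include <cstdio>
#include <set>
#include <utility>

enum class Connection { None, Direct, Swap, Splat };

using Value = int;
using UseEdge = std::pair<Value, Value>; // (user, used-value)

Connection classify(const std::set<UseEdge> &Uses, Value A, Value B, Value X,
                    Value Y) {
  auto uses = [&](Value User, Value V) { return Uses.count({User, V}) != 0; };
  if (uses(X, A) && uses(Y, B)) return Connection::Direct;
  if (uses(X, B) && uses(Y, A)) return Connection::Swap;
  if ((uses(X, A) && uses(Y, A)) || (uses(X, B) && uses(Y, B)))
    return Connection::Splat;
  return Connection::None;
}

int main() {
  // X uses A and Y uses B, so pair <A,B> connects to pair <X,Y> directly.
  std::set<UseEdge> Uses = {{10, 1}, {11, 2}};
  Connection C = classify(Uses, /*A=*/1, /*B=*/2, /*X=*/10, /*Y=*/11);
  std::printf("connection: %d (1=direct, 2=swap, 3=splat)\n", (int)C);
}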
- void BBVectorize::buildDepMap( - BasicBlock &BB, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers) { - DenseSet<Value *> IsInPair; - for (DenseMap<Value *, std::vector<Value *> >::iterator C = - CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { - IsInPair.insert(C->first); - IsInPair.insert(C->second.begin(), C->second.end()); - } - - // Iterate through the basic block, recording all users of each - // pairable instruction. - - BasicBlock::iterator E = BB.end(), EL = - BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); - for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { - if (IsInPair.find(&*I) == IsInPair.end()) - continue; - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - for (BasicBlock::iterator J = std::next(I); J != E; ++J) { - (void)trackUsesOfI(Users, WriteSet, &*I, &*J); - - if (J == EL) - break; - } - - for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); - U != E; ++U) { - if (IsInPair.find(*U) == IsInPair.end()) continue; - PairableInstUsers.insert(ValuePair(&*I, *U)); - } - - if (I == EL) - break; - } - } - - // Returns true if an input to pair P is an output of pair Q and also an - // input of pair Q is an output of pair P. If this is the case, then these - // two pairs cannot be simultaneously fused. - bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, - DenseSet<VPPair> *PairableInstUserPairSet) { - // Two pairs are in conflict if they are mutual Users of eachother. - bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || - PairableInstUsers.count(ValuePair(P.first, Q.second)) || - PairableInstUsers.count(ValuePair(P.second, Q.first)) || - PairableInstUsers.count(ValuePair(P.second, Q.second)); - bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || - PairableInstUsers.count(ValuePair(Q.first, P.second)) || - PairableInstUsers.count(ValuePair(Q.second, P.first)) || - PairableInstUsers.count(ValuePair(Q.second, P.second)); - if (PairableInstUserMap) { - // FIXME: The expensive part of the cycle check is not so much the cycle - // check itself but this edge insertion procedure. This needs some - // profiling and probably a different data structure. - if (PUsesQ) { - if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) - (*PairableInstUserMap)[Q].push_back(P); - } - if (QUsesP) { - if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) - (*PairableInstUserMap)[P].push_back(Q); - } - } - - return (QUsesP && PUsesQ); - } - - // This function walks the use graph of current pairs to see if, starting - // from P, the walk returns to P. - bool BBVectorize::pairWillFormCycle(ValuePair P, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<ValuePair> &CurrentPairs) { - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " - << *P.second << "\n"); - // A lookup table of visisted pairs is kept because the PairableInstUserMap - // contains non-direct associations. 
- DenseSet<ValuePair> Visited; - SmallVector<ValuePair, 32> Q; - // General depth-first post-order traversal: - Q.push_back(P); - do { - ValuePair QTop = Q.pop_back_val(); - Visited.insert(QTop); - - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " - << *QTop.second << "\n"); - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - PairableInstUserMap.find(QTop); - if (QQ == PairableInstUserMap.end()) - continue; - - for (std::vector<ValuePair>::iterator C = QQ->second.begin(), - CE = QQ->second.end(); C != CE; ++C) { - if (*C == P) { - DEBUG(dbgs() - << "BBV: rejected to prevent non-trivial cycle formation: " - << QTop.first << " <-> " << C->second << "\n"); - return true; - } - - if (CurrentPairs.count(*C) && !Visited.count(*C)) - Q.push_back(*C); - } - } while (!Q.empty()); - - return false; - } - - // This function builds the initial dag of connected pairs with the - // pair J at the root. - void BBVectorize::buildInitialDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, ValuePair J) { - // Each of these pairs is viewed as the root node of a DAG. The DAG - // is then walked (depth-first). As this happens, we keep track of - // the pairs that compose the DAG and the maximum depth of the DAG. - SmallVector<ValuePairWithDepth, 32> Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.back(); - - // Push each child onto the queue: - bool MoreChildren = false; - size_t MaxChildDepth = QTop.second; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ != ConnectedPairs.end()) - for (std::vector<ValuePair>::iterator k = QQ->second.begin(), - ke = QQ->second.end(); k != ke; ++k) { - // Make sure that this child pair is still a candidate: - if (CandidatePairsSet.count(*k)) { - DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); - if (C == DAG.end()) { - size_t d = getDepthFactor(k->first); - Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); - MoreChildren = true; - } else { - MaxChildDepth = std::max(MaxChildDepth, C->second); - } - } - } - - if (!MoreChildren) { - // Record the current pair as part of the DAG: - DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); - Q.pop_back(); - } - } while (!Q.empty()); - } - - // Given some initial dag, prune it by removing conflicting pairs (pairs - // that cannot be simultaneously chosen for vectorization). 
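pairWillFormCycle above walks the pair-user graph depth-first from the candidate pair and rejects it if the walk returns to where it started, since fusing such a pair would create a pairing-induced dependency cycle. A standalone sketch of that reachability check (Pair is just an int stand-in for ValuePair):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

using Pair = int;

bool pairWillFormCycle(Pair P,
                       const std::map<Pair, std::vector<Pair>> &UserMap,
                       const std::set<Pair> &CurrentPairs) {
  std::set<Pair> Visited;
  std::vector<Pair> Worklist{P};
  while (!Worklist.empty()) {
    Pair Top = Worklist.back();
    Worklist.pop_back();
    Visited.insert(Top);
    auto It = UserMap.find(Top);
    if (It == UserMap.end()) continue;
    for (Pair Next : It->second) {
      if (Next == P) return true; // walked back to the starting pair
      if (CurrentPairs.count(Next) && !Visited.count(Next))
        Worklist.push_back(Next);
    }
  }
  return false;
}

int main() {
  // Pair 0 is used by pair 1 and pair 1 is used by pair 0: mutual dependency.
  std::map<Pair, std::vector<Pair>> UserMap = {{0, {1}}, {1, {0}}};
  std::set<Pair> Current = {1};
  std::printf("fusing pair 0 forms a cycle: %d\n",
              pairWillFormCycle(0, UserMap, Current)); // prints 1
}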
- void BBVectorize::pruneDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, - DenseSet<ValuePair> &PrunedDAG, ValuePair J, - bool UseCycleCheck) { - SmallVector<ValuePairWithDepth, 32> Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.pop_back_val(); - PrunedDAG.insert(QTop.first); - - // Visit each child, pruning as necessary... - SmallVector<ValuePairWithDepth, 8> BestChildren; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ == ConnectedPairs.end()) - continue; - - for (std::vector<ValuePair>::iterator K = QQ->second.begin(), - KE = QQ->second.end(); K != KE; ++K) { - DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); - if (C == DAG.end()) continue; - - // This child is in the DAG, now we need to make sure it is the - // best of any conflicting children. There could be multiple - // conflicting children, so first, determine if we're keeping - // this child, then delete conflicting children as necessary. - - // It is also necessary to guard against pairing-induced - // dependencies. Consider instructions a .. x .. y .. b - // such that (a,b) are to be fused and (x,y) are to be fused - // but a is an input to x and b is an output from y. This - // means that y cannot be moved after b but x must be moved - // after b for (a,b) to be fused. In other words, after - // fusing (a,b) we have y .. a/b .. x where y is an input - // to a/b and x is an output to a/b: x and y can no longer - // be legally fused. To prevent this condition, we must - // make sure that a child pair added to the DAG is not - // both an input and output of an already-selected pair. - - // Pairing-induced dependencies can also form from more complicated - // cycles. The pair vs. pair conflicts are easy to check, and so - // that is done explicitly for "fast rejection", and because for - // child vs. child conflicts, we may prefer to keep the current - // pair in preference to the already-selected child. - DenseSet<ValuePair> CurrentPairs; - - bool CanAdd = true; - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 - = BestChildren.begin(), E2 = BestChildren.end(); - C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - if (C2->second >= C->second) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - } - if (!CanAdd) continue; - - // Even worse, this child could conflict with another node already - // selected for the DAG. If that is the case, ignore this child. - for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), - E2 = PrunedDAG.end(); T != E2; ++T) { - if (T->first == C->first.first || - T->first == C->first.second || - T->second == C->first.first || - T->second == C->first.second || - pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? 
&PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*T); - } - if (!CanAdd) continue; - - // And check the queue too... - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), - E2 = Q.end(); C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - if (!CanAdd) continue; - - // Last but not least, check for a conflict with any of the - // already-chosen pairs. - for (DenseMap<Value *, Value *>::iterator C2 = - ChosenPairs.begin(), E2 = ChosenPairs.end(); - C2 != E2; ++C2) { - if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*C2); - } - if (!CanAdd) continue; - - // To check for non-trivial cycles formed by the addition of the - // current pair we've formed a list of all relevant pairs, now use a - // graph walk to check for a cycle. We start from the current pair and - // walk the use dag to see if we again reach the current pair. If we - // do, then the current pair is rejected. - - // FIXME: It may be more efficient to use a topological-ordering - // algorithm to improve the cycle check. This should be investigated. - if (UseCycleCheck && - pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) - continue; - - // This child can be added, but we may have chosen it in preference - // to an already-selected child. Check for this here, and if a - // conflict is found, then remove the previously-selected child - // before adding this one in its place. - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 - = BestChildren.begin(); C2 != BestChildren.end();) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers)) - C2 = BestChildren.erase(C2); - else - ++C2; - } - - BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); - } - - for (SmallVectorImpl<ValuePairWithDepth>::iterator C - = BestChildren.begin(), E2 = BestChildren.end(); - C != E2; ++C) { - size_t DepthF = getDepthFactor(C->first.first); - Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); - } - } while (!Q.empty()); - } - - // This function finds the best dag of mututally-compatible connected - // pairs, given the choice of root pairs as an iterator range. 
- void BBVectorize::findBestDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector<Value *>&JJ, - bool UseCycleCheck) { - for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); - J != JE; ++J) { - ValuePair IJ(II, *J); - if (!CandidatePairsSet.count(IJ)) - continue; - - // Before going any further, make sure that this pair does not - // conflict with any already-selected pairs (see comment below - // near the DAG pruning for more details). - DenseSet<ValuePair> ChosenPairSet; - bool DoesConflict = false; - for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), - E = ChosenPairs.end(); C != E; ++C) { - if (pairsConflict(*C, IJ, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { - DoesConflict = true; - break; - } - - ChosenPairSet.insert(*C); - } - if (DoesConflict) continue; - - if (UseCycleCheck && - pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) - continue; - - DenseMap<ValuePair, size_t> DAG; - buildInitialDAGFor(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairableInstUsers, ChosenPairs, DAG, IJ); - - // Because we'll keep the child with the largest depth, the largest - // depth is still the same in the unpruned DAG. - size_t MaxDepth = DAG.lookup(IJ); - - DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << DAG.size() << "\n"); - - // At this point the DAG has been constructed, but, may contain - // contradictory children (meaning that different children of - // some dag node may be attempting to fuse the same instruction). - // So now we walk the dag again, in the case of a conflict, - // keep only the child with the largest depth. To break a tie, - // favor the first child. - - DenseSet<ValuePair> PrunedDAG; - pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, - ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); - - int EffSize = 0; - if (TTI) { - DenseSet<Value *> PrunedDAGInstrs; - for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - PrunedDAGInstrs.insert(S->first); - PrunedDAGInstrs.insert(S->second); - } - - // The set of pairs that have already contributed to the total cost. - DenseSet<ValuePair> IncomingPairs; - - // If the cost model were perfect, this might not be necessary; but we - // need to make sure that we don't get stuck vectorizing our own - // shuffle chains. - bool HasNontrivialInsts = false; - - // The node weights represent the cost savings associated with - // fusing the pair of instructions. 
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - if (!isa<ShuffleVectorInst>(S->first) && - !isa<InsertElementInst>(S->first) && - !isa<ExtractElementInst>(S->first)) - HasNontrivialInsts = true; - - bool FlipOrder = false; - - if (getDepthFactor(S->first)) { - int ESContrib = CandidatePairCostSavings.find(*S)->second; - DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" - << *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize += ESContrib; - } - - // The edge weights contribute in a negative sense: they represent - // the cost of shuffles. - DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = - ConnectedPairDeps.find(*S); - if (SS != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector<ValuePair>::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - // If there are more swaps than direct connections, then - // the pair order will be flipped during fusion. So the real - // number of swaps is the minimum number. - FlipOrder = !FixedOrderPairs.count(*S) && - ((NumDepsSwap > NumDepsDirect) || - FixedOrderPairs.count(ValuePair(S->second, S->first))); - - for (std::vector<ValuePair>::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - Type *Ty1 = Q.second.first->getType(), - *Ty2 = Q.second.second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - if ((R->second == PairConnectionDirect && FlipOrder) || - (R->second == PairConnectionSwap && !FlipOrder) || - R->second == PairConnectionSplat) { - int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) { - if (R->second == PairConnectionSplat) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VTy)); - else - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *Q.second.first << " <-> " << *Q.second.second << - "} -> {" << - *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - } - } - } - - // Compute the cost of outgoing edges. We assume that edges outgoing - // to shuffles, inserts or extracts can be merged, and so contribute - // no additional cost. 
- if (!S->first->getType()->isVoidTy()) { - Type *Ty1 = S->first->getType(), - *Ty2 = S->second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - bool NeedsExtraction = false; - for (User *U : S->first->users()) { - if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { - // Shuffle can be folded if it has no other input - if (isa<UndefValue>(SI->getOperand(1))) - continue; - } - if (isa<ExtractElementInst>(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty1->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty1, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 0); - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->first << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - - NeedsExtraction = false; - for (User *U : S->second->users()) { - if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { - // Shuffle can be folded if it has no other input - if (isa<UndefValue>(SI->getOperand(1))) - continue; - } - if (isa<ExtractElementInst>(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty2->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty2, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, - Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 1); - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->second << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - } - - // Compute the cost of incoming edges. - if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { - Instruction *S1 = cast<Instruction>(S->first), - *S2 = cast<Instruction>(S->second); - for (unsigned o = 0; o < S1->getNumOperands(); ++o) { - Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); - - // Combining constants into vector constants (or small vector - // constants into larger ones are assumed free). - if (isa<Constant>(O1) && isa<Constant>(O2)) - continue; - - if (FlipOrder) - std::swap(O1, O2); - - ValuePair VP = ValuePair(O1, O2); - ValuePair VPR = ValuePair(O2, O1); - - // Internal edges are not handled here. - if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) - continue; - - Type *Ty1 = O1->getType(), - *Ty2 = O2->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - // Combining vector operations of the same type is also assumed - // folded with other operations. - if (Ty1 == Ty2) { - // If both are insert elements, then both can be widened. 
- InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), - *IEO2 = dyn_cast<InsertElementInst>(O2); - if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) - continue; - // If both are extract elements, and both have the same input - // type, then they can be replaced with a shuffle - ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), - *EIO2 = dyn_cast<ExtractElementInst>(O2); - if (EIO1 && EIO2 && - EIO1->getOperand(0)->getType() == - EIO2->getOperand(0)->getType()) - continue; - // If both are a shuffle with equal operand types and only two - // unqiue operands, then they can be replaced with a single - // shuffle - ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), - *SIO2 = dyn_cast<ShuffleVectorInst>(O2); - if (SIO1 && SIO2 && - SIO1->getOperand(0)->getType() == - SIO2->getOperand(0)->getType()) { - SmallSet<Value *, 4> SIOps; - SIOps.insert(SIO1->getOperand(0)); - SIOps.insert(SIO1->getOperand(1)); - SIOps.insert(SIO2->getOperand(0)); - SIOps.insert(SIO2->getOperand(1)); - if (SIOps.size() <= 2) - continue; - } - } - - int ESContrib; - // This pair has already been formed. - if (IncomingPairs.count(VP)) { - continue; - } else if (IncomingPairs.count(VPR)) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 0); - ESContrib += (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 1); - } else if (!Ty1->isVectorTy()) { - // O1 needs to be inserted into a vector of size O2, and then - // both need to be shuffled together. - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty2, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty2); - } else if (!Ty2->isVectorTy()) { - // O2 needs to be inserted into a vector of size O1, and then - // both need to be shuffled together. 
- ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty1, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty1); - } else { - Type *TyBig = Ty1, *TySmall = Ty2; - if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) - std::swap(TyBig, TySmall); - - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, TyBig); - if (TyBig != TySmall) - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - TyBig, TySmall); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" - << *O1 << " <-> " << *O2 << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - IncomingPairs.insert(VP); - } - } - } - - if (!HasNontrivialInsts) { - DEBUG(if (DebugPairSelection) dbgs() << - "\tNo non-trivial instructions in DAG;" - " override to zero effective size\n"); - EffSize = 0; - } - } else { - for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) - EffSize += (int) getDepthFactor(S->first); - } - - DEBUG(if (DebugPairSelection) - dbgs() << "BBV: found pruned DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << PrunedDAG.size() << - " (effective size: " << EffSize << ")\n"); - if (((TTI && !UseChainDepthWithTI) || - MaxDepth >= Config.ReqChainDepth) && - EffSize > 0 && EffSize > BestEffSize) { - BestMaxDepth = MaxDepth; - BestEffSize = EffSize; - BestDAG = PrunedDAG; - } - } - } - - // Given the list of candidate pairs, this function selects those - // that will be fused into vector instructions. - void BBVectorize::choosePairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs) { - bool UseCycleCheck = - CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; - - DenseMap<Value *, std::vector<Value *> > CandidatePairs2; - for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), - E = CandidatePairsSet.end(); I != E; ++I) { - std::vector<Value *> &JJ = CandidatePairs2[I->second]; - if (JJ.empty()) JJ.reserve(32); - JJ.push_back(I->first); - } - - DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; - DenseSet<VPPair> PairableInstUserPairSet; - for (std::vector<Value *>::iterator I = PairableInsts.begin(), - E = PairableInsts.end(); I != E; ++I) { - // The number of possible pairings for this variable: - size_t NumChoices = CandidatePairs.lookup(*I).size(); - if (!NumChoices) continue; - - std::vector<Value *> &JJ = CandidatePairs[*I]; - - // The best pair to choose and its dag: - size_t BestMaxDepth = 0; - int BestEffSize = 0; - DenseSet<ValuePair> BestDAG; - findBestDAGFor(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, ChosenPairs, - BestDAG, BestMaxDepth, BestEffSize, *I, JJ, - UseCycleCheck); - - if (BestDAG.empty()) - continue; - - // A dag has been chosen (or not) at this point. If no dag was - // chosen, then this instruction, I, cannot be paired (and is no longer - // considered). 
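The selection step in the removed choosePairs code is greedy: findBestDAGFor scores a candidate DAG rooted at each pairable instruction, the best DAG's pairs are committed, and every remaining candidate that shares a value with a committed pair is discarded so no instruction is fused twice. A minimal sketch of that conflict-pruning idea follows; it collapses the DAG score to a per-pair score and uses plain standard-library containers rather than the pass's DenseMap/DenseSet types, so it is only an illustration, not the pass's actual implementation.

#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>

using Value = int;                        // stand-in for llvm::Value *
using ValuePair = std::pair<Value, Value>;

// Greedily commit candidate pairs in descending score order, skipping any
// pair that reuses a value already fused into an earlier chosen pair.
std::map<Value, Value>
choosePairsGreedy(std::vector<std::pair<int, ValuePair>> ScoredCandidates) {
  std::sort(ScoredCandidates.begin(), ScoredCandidates.end(),
            [](const auto &A, const auto &B) { return A.first > B.first; });

  std::map<Value, Value> Chosen;
  std::set<Value> Used;
  for (const auto &SC : ScoredCandidates) {
    Value First = SC.second.first, Second = SC.second.second;
    if (Used.count(First) || Used.count(Second))
      continue;                           // conflicts with an earlier choice
    Chosen[First] = Second;
    Used.insert(First);
    Used.insert(Second);
  }
  return Chosen;
}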
- - DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " - << *cast<Instruction>(*I) << "\n"); - - for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), - SE2 = BestDAG.end(); S != SE2; ++S) { - // Insert the members of this dag into the list of chosen pairs. - ChosenPairs.insert(ValuePair(S->first, S->second)); - DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << - *S->second << "\n"); - - // Remove all candidate pairs that have values in the chosen dag. - std::vector<Value *> &KK = CandidatePairs[S->first]; - for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); - K != KE; ++K) { - if (*K == S->second) - continue; - - CandidatePairsSet.erase(ValuePair(S->first, *K)); - } - - std::vector<Value *> &LL = CandidatePairs2[S->second]; - for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); - L != LE; ++L) { - if (*L == S->first) - continue; - - CandidatePairsSet.erase(ValuePair(*L, S->second)); - } - - std::vector<Value *> &MM = CandidatePairs[S->second]; - for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); - M != ME; ++M) { - assert(*M != S->first && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(S->second, *M)); - } - - std::vector<Value *> &NN = CandidatePairs2[S->first]; - for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); - N != NE; ++N) { - assert(*N != S->second && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(*N, S->first)); - } - } - } - - DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); - } - - std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, - unsigned n = 0) { - if (!I->hasName()) - return ""; - - return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + - (n > 0 ? "." + utostr(n) : "")).str(); - } - - // Returns the value that is to be used as the pointer input to the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts; - - // Note: the analysis might fail here, that is why the pair order has - // been precomputed (OffsetInElmts must be unused here). - (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, - OffsetInElmts, false); - - // The pointer value is taken to be the one with the lowest offset. 
- Value *VPtr = IPtr; - - Type *ArgTypeI = IPtr->getType()->getPointerElementType(); - Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - Type *VArgPtrType - = PointerType::get(VArgType, - IPtr->getType()->getPointerAddressSpace()); - return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ I); - } - - void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector<Constant*> &Mask) { - unsigned NumElem1 = J->getType()->getVectorNumElements(); - for (unsigned v = 0; v < NumElem1; ++v) { - int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); - if (m < 0) { - Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem1) - mm += (int) NumInElem; - - Mask[v+MaskOffset] = - ConstantInt::get(Type::getInt32Ty(Context), mm); - } - } - } - - // Returns the value that is to be used as the vector-shuffle mask to the - // vector instruction that fuses I with J. - Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, - Instruction *I, Instruction *J) { - // This is the shuffle mask. We need to append the second - // mask to the first, and the numbers need to be adjusted. - - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - unsigned NumElemI = ArgTypeI->getVectorNumElements(); - - // Get the total number of elements in the fused vector type. - // By definition, this must equal the number of elements in - // the final mask. - unsigned NumElem = VArgType->getVectorNumElements(); - std::vector<Constant*> Mask(NumElem); - - Type *OpTypeI = I->getOperand(0)->getType(); - unsigned NumInElemI = OpTypeI->getVectorNumElements(); - Type *OpTypeJ = J->getOperand(0)->getType(); - unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); - - // The fused vector will be: - // ----------------------------------------------------- - // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | - // ----------------------------------------------------- - // from which we'll extract NumElem total elements (where the first NumElemI - // of them come from the mask in I and the remainder come from the mask - // in J. - - // For the mask from the first pair... - fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, - 0, Mask); - - // For the mask from the second pair... - fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, - NumInElemI, Mask); - - return ConstantVector::get(Mask); - } - - bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, Value *&LOp, - unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeH, - bool IBeforeJ, unsigned IdxOff) { - bool ExpandedIEChain = false; - if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { - // If we have a pure insertelement chain, then this can be rewritten - // into a chain that directly builds the larger type. 
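getReplacementShuffleMask and fillNewShuffleMask above concatenate the masks of the two fused shuffles while renumbering each entry for the fused operand layout | I.op0 | J.op0 | I.op1 | J.op1 |. A standalone sketch of that renumbering, assuming plain integer masks with -1 standing for an undef lane (the real code builds ConstantVector masks); fuseShuffleMasks is an illustrative helper, not part of the pass:

#include <vector>

// Merge the masks of two scalar shuffles I and J, whose operands have
// NumInElemI and NumInElemJ lanes respectively, into one mask over the fused
// operand layout | I.op0 | J.op0 | I.op1 | J.op1 |.
std::vector<int> fuseShuffleMasks(const std::vector<int> &MaskI,
                                  const std::vector<int> &MaskJ,
                                  unsigned NumInElemI, unsigned NumInElemJ) {
  std::vector<int> Fused;
  Fused.reserve(MaskI.size() + MaskJ.size());

  // Entries from I keep their index unless they refer to I's second operand,
  // which now sits after J's first operand.
  for (int M : MaskI)
    Fused.push_back(M < 0 ? -1
                          : M < (int)NumInElemI ? M : M + (int)NumInElemJ);

  // Entries from J are shifted past I's first operand; references to J's
  // second operand are shifted past I's second operand as well.
  for (int M : MaskJ)
    Fused.push_back(M < 0 ? -1
                          : M < (int)NumInElemJ ? M + (int)NumInElemI
                                                : M + 2 * (int)NumInElemI);
  return Fused;
}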
- if (isPureIEChain(LIE)) { - SmallVector<Value *, 8> VectElemts(numElemL, - UndefValue::get(ArgTypeL->getScalarType())); - InsertElementInst *LIENext = LIE; - do { - unsigned Idx = - cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); - VectElemts[Idx] = LIENext->getOperand(1); - } while ((LIENext = - dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); - - LIENext = nullptr; - Value *LIEPrev = UndefValue::get(ArgTypeH); - for (unsigned i = 0; i < numElemL; ++i) { - if (isa<UndefValue>(VectElemts[i])) continue; - LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], - ConstantInt::get(Type::getInt32Ty(Context), - i + IdxOff), - getReplacementName(IBeforeJ ? I : J, - true, o, i+1)); - LIENext->insertBefore(IBeforeJ ? J : I); - LIEPrev = LIENext; - } - - LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); - ExpandedIEChain = true; - } - } - - return ExpandedIEChain; - } - - static unsigned getNumScalarElements(Type *Ty) { - if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) - return VecTy->getNumElements(); - return 1; - } - - // Returns the value to be used as the specified operand of the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - - // Compute the fused vector type for this operand - Type *ArgTypeI = I->getOperand(o)->getType(); - Type *ArgTypeJ = J->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - Instruction *L = I, *H = J; - Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - - unsigned numElemL = getNumScalarElements(ArgTypeL); - unsigned numElemH = getNumScalarElements(ArgTypeH); - - Value *LOp = L->getOperand(o); - Value *HOp = H->getOperand(o); - unsigned numElem = VArgType->getNumElements(); - - // First, we check if we can reuse the "original" vector outputs (if these - // exist). We might need a shuffle. - ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); - ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); - ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); - ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); - - // FIXME: If we're fusing shuffle instructions, then we can't apply this - // optimization. The input vectors to the shuffle might be a different - // length from the shuffle outputs. Unfortunately, the replacement - // shuffle mask has already been formed, and the mask entries are sensitive - // to the sizes of the inputs. - bool IsSizeChangeShuffle = - isa<ShuffleVectorInst>(L) && - (LOp->getType() != L->getType() || HOp->getType() != H->getType()); - - if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { - // We can have at most two unique vector inputs. 
- bool CanUseInputs = true; - Value *I1, *I2 = nullptr; - if (LEE) { - I1 = LEE->getOperand(0); - } else { - I1 = LSV->getOperand(0); - I2 = LSV->getOperand(1); - if (I2 == I1 || isa<UndefValue>(I2)) - I2 = nullptr; - } - - if (HEE) { - Value *I3 = HEE->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - } else { - Value *I3 = HSV->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - - if (CanUseInputs) { - Value *I4 = HSV->getOperand(1); - if (!isa<UndefValue>(I4)) { - if (!I2 && I4 != I1) - I2 = I4; - else if (I4 != I1 && I4 != I2) - CanUseInputs = false; - } - } - } - - if (CanUseInputs) { - unsigned LOpElem = - cast<Instruction>(LOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - unsigned HOpElem = - cast<Instruction>(HOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - // We have one or two input vectors. We need to map each index of the - // operands to the index of the original vector. - SmallVector<std::pair<int, int>, 8> II(numElem); - for (unsigned i = 0; i < numElemL; ++i) { - int Idx, INum; - if (LEE) { - Idx = - cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); - INum = LEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = LSV->getMaskValue(i); - if (Idx < (int) LOpElem) { - INum = LSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= LOpElem; - INum = LSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i] = std::pair<int, int>(Idx, INum); - } - for (unsigned i = 0; i < numElemH; ++i) { - int Idx, INum; - if (HEE) { - Idx = - cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); - INum = HEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = HSV->getMaskValue(i); - if (Idx < (int) HOpElem) { - INum = HSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= HOpElem; - INum = HSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i + numElemL] = std::pair<int, int>(Idx, INum); - } - - // We now have an array which tells us from which index of which - // input vector each element of the operand comes. - VectorType *I1T = cast<VectorType>(I1->getType()); - unsigned I1Elem = I1T->getNumElements(); - - if (!I2) { - // In this case there is only one underlying vector input. Check for - // the trivial case where we can use the input directly. - if (I1Elem == numElem) { - bool ElemInOrder = true; - for (unsigned i = 0; i < numElem; ++i) { - if (II[i].first != (int) i && II[i].first != -1) { - ElemInOrder = false; - break; - } - } - - if (ElemInOrder) - return I1; - } - - // A shuffle is needed. - std::vector<Constant *> Mask(numElem); - for (unsigned i = 0; i < numElem; ++i) { - int Idx = II[i].first; - if (Idx == -1) - Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); - else - Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *S = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } - - VectorType *I2T = cast<VectorType>(I2->getType()); - unsigned I2Elem = I2T->getNumElements(); - - // This input comes from two distinct vectors. The first step is to - // make sure that both vectors are the same length. If not, the - // smaller one will need to grow before they can be shuffled together. 
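Once both inputs have been padded to the same length (the code that follows handles the padding), the (source lane, source vector) pairs collected above translate directly into a single shufflevector mask over the concatenation of the two inputs. A self-contained sketch of that translation, again with -1 marking an undef lane; buildTwoInputMask is a hypothetical helper, not part of the pass:

#include <cstddef>
#include <utility>
#include <vector>

// II[v] = {source lane, 0 or 1 for which input vector}; I1Elem is the lane
// count of the first input.  The result indexes into <I1, I2> concatenated.
std::vector<int> buildTwoInputMask(const std::vector<std::pair<int, int>> &II,
                                   unsigned I1Elem) {
  std::vector<int> Mask(II.size());
  for (std::size_t v = 0, e = II.size(); v != e; ++v)
    Mask[v] = II[v].first == -1 ? -1
                                : II[v].first + II[v].second * (int)I1Elem;
  return Mask;
}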
- if (I1Elem < I2Elem) { - std::vector<Constant *> Mask(I2Elem); - unsigned v = 0; - for (; v < I1Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I2Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI1 = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI1->insertBefore(IBeforeJ ? J : I); - I1 = NewI1; - I1Elem = I2Elem; - } else if (I1Elem > I2Elem) { - std::vector<Constant *> Mask(I1Elem); - unsigned v = 0; - for (; v < I2Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I1Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI2 = - new ShuffleVectorInst(I2, UndefValue::get(I2T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI2->insertBefore(IBeforeJ ? J : I); - I2 = NewI2; - } - - // Now that both I1 and I2 are the same length we can shuffle them - // together (and use the result). - std::vector<Constant *> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - if (II[v].first == -1) { - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - int Idx = II[v].first + II[v].second * I1Elem; - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - } - - Instruction *NewOp = - new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - NewOp->insertBefore(IBeforeJ ? J : I); - return NewOp; - } - } - - Type *ArgType = ArgTypeL; - if (numElemL < numElemH) { - if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, IBeforeJ, 1)) { - // This is another short-circuit case: we're combining a scalar into - // a vector that is formed by an IE chain. We've just expanded the IE - // chain, now insert the scalar and we're done. - - Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(IBeforeJ ? I : J, true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH, IBeforeJ)) { - // The two vector inputs to the shuffle must be the same length, - // so extend the smaller vector to be the same length as the larger one. - Instruction *NLOp; - if (numElemL > 1) { - - std::vector<Constant *> Mask(numElemH); - unsigned v = 0; - for (; v < numElemL; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemH; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NLOp->insertBefore(IBeforeJ ? J : I); - LOp = NLOp; - } - - ArgType = ArgTypeH; - } else if (numElemL > numElemH) { - if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType, IBeforeJ)) { - Instruction *S = - InsertElementInst::Create(LOp, HOp, - ConstantInt::get(Type::getInt32Ty(Context), - numElemL), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? 
J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL, IBeforeJ)) { - Instruction *NHOp; - if (numElemH > 1) { - std::vector<Constant *> Mask(numElemL); - unsigned v = 0; - for (; v < numElemH; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemL; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NHOp->insertBefore(IBeforeJ ? J : I); - HOp = NHOp; - } - } - - if (ArgType->isVectorTy()) { - unsigned numElem = VArgType->getVectorNumElements(); - std::vector<Constant*> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - unsigned Idx = v; - // If the low vector was expanded, we need to skip the extra - // undefined entries. - if (v >= numElemL && numElemH > numElemL) - Idx += (numElemH - numElemL); - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - BV->insertBefore(IBeforeJ ? J : I); - return BV; - } - - Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - BV1->insertBefore(IBeforeJ ? J : I); - Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(IBeforeJ ? I : J, - true, o, 2)); - BV2->insertBefore(IBeforeJ ? J : I); - return BV2; - } - - // This function creates an array of values that will be used as the inputs - // to the vector instruction that fuses I with J. - void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, - Instruction *I, Instruction *J, - SmallVectorImpl<Value *> &ReplacedOperands, - bool IBeforeJ) { - unsigned NumOperands = I->getNumOperands(); - - for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { - // Iterate backward so that we look at the store pointer - // first and know whether or not we need to flip the inputs. - - if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { - // This is the pointer for a load/store instruction. - ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); - continue; - } else if (isa<CallInst>(I)) { - Function *F = cast<CallInst>(I)->getCalledFunction(); - Intrinsic::ID IID = F->getIntrinsicID(); - if (o == NumOperands-1) { - BasicBlock &BB = *I->getParent(); - - Module *M = BB.getParent()->getParent(); - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); - continue; - } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && o == 1) { - // The second argument of powi/ctlz/cttz is a single integer/constant - // and we've already checked that both arguments are equal. - // As a result, we just keep I's second argument. 
- ReplacedOperands[o] = I->getOperand(o); - continue; - } - } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { - ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); - continue; - } - - ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); - } - } - - // This function creates two values that represent the outputs of the - // original I and J instructions. These are generally vector shuffles - // or extracts. In many cases, these will end up being unused and, thus, - // eliminated by later passes. - void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2) { - if (isa<StoreInst>(I)) - return; - - Type *IType = I->getType(); - Type *JType = J->getType(); - - VectorType *VType = getVecTypeForPair(IType, JType); - unsigned numElem = VType->getNumElements(); - - unsigned numElemI = getNumScalarElements(IType); - unsigned numElemJ = getNumScalarElements(JType); - - if (IType->isVectorTy()) { - std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); - for (unsigned v = 0; v < numElemI; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); - } - - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask1), - getReplacementName(K, false, 1)); - } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); - } - - if (JType->isVectorTy()) { - std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); - for (unsigned v = 0; v < numElemJ; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); - } - - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask2), - getReplacementName(K, false, 2)); - } else { - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); - K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); - } - - K1->insertAfter(K); - K2->insertAfter(K1); - InsertionPt = K2; - } - - // Move all uses of the function I (including pairing-induced uses) after J. - bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast<Instruction>(L) != J; ++L) - (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); - - assert(cast<Instruction>(L) == J && - "Tracking has not proceeded far enough to check for dependencies"); - // If J is now in the use set of I, then trackUsesOfI will return true - // and we have a dependency cycle (and the fusing operation must abort). - return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); - } - - // Move all uses of the function I (including pairing-induced uses) after J. - void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. 
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { - // Move this instruction - Instruction *InstToMove = &*L++; - - DEBUG(dbgs() << "BBV: moving: " << *InstToMove << - " to after " << *InsertionPt << "\n"); - InstToMove->removeFromParent(); - InstToMove->insertAfter(InsertionPt); - InsertionPt = InstToMove; - } else { - ++L; - } - } - } - - // Collect all load instruction that are in the move set of a given first - // pair member. These loads depend on the first instruction, I, and so need - // to be moved after J (the second instruction) when the pair is fused. - void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - // Note: We cannot end the loop when we reach J because J could be moved - // farther down the use chain by another instruction pairing. Also, J - // could be before I if this is an inverted input. - for (BasicBlock::iterator E = BB.end(); L != E; ++L) { - if (trackUsesOfI(Users, WriteSet, I, &*L)) { - if (L->mayReadFromMemory()) { - LoadMoveSet[&*L].push_back(I); - LoadMoveSetPairs.insert(ValuePair(&*L, I)); - } - } - } - } - - // In cases where both load/stores and the computation of their pointers - // are chosen for vectorization, we can end up in a situation where the - // aliasing analysis starts returning different query results as the - // process of fusing instruction pairs continues. Because the algorithm - // relies on finding the same use dags here as were found earlier, we'll - // need to precompute the necessary aliasing information here and then - // manually update it during the fusion process. - void BBVectorize::collectLoadMoveSet(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast<Instruction>(P->first); - collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, - LoadMoveSetPairs, I); - } - } - - // This function fuses the chosen instruction pairs into vector instructions, - // taking care preserve any needed scalar outputs and, then, it reorders the - // remaining instructions as needed (users of the first member of the pair - // need to be moved to after the location of the second member of the pair - // because the vector instruction is inserted in the location of the pair's - // second member). 
- void BBVectorize::fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { - LLVMContext& Context = BB.getContext(); - - // During the vectorization process, the order of the pairs to be fused - // could be flipped. So we'll add each pair, flipped, into the ChosenPairs - // list. After a pair is fused, the flipped pair is removed from the list. - DenseSet<ValuePair> FlippedPairs; - for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), - E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.insert(ValuePair(P->second, P->first)); - for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), - E = FlippedPairs.end(); P != E; ++P) - ChosenPairs.insert(*P); - - DenseMap<Value *, std::vector<Value *> > LoadMoveSet; - DenseSet<ValuePair> LoadMoveSetPairs; - collectLoadMoveSet(BB, PairableInsts, ChosenPairs, - LoadMoveSet, LoadMoveSetPairs); - - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); - - for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); - if (P == ChosenPairs.end()) { - ++PI; - continue; - } - - if (getDepthFactor(P->first) == 0) { - // These instructions are not really fused, but are tracked as though - // they are. Any case in which it would be interesting to fuse them - // will be taken care of by InstCombine. - --NumFusedOps; - ++PI; - continue; - } - - Instruction *I = cast<Instruction>(P->first), - *J = cast<Instruction>(P->second); - - DEBUG(dbgs() << "BBV: fusing: " << *I << - " <-> " << *J << "\n"); - - // Remove the pair and flipped pair from the list. - DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); - assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); - ChosenPairs.erase(FP); - ChosenPairs.erase(P); - - if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { - DEBUG(dbgs() << "BBV: fusion of: " << *I << - " <-> " << *J << - " aborted because of non-trivial dependency cycle\n"); - --NumFusedOps; - ++PI; - continue; - } - - // If the pair must have the other order, then flip it. - bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); - if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { - // This pair does not have a fixed order, and so we might want to - // flip it if that will yield fewer shuffles. We count the number - // of dependencies connected via swaps, and those directly connected, - // and flip the order if the number of swaps is greater. 
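The flip decision described in the comment above boils down to a count: dependencies that reach the pair through swapped connections argue for fusing in the opposite order, because flipping turns them into direct connections and saves shuffles (splat connections are not counted). A minimal sketch of that decision, with ConnectionType standing in for the pass's PairConnectionDirect/PairConnectionSwap/PairConnectionSplat tags:

#include <vector>

enum ConnectionType { Direct, Swap, Splat };

// Flip the pair if more of its dependencies are connected via swaps than
// directly; splat connections are ignored by the count.
bool shouldFlipPair(const std::vector<ConnectionType> &DepConnections) {
  unsigned NumDirect = 0, NumSwap = 0;
  for (ConnectionType C : DepConnections) {
    if (C == Direct)
      ++NumDirect;
    else if (C == Swap)
      ++NumSwap;
  }
  return NumSwap > NumDirect;
}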
- bool OrigOrder = true; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = - ConnectedPairDeps.find(ValuePair(I, J)); - if (IJ == ConnectedPairDeps.end()) { - IJ = ConnectedPairDeps.find(ValuePair(J, I)); - OrigOrder = false; - } - - if (IJ != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector<ValuePair>::iterator T = IJ->second.begin(), - TE = IJ->second.end(); T != TE; ++T) { - VPPair Q(IJ->first, *T); - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - if (!OrigOrder) - std::swap(NumDepsDirect, NumDepsSwap); - - if (NumDepsSwap > NumDepsDirect) { - FlipPairOrder = true; - DEBUG(dbgs() << "BBV: reordering pair: " << *I << - " <-> " << *J << "\n"); - } - } - } - - Instruction *L = I, *H = J; - if (FlipPairOrder) - std::swap(H, L); - - // If the pair being fused uses the opposite order from that in the pair - // connection map, then we need to flip the types. - DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = - ConnectedPairs.find(ValuePair(H, L)); - if (HL != ConnectedPairs.end()) - for (std::vector<ValuePair>::iterator T = HL->second.begin(), - TE = HL->second.end(); T != TE; ++T) { - VPPair Q(HL->first, *T); - DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - R->second = PairConnectionSwap; - else if (R->second == PairConnectionSwap) - R->second = PairConnectionDirect; - } - - bool LBeforeH = !FlipPairOrder; - unsigned NumOperands = I->getNumOperands(); - SmallVector<Value *, 3> ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, L, H, ReplacedOperands, - LBeforeH); - - // Make a copy of the original operation, change its type to the vector - // type and replace its operands with the vector operands. - Instruction *K = L->clone(); - if (L->hasName()) - K->takeName(L); - else if (H->hasName()) - K->takeName(H); - - if (auto CS = CallSite(K)) { - SmallVector<Type *, 3> Tys; - FunctionType *Old = CS.getFunctionType(); - unsigned NumOld = Old->getNumParams(); - assert(NumOld <= ReplacedOperands.size()); - for (unsigned i = 0; i != NumOld; ++i) - Tys.push_back(ReplacedOperands[i]->getType()); - CS.mutateFunctionType( - FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), - Tys, Old->isVarArg())); - } else if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_group}; - combineMetadata(K, H, KnownIDs); - K->andIRFlags(H); - - for (unsigned o = 0; o < NumOperands; ++o) - K->setOperand(o, ReplacedOperands[o]); - - K->insertAfter(J); - - // Instruction insertion point: - Instruction *InsertionPt = K; - Instruction *K1 = nullptr, *K2 = nullptr; - replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); - - // The use dag of the first original instruction must be moved to after - // the location of the second instruction. The entire use dag of the - // first instruction is disjoint from the input dag of the second - // (by definition), and so commutes with it. 
- - moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); - - if (!isa<StoreInst>(I)) { - L->replaceAllUsesWith(K1); - H->replaceAllUsesWith(K2); - } - - // Instructions that may read from memory may be in the load move set. - // Once an instruction is fused, we no longer need its move set, and so - // the values of the map never need to be updated. However, when a load - // is fused, we need to merge the entries from both instructions in the - // pair in case those instructions were in the move set of some other - // yet-to-be-fused pair. The loads in question are the keys of the map. - if (I->mayReadFromMemory()) { - std::vector<ValuePair> NewSetMembers; - DenseMap<Value *, std::vector<Value *> >::iterator II = - LoadMoveSet.find(I); - if (II != LoadMoveSet.end()) - for (std::vector<Value *>::iterator N = II->second.begin(), - NE = II->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - DenseMap<Value *, std::vector<Value *> >::iterator JJ = - LoadMoveSet.find(J); - if (JJ != LoadMoveSet.end()) - for (std::vector<Value *>::iterator N = JJ->second.begin(), - NE = JJ->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), - AE = NewSetMembers.end(); A != AE; ++A) { - LoadMoveSet[A->first].push_back(A->second); - LoadMoveSetPairs.insert(*A); - } - } - - // Before removing I, set the iterator to the next instruction. - PI = std::next(BasicBlock::iterator(I)); - if (cast<Instruction>(PI) == J) - ++PI; - - SE->forgetValue(I); - SE->forgetValue(J); - I->eraseFromParent(); - J->eraseFromParent(); - - DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << - BB << "\n"); - } - - DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); - } -} - -char BBVectorize::ID = 0; -static const char bb_vectorize_name[] = "Basic-Block Vectorization"; -INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) - -BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { - return new BBVectorize(C); -} - -bool -llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, *BB.getParent(), C); - return BBVectorizer.vectorizeBB(BB); -} - -//===----------------------------------------------------------------------===// -VectorizeConfig::VectorizeConfig() { - VectorBits = ::VectorBits; - VectorizeBools = !::NoBools; - VectorizeInts = !::NoInts; - VectorizeFloats = !::NoFloats; - VectorizePointers = !::NoPointers; - VectorizeCasts = !::NoCasts; - VectorizeMath = !::NoMath; - VectorizeBitManipulations = !::NoBitManipulation; - VectorizeFMA = !::NoFMA; - VectorizeSelect = !::NoSelect; - VectorizeCmp = !::NoCmp; - VectorizeGEP = !::NoGEP; - VectorizeMemOps = !::NoMemOps; - AlignedOnly = ::AlignedOnly; - ReqChainDepth= ::ReqChainDepth; - SearchLimit = ::SearchLimit; - MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; - SplatBreaksChain = ::SplatBreaksChain; - MaxInsts = ::MaxInsts; - MaxPairs = ::MaxPairs; - MaxIter = 
::MaxIter; - Pow2LenOnly = ::Pow2LenOnly; - NoMemOpBoost = ::NoMemOpBoost; - FastDep = ::FastDep; -} diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 395f440bda47..1aea73cd4a32 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMVectorize - BBVectorize.cpp LoadStoreVectorizer.cpp LoopVectorize.cpp SLPVectorizer.cpp diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index eac2867233bc..193cc4d13787 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -114,12 +114,13 @@ static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -/// We don't vectorize loops with a known constant trip count below this number. +/// Loops with a known constant trip count below this number are vectorized only +/// if no scalar iteration overheads are incurred. static cl::opt<unsigned> TinyTripCountVectorThreshold( "vectorizer-min-trip-count", cl::init(16), cl::Hidden, - cl::desc("Don't vectorize loops with a constant " - "trip count that is smaller than this " - "value.")); + cl::desc("Loops with a constant trip count that is smaller than this " + "value are vectorized only if no scalar iteration overheads " + "are incurred.")); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -532,21 +533,34 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// Return a constant reference to the VectorParts corresponding to \p V from - /// the original loop. If the value has already been vectorized, the - /// corresponding vector entry in VectorLoopValueMap is returned. If, + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// new vector values on-demand by inserting the scalar values into vectors + /// a new vector value on-demand by inserting the scalar values into a vector /// with an insertelement sequence. If the value has been neither vectorized /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into vectors. - const VectorParts &getVectorValue(Value *V); + /// value into a vector. 
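The comment above describes the new on-demand widening contract for LoopVectorize: values are created lazily per unroll part, cached, and broadcast only when nothing better exists. A minimal model of that per-part map, assuming a Broadcast callback stands in for the real scalar-to-vector widening paths and omitting the per-lane scalar side; the names mirror the patch, but this is a sketch, not the LLVM class:

#include <cassert>
#include <functional>
#include <map>
#include <utility>

// Each original value may have up to UF vector parts in the new loop,
// created lazily and cached on first use.
template <typename Value> class PerPartValueMap {
public:
  PerPartValueMap(unsigned UF, std::function<Value(Value)> Broadcast)
      : UF(UF), Broadcast(std::move(Broadcast)) {}

  bool hasVectorValue(Value Key, unsigned Part) const {
    assert(Part < UF && "Queried part is too large");
    return Storage.count({Key, Part});
  }

  // Entries may be set only once; later fix-ups would go through a separate
  // reset method, as in the patch's resetVectorValue.
  void setVectorValue(Value Key, unsigned Part, Value Vec) {
    assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
    Storage[{Key, Part}] = Vec;
  }

  // On-demand creation: return the cached part if present, otherwise fall
  // back to broadcasting the (loop-invariant) original value and cache that.
  Value getOrCreateVectorValue(Value Key, unsigned Part) {
    if (hasVectorValue(Key, Part))
      return Storage[{Key, Part}];
    Value Vec = Broadcast(Key);
    setVectorValue(Key, Part, Vec);
    return Vec;
  }

private:
  unsigned UF;
  std::function<Value(Value)> Broadcast;
  std::map<std::pair<Value, unsigned>, Value> Storage;
};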
+ Value *getOrCreateVectorValue(Value *V, unsigned Part); /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll index \p Part and vector index \p Lane. If the value has /// been vectorized but not scalarized, the necessary extractelement /// instruction will be generated. - Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); + Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -601,90 +615,103 @@ protected: /// UF x VF scalar values in the new loop. UF and VF are the unroll and /// vectorization factors, respectively. /// - /// Entries can be added to either map with initVector and initScalar, which - /// initialize and return a constant reference to the new entry. If a - /// non-constant reference to a vector entry is required, getVector can be - /// used to retrieve a mutable entry. We currently directly modify the mapped - /// values during "fix-up" operations that occur once the first phase of - /// widening is complete. These operations include type truncation and the - /// second phase of recurrence widening. + /// Entries can be added to either map with setVectorValue and setScalarValue, + /// which assert that an entry was not already added before. If an entry is to + /// replace an existing one, call resetVectorValue. This is currently needed + /// to modify the mapped values during "fix-up" operations that occur once the + /// first phase of widening is complete. These operations include type + /// truncation and the second phase of recurrence widening. /// - /// Otherwise, entries from either map should be accessed using the - /// getVectorValue or getScalarValue functions from InnerLoopVectorizer. - /// getVectorValue and getScalarValue coordinate to generate a vector or - /// scalar value on-demand if one is not yet available. When vectorizing a - /// loop, we visit the definition of an instruction before its uses. When - /// visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. + /// Entries from either map can be retrieved using the getVectorValue and + /// getScalarValue functions, which assert that the desired value exists. + struct ValueMap { /// Construct an empty map with the given unroll and vectorization factors. - ValueMap(unsigned UnrollFactor, unsigned VecWidth) - : UF(UnrollFactor), VF(VecWidth) { - // The unroll and vectorization factors are only used in asserts builds - // to verify map entries are sized appropriately. - (void)UF; - (void)VF; + ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. 
+ bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; } - /// \return True if the map has a vector entry for \p Key. - bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } - - /// \return True if the map has a scalar entry for \p Key. - bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } - - /// \brief Map \p Key to the given VectorParts \p Entry, and return a - /// constant reference to the new vector map entry. The given key should - /// not already be in the map, and the given VectorParts should be - /// correctly sized for the current unroll factor. - const VectorParts &initVector(Value *Key, const VectorParts &Entry) { - assert(!hasVector(Key) && "Vector entry already initialized"); - assert(Entry.size() == UF && "VectorParts has wrong dimensions"); - VectorMapStorage[Key] = Entry; - return VectorMapStorage[Key]; + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); } - /// \brief Map \p Key to the given ScalarParts \p Entry, and return a - /// constant reference to the new scalar map entry. The given key should - /// not already be in the map, and the given ScalarParts should be - /// correctly sized for the current unroll and vectorization factors. - const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) { - assert(!hasScalar(Key) && "Scalar entry already initialized"); - assert(Entry.size() == UF && - all_of(make_range(Entry.begin(), Entry.end()), - [&](const SmallVectorImpl<Value *> &Values) -> bool { - return Values.size() == VF; - }) && - "ScalarParts has wrong dimensions"); - ScalarMapStorage[Key] = Entry; - return ScalarMapStorage[Key]; + /// \return True if the map has a scalar entry for \p Key, \p Part and + /// \p Part. + bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { + assert(Part < UF && "Queried Scalar Part is too large."); + assert(Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); + return Entry[Part][Lane] != nullptr; } - /// \return A reference to the vector map entry corresponding to \p Key. - /// The key should already be in the map. This function should only be used - /// when it's necessary to update values that have already been vectorized. - /// This is the case for "fix-up" operations including type truncation and - /// the second phase of recurrence vectorization. If a non-const reference - /// isn't required, getVectorValue should be used instead. - VectorParts &getVector(Value *Key) { - assert(hasVector(Key) && "Vector entry not initialized"); - return VectorMapStorage.find(Key)->second; + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; } - /// Retrieve an entry from the vector or scalar maps. 
The preferred way to - /// access an existing mapped entry is with getVectorValue or - /// getScalarValue from InnerLoopVectorizer. Until those functions can be - /// moved inside ValueMap, we have to declare them as friends. - friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); - friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, - unsigned Lane); + /// Retrieve the existing scalar value that corresponds to \p Key, \p Part + /// and \p Lane. + Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { + assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); + return ScalarMapStorage[Key][Part][Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set. If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key for \p Part and \p Lane. + /// Assumes such a value is not already set. + void setScalarValue(Value *Key, unsigned Part, unsigned Lane, + Value *Scalar) { + assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Part][Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. + /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } private: /// The unroll factor. Each entry in the vector map contains UF vector @@ -1577,6 +1604,9 @@ public: /// Return the first-order recurrences found in the loop. RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + /// Return the set of instructions to sink to handle first-order recurrences. + DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -1779,6 +1809,9 @@ private: InductionList Inductions; /// Holds the phi nodes that are first-order recurrences. RecurrenceSet FirstOrderRecurrences; + /// Holds instructions that need to sink past other instructions to handle + /// first-order recurrences. + DenseMap<Instruction *, Instruction *> SinkAfter; /// Holds the widest induction type encountered. 
Type *WidestIndTy; @@ -2417,15 +2450,13 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); Instruction *LastInduction = VecInd; - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = LastInduction; + VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + if (isa<TruncInst>(EntryVal)) + addMetadata(LastInduction, EntryVal); LastInduction = cast<Instruction>(addFastMathFlag( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); } - VectorLoopValueMap.initVector(EntryVal, Entry); - if (isa<TruncInst>(EntryVal)) - addMetadata(Entry, EntryVal); // Move the last step to the end of the latch block. This ensures consistent // placement of all induction updates. @@ -2531,13 +2562,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // induction variable, and build the necessary step vectors. if (!VectorizedIV) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); - VectorLoopValueMap.initVector(EntryVal, Entry); - if (Trunc) - addMetadata(Entry, Trunc); + VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + if (Trunc) + addMetadata(EntryPart, Trunc); + } } // If an induction variable is only used for counting loop iterations or @@ -2637,17 +2668,14 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. - ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - Entry[Part][Lane] = Add; + VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); } } - VectorLoopValueMap.initScalar(EntryVal, Entry); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { @@ -2665,8 +2693,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -const InnerLoopVectorizer::VectorParts & -InnerLoopVectorizer::getVectorValue(Value *V) { +Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); assert(!V->getType()->isVoidTy() && "Type does not produce a value"); @@ -2675,17 +2702,16 @@ InnerLoopVectorizer::getVectorValue(Value *V) { if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); - // If we have this scalar in the map, return it. - if (VectorLoopValueMap.hasVector(V)) - return VectorLoopValueMap.VectorMapStorage[V]; + // If we have a vector mapped to this value, return it. + if (VectorLoopValueMap.hasVectorValue(V, Part)) + return VectorLoopValueMap.getVectorValue(V, Part); // If the value has not been vectorized, check if it has been scalarized // instead. If it has been scalarized, and we actually need the value in // vector form, we will construct the vector values on demand. 
- if (VectorLoopValueMap.hasScalar(V)) { + if (VectorLoopValueMap.hasAnyScalarValue(V)) { - // Initialize a new vector map entry. - VectorParts Entry(UF); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); // If we've scalarized a value, that value should be an instruction. auto *I = cast<Instruction>(V); @@ -2693,17 +2719,17 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. if (VF == 1) { - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getScalarValue(V, Part, 0); - return VectorLoopValueMap.initVector(V, Entry); + VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); + return ScalarValue; } - // Get the last scalar instruction we generated for V. If the value is - // known to be uniform after vectorization, this corresponds to lane zero - // of the last unroll iteration. Otherwise, the last instruction is the one - // we created for the last vector lane of the last unroll iteration. + // Get the last scalar instruction we generated for V and Part. If the value + // is known to be uniform after vectorization, this corresponds to lane zero + // of the Part unroll iteration. Otherwise, the last instruction is the one + // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); + auto *LastInst = + cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2717,52 +2743,50 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // iteration. Otherwise, we construct the vector values using insertelement // instructions. Since the resulting vectors are stored in // VectorLoopValueMap, we will only generate the insertelements once. - for (unsigned Part = 0; Part < UF; ++Part) { - Value *VectorValue = nullptr; - if (Cost->isUniformAfterVectorization(I, VF)) { - VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); - } else { - VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); - for (unsigned Lane = 0; Lane < VF; ++Lane) - VectorValue = Builder.CreateInsertElement( - VectorValue, getScalarValue(V, Part, Lane), - Builder.getInt32(Lane)); - } - Entry[Part] = VectorValue; + Value *VectorValue = nullptr; + if (Cost->isUniformAfterVectorization(I, VF)) { + VectorValue = getBroadcastInstrs(ScalarValue); + } else { + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Lane = 0; Lane < VF; ++Lane) + VectorValue = Builder.CreateInsertElement( + VectorValue, getOrCreateScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); } + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); - return VectorLoopValueMap.initVector(V, Entry); + return VectorValue; } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. 
Value *B = getBroadcastInstrs(V); - return VectorLoopValueMap.initVector(V, VectorParts(UF, B)); + VectorLoopValueMap.setVectorValue(V, Part, B); + return B; } -Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, - unsigned Lane) { +Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, + unsigned Lane) { // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? - !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalar(V)) - return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) + return VectorLoopValueMap.getScalarValue(V, Part, Lane); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getVectorValue(V)[Part]; + auto *U = getOrCreateVectorValue(V, Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2844,7 +2868,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getScalarValue(Ptr, Part, 0); + Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2887,7 +2911,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (!Member) continue; - VectorParts Entry(UF); Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( @@ -2899,10 +2922,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - Entry[Part] = - Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + if (Group->isReverse()) + StridedVec = reverseVector(StridedVec); + + VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); } - VectorLoopValueMap.initVector(Member, Entry); } return; } @@ -2919,8 +2943,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Instruction *Member = Group->getMember(i); assert(Member && "Fail to get a member from an interleaved store group"); - Value *StoredVec = - getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part]; + Value *StoredVec = getOrCreateVectorValue( + cast<StoreInst>(Member)->getValueOperand(), Part); if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2981,16 +3005,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); - VectorParts VectorGep; + // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector + // gather/scatter. Otherwise Decision should have been to Scalarize. 
+ assert((ConsecutiveStride || CreateGatherScatter) && + "The instruction should be scalarized"); // Handle consecutive loads/stores. - if (ConsecutiveStride) { - Ptr = getScalarValue(Ptr, 0, 0); - } else { - // At this point we should vector version of GEP for Gather or Scatter - assert(CreateGatherScatter && "The instruction should be scalarized"); - VectorGep = getVectorValue(Ptr); - } + if (ConsecutiveStride) + Ptr = getOrCreateScalarValue(Ptr, 0, 0); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -2998,16 +3020,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert(!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses"); setDebugLocFromInst(Builder, SI); - // We don't want to update the value in the map as it might be used in - // another expression. So don't use a reference type for "StoredVal". - VectorParts StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; + Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; - NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part], - Alignment, MaskPart); + Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); } else { // Calculate the pointer for the specific unroll-part. Value *PartPtr = @@ -3016,7 +3037,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (Reverse) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); + StoredVal = reverseVector(StoredVal); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = @@ -3030,11 +3054,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); if (Legal->isMaskRequired(SI)) - NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask[Part]); else - NewSI = - Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } addMetadata(NewSI, SI); } @@ -3044,14 +3067,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewLI; + Value *NewLI; if (CreateGatherScatter) { Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; - NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart, + Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); - Entry[Part] = NewLI; + addMetadata(NewLI, LI); } else { // Calculate the pointer for the specific unroll-part. 
Value *PartPtr = @@ -3073,11 +3096,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); - Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; + + // Add metadata to the load, but setVectorValue to the reverse shuffle. + addMetadata(NewLI, LI); + if (Reverse) + NewLI = reverseVector(NewLI); } - addMetadata(NewLI, LI); + VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); } - VectorLoopValueMap.initVector(Instr, Entry); } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, @@ -3094,9 +3120,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - // Initialize a new scalar map entry. - ScalarParts Entry(UF); - VectorParts Cond; if (IfPredicateInstr) Cond = createBlockInMask(Instr->getParent()); @@ -3108,7 +3131,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); // For each scalar that we create: for (unsigned Lane = 0; Lane < Lanes; ++Lane) { @@ -3129,7 +3151,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane); + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3138,7 +3160,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, Builder.Insert(Cloned); // Add the cloned scalar to the scalar map entry. - Entry[Part][Lane] = Cloned; + VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) @@ -3150,7 +3172,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } - VectorLoopValueMap.initScalar(Instr, Entry); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3786,10 +3807,10 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from VectorLoopValueMap indicates that it // wasn't vectorized. - if (!VectorLoopValueMap.hasVector(KV.first)) + if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) continue; - VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); - for (Value *&I : Parts) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *I = getOrCreateVectorValue(KV.first, Part); if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); @@ -3878,7 +3899,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { I->replaceAllUsesWith(Res); cast<Instruction>(I)->eraseFromParent(); Erased.insert(I); - I = Res; + VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); } } @@ -3887,15 +3908,15 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from VectorLoopValueMap indicates that it // wasn't vectorized. 
- if (!VectorLoopValueMap.hasVector(KV.first)) + if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) continue; - VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); - for (Value *&I : Parts) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *I = getOrCreateVectorValue(KV.first, Part); ZExtInst *Inst = dyn_cast<ZExtInst>(I); if (Inst && Inst->use_empty()) { Value *NewI = Inst->getOperand(0); Inst->eraseFromParent(); - I = NewI; + VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); } } } @@ -4025,28 +4046,29 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi); - Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); + Builder.SetInsertPoint( + cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); // Create a phi node for the new recurrence. The current value will either be // the initial value inserted into a vector or loop-varying vector value. auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - // Get the vectorized previous value. - auto &PreviousParts = getVectorValue(Previous); + // Get the vectorized previous value of the last part UF - 1. It appears last + // among all unrolled iterations, due to the order of their construction. + Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not // guaranteed to be an instruction in the vector loop. Also, if the previous // value is a phi node, we should insert after all the phi nodes to avoid // breaking basic block verification. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || - isa<PHINode>(PreviousParts[UF - 1])) + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || + isa<PHINode>(PreviousLastPart)) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); + &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -4061,15 +4083,16 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Shuffle the current and previous vector and update the vector parts. for (unsigned Part = 0; Part < UF; ++Part) { + Value *PreviousPart = getOrCreateVectorValue(Previous, Part); + Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); auto *Shuffle = - VF > 1 - ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part], - ConstantVector::get(ShuffleMask)) - : Incoming; - PhiParts[Part]->replaceAllUsesWith(Shuffle); - cast<Instruction>(PhiParts[Part])->eraseFromParent(); - PhiParts[Part] = Shuffle; - Incoming = PreviousParts[Part]; + VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, + ConstantVector::get(ShuffleMask)) + : Incoming; + PhiPart->replaceAllUsesWith(Shuffle); + cast<Instruction>(PhiPart)->eraseFromParent(); + VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); + Incoming = PreviousPart; } // Fix the latch value of the new recurrence in the vector loop. 
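The shuffle mask used in the recurrence hunk above combines the last lane of the vector carried in from the previous part with the first VF-1 lanes of the current part. Only its use appears in the surrounding hunks, so the construction below is a sketch (helper name hypothetical, with Builder and VF as in the enclosing function):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Build the mask {VF-1, VF, VF+1, ..., 2*VF-2}: lane 0 reads the last element
// of the first shuffle operand, lanes 1..VF-1 read elements 0..VF-2 of the
// second operand.
static llvm::Constant *makeRecurrenceMask(llvm::IRBuilder<> &Builder,
                                          unsigned VF) {
  llvm::SmallVector<llvm::Constant *, 8> Mask(VF);
  Mask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = Builder.getInt32(I + VF - 1);
  return llvm::ConstantVector::get(Mask);
}
// Used as: Builder.CreateShuffleVector(Incoming, PreviousPart,
//                                      makeRecurrenceMask(Builder, VF));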
@@ -4097,7 +4120,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // `Incoming`. This is analogous to the vectorized case above: extracting the // second last element when VF > 1. else if (UF > 1) - ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2]; + ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -4148,8 +4171,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - const VectorParts &VectorExit = getVectorValue(LoopExitInst); - Type *VecTy = VectorExit[0]->getType(); + Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. @@ -4187,18 +4209,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reductions do not have to start at zero. They can start with // any loop invariant values. - const VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - const VectorParts &Val = getVectorValue(LoopVal); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); + Value *Val = getOrCreateVectorValue(LoopVal, Part); // Make sure to add the reduction stat value only to the // first unroll part. - Value *StartVal = (part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(StartVal, LoopVectorPreHeader); - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + Value *StartVal = (Part == 0) ? VectorStart : Identity; + cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); + cast<PHINode>(VecRdxPhi) + ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); } // Before each round, move the insertion point right between @@ -4207,7 +4228,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); // If the vector reduction can be performed in a smaller type, we truncate @@ -4216,37 +4236,42 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint(LoopVectorBody->getTerminator()); - for (unsigned part = 0; part < UF; ++part) { - Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + VectorParts RdxParts(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (Value::user_iterator UI = RdxParts[part]->user_begin(); - UI != RdxParts[part]->user_end();) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[Part]->user_begin(); + UI != RdxParts[Part]->user_end();) if (*UI != Trunc) { - (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); - RdxParts[part] = Extnd; + (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); + RdxParts[Part] = Extnd; } else { ++UI; } } Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - for (unsigned part = 0; part < UF; ++part) - RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + for (unsigned Part = 0; Part < UF; ++Part) { + RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); + } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; + Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); setDebugLocFromInst(Builder, ReducedPartRdx); - for (unsigned part = 1; part < UF; ++part) { + for (unsigned Part = 1; Part < UF; ++Part) { + Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); if (Op != Instruction::ICmp && Op != Instruction::FCmp) // Floating point operations had to be 'fast' to enable the reduction. ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], + Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx")); else ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); + Builder, MinMaxKind, ReducedPartRdx, RdxPart); } if (VF > 1) { @@ -4518,14 +4543,16 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(BI && "Unexpected terminator found"); if (BI->isConditional()) { - VectorParts EdgeMask = getVectorValue(BI->getCondition()); - if (BI->getSuccessor(0) != Dst) - for (unsigned part = 0; part < UF; ++part) - EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); + VectorParts EdgeMask(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); + if (BI->getSuccessor(0) != Dst) + EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); - for (unsigned part = 0; part < UF; ++part) - EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); + EdgeMask[Part] = EdgeMaskPart; + } EdgeMaskCache[Edge] = EdgeMask; return EdgeMask; @@ -4544,23 +4571,27 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { if (BCEntryIt != BlockMaskCache.end()) return BCEntryIt->second; + VectorParts BlockMask(UF); + // Loop incoming mask is all-one. if (OrigLoop->getHeader() == BB) { Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); - const VectorParts &BlockMask = getVectorValue(C); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = getOrCreateVectorValue(C, Part); BlockMaskCache[BB] = BlockMask; return BlockMask; } // This is the block mask. We OR all incoming edges, and with zero. 
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); - VectorParts BlockMask = getVectorValue(Zero); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = getOrCreateVectorValue(Zero, Part); // For each pred: - for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { - VectorParts EM = createEdgeMask(*it, BB); - for (unsigned part = 0; part < UF; ++part) - BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); + for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) { + VectorParts EM = createEdgeMask(*It, BB); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]); } BlockMaskCache[BB] = BlockMask; @@ -4575,15 +4606,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { - VectorParts Entry(UF); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create( + Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); + VectorLoopValueMap.setVectorValue(P, Part, EntryPart); } - VectorLoopValueMap.initVector(P, Entry); return; } @@ -4607,21 +4637,22 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); - const VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part); // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. if (In == 0) - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]); + Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0); else // Select between the current value and the previous incoming edge // based on the incoming mask. - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part], + Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part], "predphi"); } } - VectorLoopValueMap.initVector(P, Entry); + for (unsigned Part = 0; Part < UF; ++Part) + VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]); return; } @@ -4652,18 +4683,15 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. 
- ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - Entry[Part][Lane] = SclrGep; + VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); } } - VectorLoopValueMap.initScalar(P, Entry); return; } } @@ -4713,7 +4741,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. auto *GEP = cast<GetElementPtrInst>(&I); - VectorParts Entry(UF); if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { // If we are vectorizing, but the GEP has only loop-invariant operands, @@ -4729,8 +4756,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // collectLoopScalars() and teach getVectorValue() to broadcast // the lane-zero scalar value. auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateVectorSplat(VF, Clone); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); + addMetadata(EntryPart, GEP); + } } else { // If the GEP has at least one loop-varying operand, we are sure to // produce a vector of pointers. But if we are only unrolling, we want @@ -4743,9 +4773,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getVectorValue(GEP->getPointerOperand())[Part]; + auto *Ptr = + OrigLoop->isLoopInvariant(GEP->getPointerOperand()) + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. @@ -4754,7 +4785,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { if (OrigLoop->isLoopInvariant(U.get())) Indices.push_back(U.get()); else - Indices.push_back(getVectorValue(U.get())[Part]); + Indices.push_back(getOrCreateVectorValue(U.get(), Part)); } // Create the new GEP. Note that this GEP may be a scalar if VF == 1, @@ -4764,12 +4795,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { : Builder.CreateGEP(Ptr, Indices); assert((VF == 1 || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - Entry[Part] = NewGEP; + VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); + addMetadata(NewGEP, GEP); } } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, GEP); break; } case Instruction::UDiv: @@ -4800,22 +4830,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // Just widen binops. auto *BinOp = cast<BinaryOperator>(&I); setDebugLocFromInst(Builder, BinOp); - const VectorParts &A = getVectorValue(BinOp->getOperand(0)); - const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - // Use this vector value for all users of the original instruction. 
- VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part); + Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) VecOp->copyIRFlags(BinOp); - Entry[Part] = V; + // Use this vector value for all users of the original instruction. + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, BinOp); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, BinOp); break; } case Instruction::Select: { @@ -4831,20 +4859,19 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - const VectorParts &Cond = getVectorValue(I.getOperand(0)); - const VectorParts &Op0 = getVectorValue(I.getOperand(1)); - const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = Builder.CreateSelect( - InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); + Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); + Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); + Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); + Value *Sel = + Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); + VectorLoopValueMap.setVectorValue(&I, Part, Sel); + addMetadata(Sel, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -4854,22 +4881,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = dyn_cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); - const VectorParts &A = getVectorValue(Cmp->getOperand(0)); - const VectorParts &B = getVectorValue(Cmp->getOperand(1)); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { + Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); + Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); Value *C = nullptr; if (FCmp) { - C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); cast<FCmpInst>(C)->copyFastMathFlags(Cmp); } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - Entry[Part] = C; + VectorLoopValueMap.setVectorValue(&I, Part, C); + addMetadata(C, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -4906,12 +4931,12 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { Type *DestTy = (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); - const VectorParts &A = getVectorValue(CI->getOperand(0)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + VectorLoopValueMap.setVectorValue(&I, Part, Cast); + addMetadata(Cast, &I); + } break; } @@ -4949,17 +4974,14 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { break; } - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { Value *Arg = CI->getArgOperand(i); // Some intrinsics have a scalar argument - don't replace it with a // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); - Arg = VectorArg[Part]; - } + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) + Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); Args.push_back(Arg); } @@ -4992,11 +5014,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - Entry[Part] = V; + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -5363,7 +5384,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, + SinkAfter, DT)) { FirstOrderRecurrences.insert(Phi); continue; } @@ -7636,6 +7658,15 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { // 2. Copy and widen instructions from the old loop into the new loop. + // Move instructions to handle first-order recurrences. + DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter(); + for (auto &Entry : SinkAfter) { + Entry.first->removeFromParent(); + Entry.first->insertAfter(Entry.second); + DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second + << " to vectorize a 1st order recurrence.\n"); + } + // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For // example, original induction update instructions can become dead because we @@ -7787,8 +7818,25 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Check the loop for a trip count threshold: - // do not vectorize loops with a tiny trip count. + PredicatedScalarEvolution PSE(*SE, *L); + + // Check if it is legal to vectorize the loop. + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, + &Requirements, &Hints); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + emitMissedWarning(F, L, Hints, ORE); + return false; + } + + // Check the function attributes to find out if this function should be + // optimized for size. 
+ bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Check the loop for a trip count threshold: vectorize loops with a tiny trip + // count by optimizing for size, to minimize overheads. unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); bool HasExpectedTC = (ExpectedTC > 0); @@ -7802,36 +7850,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is not worth vectorizing."); + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NotBeneficial", L) - << "vectorization is not beneficial " - "and is not explicitly forced"); - return false; + // Loops with a very small trip count are considered for vectorization + // under OptForSize, thereby making sure the cost of their loop body is + // dominant, free of runtime guards and scalar iteration overheads. + OptForSize = true; } } - PredicatedScalarEvolution PSE(*SE, *L); - - // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements(*ORE); - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); - if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints, ORE); - return false; - } - - // Check the function attributes to find out if this function should be - // optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index b267230d3185..b494526369d6 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -173,6 +173,11 @@ static unsigned getAltOpcode(unsigned Op) { } } +/// true if the \p Value is odd, false otherwise. +static bool isOdd(unsigned Value) { + return Value & 1; +} + ///\returns bool representing if Opcode \p Op can be part /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. @@ -190,7 +195,7 @@ static unsigned isAltInst(ArrayRef<Value *> VL) { unsigned AltOpcode = getAltOpcode(Opcode); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) + if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode)) return 0; } return Instruction::ShuffleVector; @@ -504,7 +509,7 @@ private: Last->NeedToGather = !Vectorized; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } } else { @@ -521,6 +526,20 @@ private: /// Holds all of the tree entries. 
std::vector<TreeEntry> VectorizableTree; + TreeEntry *getTreeEntry(Value *V) { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, int> ScalarToTreeEntry; @@ -1048,14 +1067,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. - if (Entry->NeedToGather) - continue; - // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { @@ -1072,9 +1091,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, continue; // Skip in-tree scalars that become vectors - if (ScalarToTreeEntry.count(U)) { - int Idx = ScalarToTreeEntry[U]; - TreeEntry *UseEntry = &VectorizableTree[Idx]; + if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will @@ -1083,7 +1100,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + assert(!UseEntry->NeedToGather && "Bad state"); continue; } } @@ -1156,9 +1173,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check if this is a duplicate of another entry. - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[0])) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { @@ -1997,7 +2012,7 @@ int BoUpSLP::getSpillCost() { // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) + if (isa<Instruction>(&*J) && getTreeEntry(&*J)) LiveValues.insert(cast<Instruction>(&*J)); } @@ -2393,9 +2408,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (ScalarToTreeEntry.count(VL[i])) { - int Idx = ScalarToTreeEntry[VL[i]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[i])) { // Find which lane we need to extract. 
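The getTreeEntry helper introduced above folds the old count()-then-operator[] pair into a single hash lookup that returns nullptr for scalars outside the tree, which is what allows the if (TreeEntry *E = getTreeEntry(...)) call sites in these hunks. A generic sketch of that pattern, with hypothetical names, assuming the SmallDenseMap index and std::vector side table used in this file:

#include "llvm/ADT/DenseMap.h"
#include <vector>

// One find() instead of count() + operator[]; nullptr means "not in the tree".
template <typename KeyT, typename EntryT>
static EntryT *lookupEntry(llvm::SmallDenseMap<KeyT, int> &Index,
                           std::vector<EntryT> &Table, KeyT Key) {
  auto It = Index.find(Key);
  return It == Index.end() ? nullptr : &Table[It->second];
}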
int FoundLane = -1; for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { @@ -2415,11 +2428,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { } Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { - SmallDenseMap<Value*, int>::const_iterator Entry - = ScalarToTreeEntry.find(VL[0]); - if (Entry != ScalarToTreeEntry.end()) { - int Idx = Entry->second; - const TreeEntry *En = &VectorizableTree[Idx]; + if (const TreeEntry *En = getTreeEntry(VL[0])) { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -2427,12 +2436,9 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[0])) if (E->isSame(VL)) return vectorizeTree(E); - } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -2667,9 +2673,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(LI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); + Value *PO = LI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); @@ -2700,9 +2706,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(SI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); + Value *PO = SI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); if (!Alignment) { Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); @@ -2783,7 +2789,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) + if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); E->VectorizedValue = V; @@ -2819,7 +2825,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned e = E->Scalars.size(); SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { - if (i & 1) { + if (isOdd(i)) { Mask[i] = Builder.getInt32(e + i); OddScalars.push_back(E->Scalars[i]); } else { @@ -2897,10 +2903,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // has multiple uses of the same value. 
if (User && !is_contained(Scalar->users(), User)) continue; - assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); - - int Idx = ScalarToTreeEntry[Scalar]; - TreeEntry *E = &VectorizableTree[Idx]; + TreeEntry *E = getTreeEntry(Scalar); + assert(E && "Invalid scalar"); assert(!E->NeedToGather && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; @@ -2986,7 +2990,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - assert((ScalarToTreeEntry.count(U) || + assert((getTreeEntry(U) || // It is legal to replace users in the ignorelist by undef. is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); @@ -3449,7 +3453,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { I = I->getNextNode()) { ScheduleData *SD = BS->getScheduleData(I); assert( - SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && + SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) && "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index a21928317888..fb2f509dcbaa 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -26,7 +26,6 @@ using namespace llvm; /// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { - initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); initializeSLPVectorizerPass(Registry); initializeLoadStoreVectorizerPass(Registry); @@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) { initializeVectorization(*unwrap(R)); } +// DEPRECATED: Remove after the LLVM 5 release. void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBBVectorizePass()); } void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { |